/*	$NetBSD: uvm_swap.c,v 1.1 1998/02/05 06:25:08 mrg Exp $	*/
/*	$Id: uvm_swap.c,v 1.1 1998/02/05 06:25:08 mrg Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <vm/vm.h>
#include <vm/vm_swap.h>
#include <vm/vm_conf.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.    each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * the system maintains a fixed pool of "swapbuf" structures for use
 * at swap i/o time.  a swapbuf includes a "buf" structure and an
 * "aiodesc" [we want to avoid malloc()'ing anything at swapout time
 * since memory may be low].
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
 *    pool.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
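
/*
 * illustrative example (not compiled into the kernel): a minimal userland
 * sketch of the swapctl(2) interface described above.  this assumes the
 * userland prototype "int swapctl(int cmd, void *arg, int misc)" and the
 * swapent structure exported through <vm/vm_swap.h>; the exact userland
 * headers may differ by release.
 */
#if 0
#include <sys/types.h>
#include <vm/vm_swap.h>
#include <stdio.h>
#include <stdlib.h>

int
swapstats_example()
{
	struct swapent *sep;
	int nswap, n;

	/* [1] SWAP_NSWAP: how many swap devices are configured? */
	nswap = swapctl(SWAP_NSWAP, NULL, 0);
	if (nswap < 1)
		return (nswap);

	/* [2] SWAP_STATS: fetch one swapent per configured device */
	sep = malloc(nswap * sizeof(*sep));
	if (sep == NULL)
		return (-1);
	n = swapctl(SWAP_STATS, (void *)sep, nswap);
	if (n > 0)
		printf("%d swap device(s), first has %d blocks\n",
		    n, sep[0].se_nblks);
	free(sep);
	return (0);
}
#endif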

/*
 * SWAP_TO_FILES: allows swapping to plain files.
 */

#define SWAP_TO_FILES

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	struct swapent		swd_se;		/* swap entry struct */
#define	swd_dev			swd_se.se_dev		/* dev_t for this dev */
#define	swd_flags		swd_se.se_flags		/* flags: inuse/enable/fake */
#define	swd_priority		swd_se.se_priority	/* our priority */
	/* also: swd_se.se_nblks, swd_se.se_inuse */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

#ifdef SWAP_TO_FILES
	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf		swd_tab;	/* buffer list */
	struct ucred		*swd_cred;	/* cred for file access */
#endif
};
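
/*
 * note on the aliases above: the fields shared with userland live inside
 * the embedded "swd_se" so that SWAP_STATS can copyout() the swapent in
 * one shot.  e.g. "sdp->swd_dev" is just shorthand for
 * "sdp->swd_se.se_dev".
 */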

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
				/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * swapbuf, swapbuffer plus async i/o info
 */
struct swapbuf {
	struct buf		sw_buf;		/* a buffer structure */
	struct uvm_aiodesc	sw_aio;		/* aiodesc structure, used if ASYNC */
	SIMPLEQ_ENTRY(swapbuf)	sw_sq;		/* free list pointer */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * XXX: Not a very good idea in a swap strategy module!
 */
#define	getvndxfer() \
	((struct vndxfer *)malloc(sizeof(struct vndxfer), M_DEVBUF, M_WAITOK))

#define putvndxfer(vnx) \
	free((caddr_t)(vnx), M_DEVBUF)

#define getvndbuf() \
	((struct vndbuf *)malloc(sizeof(struct vndbuf), M_DEVBUF, M_WAITOK))

#define putvndbuf(vbp) \
	free((caddr_t)(vbp), M_DEVBUF)

/*
 * local variables
 */
UVMHIST_DECL(pdhist);
static struct extent *swapmap;		/* controls the mapping of /dev/drum */
SIMPLEQ_HEAD(swapbufhead, swapbuf);
static struct swapbufhead freesbufs;	/* list of free swapbufs */
static int sbufs_wanted = 0;		/* someone sleeping for swapbufs? */
#if NCPU > 1
static simple_lock_data_t swap_buf_lock;/* locks freesbufs and sbufs_wanted */
#endif

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
lock_data_t swap_syscall_lock;
#if NCPU > 1
static simple_lock_data_t swap_data_lock;
#endif

/*
 * prototypes
 */
static void		 swapdrum_add __P((struct swapdev *, int));
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					      struct swappri *, int));
static void		 swaplist_trim __P((void));

static int swap_on __P((struct proc *, struct swapdev *));
#ifdef SWAP_OFF_WORKS
static int swap_off __P((struct proc *, struct swapdev *));
#endif

#ifdef SWAP_TO_FILES
static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));
#endif

static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
static void uvm_swap_bufdone __P((struct buf *));
static int uvm_swap_io __P((struct vm_page **, int, int, int));

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	struct swapbuf *sp;
	struct proc *p = &proc0;	/* XXX */
	int i;
#if defined(UVMHIST)
	static char histbuf[sizeof(struct uvm_history_ent) * 100];
#endif
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_INIT_STATIC(pdhist, histbuf);
	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&swap_data_lock);
	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.   the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
	    M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate our private pool of "swapbuf" structures (includes
	 * a "buf" structure).  ["nswbuf" comes from param.c and can
	 * be adjusted by MD code before we get here].
	 */

	sp = malloc(sizeof(*sp) * nswbuf, M_VMSWAP, M_NOWAIT);
	if (sp == NULL)
		panic("uvm_swap_init: unable to malloc swap bufs");
	bzero(sp, sizeof(*sp) * nswbuf);
	SIMPLEQ_INIT(&freesbufs);
	simple_lock_init(&swap_buf_lock);

	/* build free list */
	for (i = 0 ; i < nswbuf ; i++, sp++) {
		/* p == proc0 */
		sp->sw_buf.b_rcred = sp->sw_buf.b_wcred = p->p_ucred;
		sp->sw_buf.b_vnbufs.le_next = NOLIST;
		SIMPLEQ_INSERT_HEAD(&freesbufs, sp, sw_sq);
	}
	printf("uvm_swap: allocated %d swap buffer headers\n", nswbuf);

	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc blocking
 *	here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d", priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;

	/*
	 * done!
	 */
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next)
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = swap_priority.lh_first; spp != NULL; spp = nextspp) {
		nextspp = spp->spi_swappri.le_next;
		if (spp->spi_swapdev.cqh_first != (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free((caddr_t)spp, M_VMSWAP);
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => swap_data_lock should be unlocked (we may sleep)
 */
static void
swapdrum_add(sdp, npages)
	struct swapdev *sdp;
	int npages;
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next)
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next)
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
	return NULL;
}
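
/*
 * worked example of the drum mapping: if a swapdev was added with
 * swd_drumoffset == 1024 and swd_drumsize == 512, then drum pages
 * 1024..1535 belong to it, and drum page "pgno" corresponds to page
 * (pgno - 1024) on the device itself (see swstrategy below).
 */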


/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	int count, error, misc;
	int priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, (void *)0, curproc);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev, 0, 0, 0);
		*retval = uvmexp.nswapdev;
		lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, curproc);
		return (0);
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = swap_priority.lh_first; spp != NULL;
		     spp = spp->spi_swappri.le_next) {
			for (sdp = spp->spi_swapdev.cqh_first;
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = sdp->swd_next.cqe_next) {
				/* backwards compatibility for system call */
				sdp->swd_se.se_inuse =
				    btodb(sdp->swd_npginuse * PAGE_SIZE);
				error = copyout((caddr_t)&sdp->swd_se,
				    (caddr_t)sep, sizeof(struct swapent));
				if (error) {
					lockmgr(&swap_syscall_lock,
					    LK_RELEASE, (void *)0, curproc);
					return (error);
				}
				count++;
				sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<-done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, curproc);
		return (0);
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag))) {
		lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, curproc);
		return (error);
	}

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * the miniroot).
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, 1)) {
			lockmgr(&swap_syscall_lock, LK_RELEASE,
			    (void *)0, curproc);
			return (EBUSY);
		}
	} else {
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_USERSPACE,
		    SCARG(uap, arg), p);
		if ((error = namei(&nd))) {
			lockmgr(&swap_syscall_lock, LK_RELEASE,
			    (void *)0, curproc);
			return (error);
		}
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch (SCARG(uap, cmd)) {
	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = (struct swappri *)
		    malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:
		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */
		priority = SCARG(uap, misc);
		simple_lock(&swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			simple_unlock(&swap_data_lock);
			goto bad;
		}
		sdp = (struct swapdev *)
		    malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = (struct swappri *)
		    malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		bzero(sdp, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
#ifdef SWAP_TO_FILES
		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG)
			sdp->swd_cred = crdup(p->p_ucred);
#endif
		swaplist_insert(sdp, spp, priority);
		simple_unlock(&swap_data_lock);

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */
		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&swap_data_lock);
			(void) swaplist_find(vp, 1);	/* kill fake entry */
			swaplist_trim();
			simple_unlock(&swap_data_lock);
#ifdef SWAP_TO_FILES
			if (vp->v_type == VREG)
				crfree(sdp->swd_cred);
#endif
			free((caddr_t)sdp, M_VMSWAP);
			break;
		}

		/*
		 * got it!   now add a second reference to vp so that
		 * we keep a reference to the vnode after we return.
		 */
		vref(vp);
		break;

	case SWAP_OFF:
		UVMHIST_LOG(pdhist, "someone is using SWAP_OFF...??", 0,0,0,0);
#ifdef SWAP_OFF_WORKS
		/*
		 * find the entry of interest and ensure it is enabled.
		 */
		simple_lock(&swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&swap_data_lock);
			error = ENXIO;
			break;
		}
		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&swap_data_lock);
			error = EBUSY;
			goto bad;
		}
		/* XXXCDC: should we call with list locked or unlocked? */
		if ((error = swap_off(p, sdp)) != 0)
			goto bad;
		/* XXXCDC: might need relock here */

		/*
		 * now we can kill the entry.
		 */
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENXIO;
			break;
		}
		simple_unlock(&swap_data_lock);
		free((caddr_t)sdp, M_VMSWAP);
#else
		error = EINVAL;
#endif
		break;

	default:
		UVMHIST_LOG(pdhist, "unhandled command: %#x",
		    SCARG(uap, cmd), 0, 0, 0);
		error = EINVAL;
	}

bad:
	/*
	 * done!  use vput to drop our reference and unlock
	 */
	vput(vp);
	lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, curproc);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	char *storage;
	int storagesize;
#ifdef SWAP_TO_FILES
	struct vattr va;
#endif
#ifdef NFS
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
	dev_t dev;
	char *name;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

#ifdef SWAP_TO_FILES
	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;
#endif

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_se.se_nblks = nblocks;
	npages = dbtob(nblocks) / PAGE_SIZE;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (randomly choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	name = malloc(12, M_VMSWAP, M_WAITOK);
	sprintf(name, "swap0x%04x", count++);

	/*
	 * XXXCDC: what should we make of this extent storage size stuff
	 *
	 * XXXMRG: well, i've come to realise that we need, at most,
	 * blocks2pages(npages)/2 extents (or so), to cover all possible
	 * allocations that may occur in the extent -- every other page
	 * being allocated.
	 */
#if 1
	storagesize = EXTENT_FIXED_STORAGE_SIZE(maxproc * 2);
#else
	/* XXXMRG: this uses lots of memory */
	storagesize = EXTENT_FIXED_STORAGE_SIZE(npages / 2);
#endif
	storage = malloc(storagesize, M_VMSWAP, M_WAITOK);
	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
	    storage, storagesize, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
		sdp->swd_npginuse += addr;
		uvmexp.swpginuse += addr;
	}


	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob(rootblocks)) / PAGE_SIZE;
		if (rootpages > npages)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
		    rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		sdp->swd_npginuse += (rootpages - addr);
		uvmexp.swpginuse += (rootpages - addr);

		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size - rootpages);
	}

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	simple_lock(&swap_data_lock);
	swapdrum_add(sdp, npages);
	sdp->swd_npages = npages;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	simple_unlock(&swap_data_lock);
	uvmexp.swpages += npages;

	/*
	 * add anon's to reflect the swap space we added
	 */
	uvm_anon_add(size);

	return (0);

bad:
	/*
	 * failure: close device if necessary and return error.
	 */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

#ifdef SWAP_OFF_WORKS
/*
 * swap_off: stop swapping on swapdev
 *
 * XXXCDC: what conditions go here?
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	char	*name;
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);

	/* turn off the enable flag */
	sdp->swd_flags &= ~SWF_ENABLE;

	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev, 0, 0, 0);

	/*
	 * XXX write me
	 *
	 * the idea is to find out which processes are using this swap
	 * device, and page them all in.
	 *
	 * eventually, we should try to move them out to other swap areas
	 * if available.
	 *
	 * The alternative is to create a redirection map for this swap
	 * device.  This should work by moving all the pages of data from
	 * the ex-swap device to another one, and making an entry in the
	 * redirection map for it.  locking is going to be important for
	 * this!
	 *
	 * XXXCDC: also need to shrink anon pool
	 */

	/* until the above code is written, we must ENODEV */
	return ENODEV;

	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
	    EX_WAITOK);
	name = sdp->swd_ex->ex_name;
	extent_destroy(sdp->swd_ex);
	free(name, M_VMSWAP);
	free((caddr_t)sdp->swd_ex, M_VMSWAP);
	if (sdp->swd_vp != rootvp)
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	if (sdp->swd_vp)
		vrele(sdp->swd_vp);
	free((caddr_t)sdp, M_VMSWAP);
	return (0);
}
#endif

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	struct vnode *vp;
	int pageno;
	int bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob(bp->b_blkno) / PAGE_SIZE;
	simple_lock(&swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno = pageno - sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb(pageno * PAGE_SIZE);		/* convert to diskblock */
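	/*
	 * worked example (assuming 4096 byte pages and 512 byte disk
	 * blocks): drum blkno 8192 => pageno 8192/8 = 1024; with
	 * swd_drumoffset == 1024 that is page 0 of the swapdev, so
	 * bn == 0.
	 */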

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);


	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
		VHOLD(vp);			/* "hold" swapdev vp for i/o */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			int s = splbio();
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			vp->v_numoutput++;	/* put it on swapdev */
			splx(s);
		}

		/*
		 * disassociate buffer from /dev/drum vnode
		 * [could be null if buf was from physio]
		 */
		if (bp->b_vp != NULLVP)
			brelvp(bp);

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		VOP_STRATEGY(bp);
		return;
#ifdef SWAP_TO_FILES
	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
#endif
	}
	/* NOTREACHED */
}

#ifdef SWAP_TO_FILES
/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev *sdp;
	struct buf *bp;
	int bn;
{
	struct vnode *vp;
	struct vndxfer *vnx;
	daddr_t nbn;
	caddr_t addr;
	int byteoff, s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = getvndxfer();
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob(bn);		/* XXX: should it be an off_t? */

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf *nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && (long)nbn == -1)
			error = EIO;	/* failure */

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 * XXXCDC: ignores read-ahead for non-zero offset
		 */
		if ((off = (byteoff % sdp->swd_bsize)) != 0)
			sz = sdp->swd_bsize - off;
		else
			sz = (1 + nra) * sdp->swd_bsize;

		if (resid < sz)
			sz = resid;
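		/*
		 * e.g. with swd_bsize == 8192: a request starting at
		 * byteoff 12288 first does sz = 8192 - 4096 = 4096 bytes
		 * to realign, and subsequent passes can take whole
		 * (1 + nra) * 8192 byte strides (clamped to resid).
		 */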

		UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p bn 0x%x/0x%x",
		    sdp->swd_vp, vp, bn, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = getvndbuf();
		nbp->vb_buf.b_flags = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount = sz;
		nbp->vb_buf.b_bufsize = bp->b_bufsize;	/* XXXCDC: really? */
		nbp->vb_buf.b_error = 0;
		nbp->vb_buf.b_data = addr;
		nbp->vb_buf.b_blkno = nbn + btodb(off);
		nbp->vb_buf.b_proc = bp->b_proc;
		nbp->vb_buf.b_iodone = sw_reg_iodone;
		nbp->vb_buf.b_vp = NULLVP;
		nbp->vb_buf.b_rcred = sdp->swd_cred;
		nbp->vb_buf.b_wcred = sdp->swd_cred;

		/*
		 * set b_dirtyoff/end and b_validoff/end.   this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev *sdp;
{
	struct buf *bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
		bp = sdp->swd_tab.b_actf;
		if (bp == NULL)
			break;
		sdp->swd_tab.b_actf = bp->b_actf;
		sdp->swd_tab.b_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev *sdp = vnx->vx_sdp;
	int s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * drop "hold" reference to vnode (if one)
	 * XXXCDC: always set to NULLVP, this is useless, right?
	 */
	if (vbp->vb_buf.b_vp != NULLVP)
		brelvp(&vbp->vb_buf);

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	if (pbp->b_resid == 0) {
#ifdef DIAGNOSTIC
		if (vnx->vx_pending != 0)
			panic("sw_reg_iodone: vnx pending: %d", vnx->vx_pending);
#endif

		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  pbp=%p iodone error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_tab.b_active--;
	sw_reg_start(sdp);

	splx(s);
}
#endif /* SWAP_TO_FILES */


/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;		/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
			    EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
			    &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
#if 0
{
	struct swapdev *sdp2;

	sdp2 = swapdrum_getsdp(result + sdp->swd_drumoffset);
	if (sdp2 == NULL) {
		printf("uvm_swap_alloc: nslots=%d, dev=%x, drumoff=%d, result=%ld",
		    *nslots, sdp->swd_dev, sdp->swd_drumoffset, result);
		panic("uvm_swap_alloc: allocating unmapped swap block!");
	}
}
#endif
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh! extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&swap_data_lock);
	return 0;		/* failed */
}
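
/*
 * illustrative example (not compiled): how a caller pairs uvm_swap_alloc
 * with uvm_swap_free.  "lessok" permits the allocator to shrink the
 * request to a single slot (see the XXXMRG hack above), so the caller
 * must use the updated slot count when freeing.
 */
#if 0
static void
swap_alloc_example()
{
	int nslots = 4;
	int slot;

	slot = uvm_swap_alloc(&nslots, TRUE);	/* may reduce nslots to 1 */
	if (slot == 0)
		return;		/* 0 == invalid slot: allocation failed */
	/* ... i/o on drum slots [slot, slot + nslots) via uvm_swap_io ... */
	uvm_swap_free(slot, nslots);
}
#endif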

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock swap_data_lock
 */
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);
	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.   must hold pri lock to do
	 * lookup and access the extent.
	 */
	simple_lock(&swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

#ifdef DIAGNOSTIC
	if (uvmexp.nswapdev < 1)
		panic("uvm_swap_free: uvmexp.nswapdev < 1\n");
	if (sdp == NULL) {
		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
		    nslots);
		panic("uvm_swap_free: unmapped address\n");
	}
#endif
	extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
	    EX_MALLOCOK|EX_NOWAIT);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse < 0)
		panic("uvm_swap_free: inuse < 0");
#endif
	simple_unlock(&swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int npages;
	int flags;
{
	int result;

#if 0
	flags |= PGO_SYNCIO;	/* XXXMRG: tmp, force sync */
#endif

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int result;

	uvmexp.nswget++;
#ifdef DIAGNOSTIC
	if ((flags & PGO_SYNCIO) == 0)
		printf("uvm_swap_get: ASYNC get requested?\n");
#endif

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int startslot, npages, flags;
{
	daddr_t startblk;
	struct swapbuf *sbp;
	struct buf *bp;
	vm_offset_t kva;
	int result, s, waitf;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);
	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb(startslot * PAGE_SIZE);

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).   note that we don't let pagermapin alloc
	 * an aiodesc structure because we don't want to chance a malloc.
	 * we've got our own pool of aiodesc structures (in swapbuf).
	 */
	waitf = (flags & B_ASYNC) ? M_NOWAIT : M_WAITOK;
	kva = uvm_pagermapin(pps, npages, NULL, waitf);
	if (kva == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * now allocate a swap buffer off of freesbufs
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	simple_lock(&swap_buf_lock);

	/* never put the pagedaemon to sleep! */
	if ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc) {

		sbp = freesbufs.sqh_first;

	} else {

		/* we can sleep for a sbuf if needed */
		while (freesbufs.sqh_first == NULL) {

			sbufs_wanted = 1;
			UVM_UNLOCK_AND_WAIT(&freesbufs, &swap_buf_lock, 0,
			    "uvmswiobuf", 0);

			simple_lock(&swap_buf_lock);	/* relock */
		}
		sbp = freesbufs.sqh_first;
	}

	if (sbp)
		SIMPLEQ_REMOVE_HEAD(&freesbufs, sbp, sw_sq);
	simple_unlock(&swap_buf_lock);
	splx(s);		/* drop splbio */

	/*
	 * if we failed to get a swapbuf, return "try again"
	 */
	if (sbp == NULL)
		return (VM_PAGER_AGAIN);

	/*
	 * fill in the bp/sbp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp = &sbp->sw_buf;
	bp->b_flags = B_BUSY | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	VHOLD(swapdev_vp);
	bp->b_vp = swapdev_vp;
	/* XXXCDC: isn't swapdev_vp always a VCHR? */
	/* XXXMRG: probably -- this is obviously something inherited... */
	if (swapdev_vp->v_type == VBLK)
		bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bcount = npages * PAGE_SIZE;

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages * PAGE_SIZE;
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the aiodesc and setup the callback
	 * XXX: we expect no async-reads, but we don't prevent it here.
	 */
	if (flags & B_ASYNC) {
		sbp->sw_aio.aiodone = uvm_swap_aiodone;
		sbp->sw_aio.kva = kva;
		sbp->sw_aio.npages = npages;
		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
		bp->b_flags |= B_CALL;		/* set callback */
		bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (flags & B_ASYNC)
		return (VM_PAGER_PEND);

	/*
	 * must be sync i/o.   wait for it to finish
	 */
	bp->b_error = biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the swap buffer
	 */
	s = splbio();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	if (bp->b_vp)
		brelvp(bp);

	simple_lock(&swap_buf_lock);
	SIMPLEQ_INSERT_HEAD(&freesbufs, sbp, sw_sq);
	if (sbufs_wanted) {
		sbufs_wanted = 0;
		thread_wakeup(&freesbufs);
	}
	simple_unlock(&swap_buf_lock);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync)  result=%d", result, 0, 0, 0);
	return (result);
}

/*
 * uvm_swap_bufdone: called from the buffer system when the i/o is done
 */
static void
uvm_swap_bufdone(bp)
	struct buf *bp;
{
	struct swapbuf *sbp = (struct swapbuf *) bp;
	int s = splbio();
	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "cleaning buf %p", bp, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check: swapbufs are private, so they shouldn't be wanted
	 */
	if (bp->b_flags & B_WANTED)
		panic("uvm_swap_bufdone: private buf wanted");
#endif

	/*
	 * drop the buffer's reference to the vnode and clear its i/o flags.
	 */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	if (bp->b_vp)
		brelvp(bp);

	/*
	 * now put the aio on the uvm.aio_done list and wake the
	 * pagedaemon (which will finish up our job in its context).
	 */
	simple_lock(&uvm.pagedaemon_lock);	/* locks uvm.aio_done */
	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
	simple_unlock(&uvm.pagedaemon_lock);

	thread_wakeup(&uvm.pagedaemon);
	splx(s);
}

/*
 * uvm_swap_aiodone: aiodone function for anonymous memory
 *
 * => this is called in the context of the pagedaemon (but with the
 *	page queues unlocked!)
 * => our "aio" structure must be part of a "swapbuf"
 */
static void
uvm_swap_aiodone(aio)
	struct uvm_aiodesc *aio;
{
	struct swapbuf *sbp = aio->pd_ptr;
	/* XXXMRG: does this work if PAGE_SIZE is a variable, eg SUN4C&&SUN4 */
	/* XXX it does with GCC */
	struct vm_page *pps[MAXBSIZE/PAGE_SIZE];
	int lcv, s;
	vm_offset_t addr;
	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check
	 */
	if (aio->npages > (MAXBSIZE/PAGE_SIZE))
		panic("uvm_swap_aiodone: aio too big!");
#endif

	/*
	 * first, we have to recover the page pointers (pps) by poking in the
	 * kernel pmap (XXX: should be saved in the buf structure).
	 */
	for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ;
	     addr += PAGE_SIZE, lcv++) {
		pps[lcv] = uvm_pageratop(addr);
	}

	/*
	 * now we can dispose of the kernel mappings of the buffer
	 */
	uvm_pagermapout(aio->kva, aio->npages);

	/*
	 * now we can dispose of the pages by using the dropcluster function
	 * [note that we have no "page of interest" so we pass in null]
	 */
	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
	    PGO_PDFREECLUST, 0);

	/*
	 * finally, we can dispose of the swapbuf
	 */
	s = splbio();
	simple_lock(&swap_buf_lock);
	SIMPLEQ_INSERT_HEAD(&freesbufs, sbp, sw_sq);
	if (sbufs_wanted) {
		sbufs_wanted = 0;
		thread_wakeup(&freesbufs);
	}
	simple_unlock(&swap_buf_lock);
	splx(s);

	/*
	 * done!
	 */
}
