/*	$NetBSD: uvm_swap.c,v 1.3 1998/02/07 11:09:45 mrg Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/extent.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <vm/vm.h>
#include <vm/vm_swap.h>
#include <vm/vm_conf.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.   there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.   this LIST is headed
 * by the "swap_priority" global var.    each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * the system maintains a fixed pool of "swapbuf" structures for use
 * at swap i/o time.  a swapbuf includes a "buf" structure and an
 * "aiodone" [we want to avoid malloc()'ing anything at swapout time
 * since memory may be low].
 *
 * locking:
 *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - swap_data_lock (simple_lock): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap extent.
 *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
 *    pool.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *  - vnode pointer
 * for swap files only:
 *  - block size
 *  - max byte count in buffer
 *  - buffer
 *  - credentials to use when doing i/o to file
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * sys_swapctl() performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
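
/*
 * illustrative sketch (not from the original source): driving the
 * swapctl(2) operations listed above from userland.  the header that
 * declares swapctl() and struct swapent is an assumption here; in this
 * tree the SWAP_* commands and struct swapent come from <vm/vm_swap.h>.
 *
 *	#include <vm/vm_swap.h>
 *	#include <stdlib.h>
 *
 *	int nswap = swapctl(SWAP_NSWAP, NULL, 0);
 *	struct swapent *sep = malloc(nswap * sizeof(*sep));
 *	int count = swapctl(SWAP_STATS, sep, nswap);
 *	...sep[0..count-1] now hold se_dev, se_flags, se_priority,
 *	   se_nblks and se_inuse for each configured device...
 *	swapctl(SWAP_ON, "/swapfile", 10);   ...enable at priority 10...
 */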

/*
 * SWAP_TO_FILES: allows swapping to plain files.
 */

#define SWAP_TO_FILES

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 *    swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 *    swd_nblks <= swd_drumsize (in pages)  [the drum mapping covers the
 *	whole device, including the pre-allocated disklabel/miniroot area]
 */
struct swapdev {
	struct swapent		swd_se;		/* swap entry struct */
#define	swd_dev		swd_se.se_dev		/* dev_t for this dev */
#define	swd_flags	swd_se.se_flags		/* flags:inuse/enable/fake */
#define	swd_priority	swd_se.se_priority	/* our priority */
	/* also: swd_se.se_nblks, swd_se.se_inuse */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	struct extent		*swd_ex;	/* extent for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

#ifdef SWAP_TO_FILES
	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct buf		swd_tab;	/* buffer list */
	struct ucred		*swd_cred;	/* cred for file access */
#endif
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
				/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * swapbuf: a swap buffer plus async i/o info
 */
struct swapbuf {
	struct buf		sw_buf;	/* a buffer structure */
	struct uvm_aiodesc	sw_aio;	/* aiodesc structure, used if ASYNC */
	SIMPLEQ_ENTRY(swapbuf)	sw_sq;	/* free list pointer */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * XXX: Not a very good idea in a swap strategy module!
 */
#define	getvndxfer()	\
	((struct vndxfer *)malloc(sizeof(struct vndxfer), M_DEVBUF, M_WAITOK))

#define putvndxfer(vnx)	\
	free((caddr_t)(vnx), M_DEVBUF)

#define	getvndbuf()	\
	((struct vndbuf *)malloc(sizeof(struct vndbuf), M_DEVBUF, M_WAITOK))

#define putvndbuf(vbp)	\
	free((caddr_t)(vbp), M_DEVBUF)
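
/*
 * the XXX above matters because these mallocs run in the pageout path:
 * if the system is paging because free memory is exhausted, an M_WAITOK
 * malloc can stall the pagedaemon.  the private pool of "swapbuf"
 * structures (below) exists to avoid malloc'ing on the main swap i/o
 * path for exactly this reason.
 */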

/*
 * local variables
 */
UVMHIST_DECL(pdhist);
static struct extent *swapmap;		/* controls the mapping of /dev/drum */
SIMPLEQ_HEAD(swapbufhead, swapbuf);
static struct swapbufhead freesbufs;	/* list of free swapbufs */
static int sbufs_wanted = 0;		/* someone sleeping for swapbufs? */
#if NCPU > 1
static simple_lock_data_t swap_buf_lock;/* locks freesbufs and sbufs_wanted */
#endif

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
lock_data_t swap_syscall_lock;
#if NCPU > 1
static simple_lock_data_t swap_data_lock;
#endif

/*
 * prototypes
 */
static void		 swapdrum_add __P((struct swapdev *, int));
static struct swapdev	*swapdrum_getsdp __P((int));

static struct swapdev	*swaplist_find __P((struct vnode *, int));
static void		 swaplist_insert __P((struct swapdev *,
					      struct swappri *, int));
static void		 swaplist_trim __P((void));

static int swap_on __P((struct proc *, struct swapdev *));
#ifdef SWAP_OFF_WORKS
static int swap_off __P((struct proc *, struct swapdev *));
#endif

#ifdef SWAP_TO_FILES
static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));
#endif

static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
static void uvm_swap_bufdone __P((struct buf *));
static int uvm_swap_io __P((struct vm_page **, int, int, int));

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init()
{
	struct swapbuf *sp;
	struct proc *p = &proc0;	/* XXX */
	int i;
#if defined(UVMHIST)
	static char histbuf[sizeof(struct uvm_history_ent) * 100];
#endif
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_INIT_STATIC(pdhist, histbuf);
	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
	simple_lock_init(&swap_data_lock);
	if (bdevvp(swapdev, &swapdev_vp))
		panic("uvm_swap_init: can't get vnode for swap device");

	/*
	 * create swap block resource map to map /dev/drum.   the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = extent_create("swapmap", 1, INT_MAX,
	    M_VMSWAP, 0, 0, EX_NOWAIT);
	if (swapmap == 0)
		panic("uvm_swap_init: extent_create failed");

	/*
	 * allocate our private pool of "swapbuf" structures (includes
	 * a "buf" structure).   ["nswbuf" comes from param.c and can
	 * be adjusted by MD code before we get here].
	 */

	sp = malloc(sizeof(*sp) * nswbuf, M_VMSWAP, M_NOWAIT);
	if (sp == NULL)
		panic("uvm_swap_init: unable to malloc swap bufs");
	bzero(sp, sizeof(*sp) * nswbuf);
	SIMPLEQ_INIT(&freesbufs);
	simple_lock_init(&swap_buf_lock);

	/* build free list */
	for (i = 0 ; i < nswbuf ; i++, sp++) {
		/* p == proc0 */
		sp->sw_buf.b_rcred = sp->sw_buf.b_wcred = p->p_ucred;
		sp->sw_buf.b_vnbufs.le_next = NOLIST;
		SIMPLEQ_INSERT_HEAD(&freesbufs, sp, sw_sq);
	}
	printf("uvm_swap: allocated %d swap buffer headers\n", nswbuf);

	/*
	 * done!
	 */
	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc from
 *	blocking here while adding swap)
 */
static void
swaplist_insert(sdp, newspp, priority)
	struct swapdev *sdp;
	struct swappri *newspp;
	int priority;
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	for (pspp = NULL, spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
		    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		FREE(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).   now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;

	/*
	 * done!
	 */
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(vp, remove)
	struct vnode *vp;
	boolean_t remove;
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */
	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next)
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return(sdp);
			}
	}
	return (NULL);
}


/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and swap_data_lock
 */
static void
swaplist_trim()
{
	struct swappri *spp, *nextspp;

	for (spp = swap_priority.lh_first; spp != NULL; spp = nextspp) {
		nextspp = spp->spi_swappri.le_next;
		if (spp->spi_swapdev.cqh_first != (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free((caddr_t)spp, M_VMSWAP);
	}
}

/*
 * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
 *
 * => caller must hold swap_syscall_lock
 * => swap_data_lock should be unlocked (we may sleep)
 */
static void
swapdrum_add(sdp, npages)
	struct swapdev *sdp;
	int npages;
{
	u_long result;

	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
	    EX_WAITOK, &result))
		panic("swapdrum_add");

	sdp->swd_drumoffset = result;
	sdp->swd_drumsize = npages;
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(pgno)
	int pgno;
{
	struct swapdev *sdp;
	struct swappri *spp;

	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next)
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next)
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
	return NULL;
}
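
/*
 * worked example (illustrative numbers): suppose a 1000-page partition
 * and then a 500-page swap file are configured.  swapmap allocations
 * start at block 1, so swapdrum_add() would typically assign
 * drumoffset=1/drumsize=1000 to the first device and drumoffset=1001/
 * drumsize=500 to the second.  drum page 1234 satisfies
 * 1001 <= 1234 < 1501, so swapdrum_getsdp(1234) returns the swap file,
 * and the i/o lands on page 1234 - 1001 = 233 within it.
 */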

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct sys_swapctl_args /* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */ *uap = (struct sys_swapctl_args *)v;
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
	int count, error, misc;
	int priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, (void *)0, curproc);

	/*
	 * we handle the non-priv NSWAP and STATS requests first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d",
		    uvmexp.nswapdev, 0, 0, 0);
		*retval = uvmexp.nswapdev;
		lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, curproc);
		return (0);
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS) {
		sep = (struct swapent *)SCARG(uap, arg);
		count = 0;

		for (spp = swap_priority.lh_first; spp != NULL;
		     spp = spp->spi_swappri.le_next) {
			for (sdp = spp->spi_swapdev.cqh_first;
			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
			     sdp = sdp->swd_next.cqe_next) {
				/* backwards compatibility for system call */
				sdp->swd_se.se_inuse =
				    btodb(sdp->swd_npginuse * PAGE_SIZE);
				error = copyout((caddr_t)&sdp->swd_se,
				    (caddr_t)sep, sizeof(struct swapent));
				if (error) {
					lockmgr(&swap_syscall_lock,
					    LK_RELEASE, (void *)0, curproc);
					return (error);
				}
				count++;
				sep++;
			}
		}

		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);

		*retval = count;
		lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, curproc);
		return (0);
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = suser(p->p_ucred, &p->p_acflag))) {
		lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, curproc);
		return (error);
	}

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * the miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, 1)) {
			lockmgr(&swap_syscall_lock, LK_RELEASE,
			    (void *)0, curproc);
			return (EBUSY);
		}
	} else {
		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_USERSPACE,
		    SCARG(uap, arg), p);
		if ((error = namei(&nd))) {
			lockmgr(&swap_syscall_lock, LK_RELEASE,
			    (void *)0, curproc);
			return (error);
		}
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch (SCARG(uap, cmd)) {
	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = (struct swappri *)
			malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		simple_lock(&swap_data_lock);
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		simple_unlock(&swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:
		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */
		priority = SCARG(uap, misc);
		simple_lock(&swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) != NULL) {
			error = EBUSY;
			simple_unlock(&swap_data_lock);
			goto bad;
		}
		sdp = (struct swapdev *)
			malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = (struct swappri *)
			malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		bzero(sdp, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
#ifdef SWAP_TO_FILES
		/*
		 * XXX Is NFS elaboration necessary?
		 */
		if (vp->v_type == VREG)
			sdp->swd_cred = crdup(p->p_ucred);
#endif
		swaplist_insert(sdp, spp, priority);
		simple_unlock(&swap_data_lock);

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on succeeds, it will clear the SWF_FAKE flag.
		 */
		if ((error = swap_on(p, sdp)) != 0) {
			simple_lock(&swap_data_lock);
			(void) swaplist_find(vp, 1);	/* kill fake entry */
			swaplist_trim();
			simple_unlock(&swap_data_lock);
#ifdef SWAP_TO_FILES
			if (vp->v_type == VREG)
				crfree(sdp->swd_cred);
#endif
			free((caddr_t)sdp, M_VMSWAP);
			break;
		}

		/*
		 * got it!   now add a second reference to vp so that
		 * we keep a reference to the vnode after we return.
		 */
		vref(vp);
		break;

	case SWAP_OFF:
		UVMHIST_LOG(pdhist, "someone is using SWAP_OFF...??",
		    0, 0, 0, 0);
#ifdef SWAP_OFF_WORKS
		/*
		 * find the entry of interest and ensure it is enabled.
		 */
		simple_lock(&swap_data_lock);
		if ((sdp = swaplist_find(vp, 0)) == NULL) {
			simple_unlock(&swap_data_lock);
			error = ENXIO;
			break;
		}
		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			simple_unlock(&swap_data_lock);
			error = EBUSY;
			goto bad;
		}
		/* XXXCDC: should we call with list locked or unlocked? */
		if ((error = swap_off(p, sdp)) != 0)
			goto bad;
		/* XXXCDC: might need relock here */

		/*
		 * now we can kill the entry.
		 */
		if ((sdp = swaplist_find(vp, 1)) == NULL) {
			error = ENXIO;
			break;
		}
		simple_unlock(&swap_data_lock);
		free((caddr_t)sdp, M_VMSWAP);
#else
		error = EINVAL;
#endif
		break;

	default:
		UVMHIST_LOG(pdhist, "unhandled command: %#x",
		    SCARG(uap, cmd), 0, 0, 0);
		error = EINVAL;
	}

bad:
	/*
	 * done!  use vput to drop our reference and unlock
	 */
	vput(vp);
	lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, curproc);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	static int count = 0;	/* static */
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	char *storage;
	int storagesize;
#ifdef SWAP_TO_FILES
	struct vattr va;
#endif
#ifdef NFS
	extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
	dev_t dev;
	char *name;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if (bdevsw[major(dev)].d_psize == 0 ||
		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

#ifdef SWAP_TO_FILES
	case VREG:
		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
#ifdef NFS
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
#endif /* NFS */
			sdp->swd_maxactive = 8; /* XXX */
		break;
#endif

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_se.se_nblks = nblocks;
	npages = dbtob(nblocks) / PAGE_SIZE;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (randomly choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}
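
	/*
	 * worked example (illustrative numbers): a 64MB block device with
	 * 512-byte disk blocks has nblocks = 131072, so with PAGE_SIZE =
	 * 4096 we get npages = dbtob(131072) / 4096 = 16384.  since it is
	 * a VBLK we skip page 0, leaving addr = 1 and size = 16383 usable
	 * pages.
	 */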

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */
	name = malloc(12, M_VMSWAP, M_WAITOK);
	sprintf(name, "swap0x%04x", count++);

	/*
	 * XXXCDC: what should we make of this extent storage size stuff
	 *
	 * XXXMRG: well, i've come to realise that we need, at most,
	 * blocks2pages(npages)/2 extents (or so), to cover all possible
	 * allocations that may occur in the extent -- every other page
	 * being allocated.
	 */
#if 1
	storagesize = EXTENT_FIXED_STORAGE_SIZE(maxproc * 2);
#else
	/* XXXMRG: this uses lots of memory */
	storagesize = EXTENT_FIXED_STORAGE_SIZE(npages / 2);
#endif
	storage = malloc(storagesize, M_VMSWAP, M_WAITOK);
	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
	    storage, storagesize, EX_WAITOK);
	/* allocate the `saved' region from the extent so it won't be used */
	if (addr) {
		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
			panic("disklabel region");
		sdp->swd_npginuse += addr;
		uvmexp.swpginuse += addr;
	}

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
		rootpages = round_page(dbtob(rootblocks)) / PAGE_SIZE;
		if (rootpages > npages)
			panic("swap_on: miniroot larger than swap?");

		if (extent_alloc_region(sdp->swd_ex, addr,
		    rootpages, EX_WAITOK))
			panic("swap_on: unable to preserve miniroot");

		sdp->swd_npginuse += (rootpages - addr);
		uvmexp.swpginuse += (rootpages - addr);

		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size - rootpages);
	}

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	simple_lock(&swap_data_lock);
	swapdrum_add(sdp, npages);
	sdp->swd_npages = npages;
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	simple_unlock(&swap_data_lock);
	uvmexp.swpages += npages;

	/*
	 * add anon's to reflect the swap space we added
	 */
	uvm_anon_add(size);

	return (0);

bad:
	/*
	 * failure: close device if necessary and return error.
	 */
	if (vp != rootvp)
		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
	return (error);
}

#ifdef SWAP_OFF_WORKS
/*
 * swap_off: stop swapping on swapdev
 *
 * XXXCDC: what conditions go here?
 */
static int
swap_off(p, sdp)
	struct proc *p;
	struct swapdev *sdp;
{
	char *name;
	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);

	/* turn off the enable flag */
	sdp->swd_flags &= ~SWF_ENABLE;

	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev, 0, 0, 0);

	/*
	 * XXX write me
	 *
	 * the idea is to find out which processes are using this swap
	 * device, and page them all in.
	 *
	 * eventually, we should try to move them out to other swap areas
	 * if available.
	 *
	 * The alternative is to create a redirection map for this swap
	 * device.  This should work by moving all the pages of data from
	 * the ex-swap device to another one, and making an entry in the
	 * redirection map for it.  locking is going to be important for
	 * this!
	 *
	 * XXXCDC: also need to shrink anon pool
	 */

	/* until the above code is written, we must ENODEV */
	return ENODEV;

	extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
	    EX_WAITOK);
	name = sdp->swd_ex->ex_name;
	extent_destroy(sdp->swd_ex);
	free(name, M_VMSWAP);
	free((caddr_t)sdp->swd_ex, M_VMSWAP);
	if (sdp->swd_vp != rootvp)
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
	if (sdp->swd_vp)
		vrele(sdp->swd_vp);
	free((caddr_t)sdp, M_VMSWAP);
	return (0);
}
#endif

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swread: the read function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
	dev_t dev;
	struct uio *uio;
	int ioflag;
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
void
swstrategy(bp)
	struct buf *bp;
{
	struct swapdev *sdp;
	struct vnode *vp;
	int pageno;
	int bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob(bp->b_blkno) / PAGE_SIZE;
	simple_lock(&swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	simple_unlock(&swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno = pageno - sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb(pageno * PAGE_SIZE);		/* convert to diskblock */

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);


	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	switch (sdp->swd_vp->v_type) {
	default:
		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		bp->b_blkno = bn;		/* swapdev block number */
		vp = sdp->swd_vp;		/* swapdev vnode pointer */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
		VHOLD(vp);			/* "hold" swapdev vp for i/o */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			int s = splbio();
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			vp->v_numoutput++;	/* put it on swapdev */
			splx(s);
		}

		/*
		 * disassociate buffer from /dev/drum vnode
		 * [could be null if buf was from physio]
		 */
		if (bp->b_vp != NULLVP)
			brelvp(bp);

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		VOP_STRATEGY(bp);
		return;
#ifdef SWAP_TO_FILES
	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
#endif
	}
	/* NOTREACHED */
}

#ifdef SWAP_TO_FILES
/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(sdp, bp, bn)
	struct swapdev *sdp;
	struct buf *bp;
	int bn;
{
	struct vnode *vp;
	struct vndxfer *vnx;
	daddr_t nbn;
	caddr_t addr;
	int byteoff, s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = getvndxfer();
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob(bn);		/* XXX: should it be an off_t? */

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf *nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && (long)nbn == -1)
			error = EIO;	/* failure */

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 * XXXCDC: ignores read-ahead for non-zero offset
		 */
		if ((off = (byteoff % sdp->swd_bsize)) != 0)
			sz = sdp->swd_bsize - off;
		else
			sz = (1 + nra) * sdp->swd_bsize;

		if (resid < sz)
			sz = resid;
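
		/*
		 * example (illustrative numbers): with swd_bsize = 8192
		 * and byteoff = 12288, off = 4096, so the first chunk is
		 * sz = 8192 - 4096 = 4096 bytes (up to the next block
		 * boundary).  later chunks start block-aligned and may
		 * cover (1 + nra) whole blocks at once.
		 */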

		UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p bn 0x%x/0x%x",
		    sdp->swd_vp, vp, bn, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = getvndbuf();
		nbp->vb_buf.b_flags = bp->b_flags | B_CALL;
		nbp->vb_buf.b_bcount = sz;
		nbp->vb_buf.b_bufsize = bp->b_bufsize;	/* XXXCDC: really? */
		nbp->vb_buf.b_error = 0;
		nbp->vb_buf.b_data = addr;
		nbp->vb_buf.b_blkno = nbn + btodb(off);
		nbp->vb_buf.b_proc = bp->b_proc;
		nbp->vb_buf.b_iodone = sw_reg_iodone;
		nbp->vb_buf.b_vp = NULLVP;
		nbp->vb_buf.b_rcred = sdp->swd_cred;
		nbp->vb_buf.b_wcred = sdp->swd_cred;

		/*
		 * set b_dirtyoff/end and b_validoff/end.   this is
		 * required by the NFS client code (otherwise it will
		 * just discard our I/O request).
		 */
		if (bp->b_dirtyend == 0) {
			nbp->vb_buf.b_dirtyoff = 0;
			nbp->vb_buf.b_dirtyend = sz;
		} else {
			nbp->vb_buf.b_dirtyoff =
			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_dirtyend =
			    min(sz,
				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
		}
		if (bp->b_validend == 0) {
			nbp->vb_buf.b_validoff = 0;
			nbp->vb_buf.b_validend = sz;
		} else {
			nbp->vb_buf.b_validoff =
			    max(0, bp->b_validoff - (bp->b_bcount-resid));
			nbp->vb_buf.b_validend =
			    min(sz,
				max(0, bp->b_validend - (bp->b_bcount-resid)));
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
		s = splbio();
		if (vnx->vx_error != 0) {
			putvndbuf(nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* assoc new buffer with underlying vnode */
		bgetvp(vp, &nbp->vb_buf);

		/* sort it in and start I/O if we are not over our limit */
		disksort(&sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 * [byteoff must move, or every pass would remap the same
		 * file block]
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		if (vnx->vx_error != 0) {
			bp->b_error = vnx->vx_error;
			bp->b_flags |= B_ERROR;
		}
		putvndxfer(vnx);
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by disksort (above)
 */
static void
sw_reg_start(sdp)
	struct swapdev *sdp;
{
	struct buf *bp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
		bp = sdp->swd_tab.b_actf;
		if (bp == NULL)
			break;
		sdp->swd_tab.b_actf = bp->b_actf;
		sdp->swd_tab.b_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		if ((bp->b_flags & B_READ) == 0)
			bp->b_vp->v_numoutput++;
		VOP_STRATEGY(bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(bp)
	struct buf *bp;
{
	struct vndbuf *vbp = (struct vndbuf *) bp;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev *sdp = vnx->vx_sdp;
	int s, resid;
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error) {
		UVMHIST_LOG(pdhist, "  got error=%d !",
		    vbp->vb_buf.b_error, 0, 0, 0);

		/* pass error upward */
		vnx->vx_error = vbp->vb_buf.b_error;
	}

	/*
	 * drop "hold" reference to vnode (if one)
	 * XXXCDC: always set to NULLVP, this is useless, right?
	 */
	if (vbp->vb_buf.b_vp != NULLVP)
		brelvp(&vbp->vb_buf);

	/*
	 * kill vbp structure
	 */
	putvndbuf(vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 * [note the "else": once we putvndxfer() the vnx in the error
	 * case we must not touch it again]
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		pbp->b_flags |= B_ERROR;
		pbp->b_error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			putvndxfer(vnx);
			biodone(pbp);
		}
	} else if (pbp->b_resid == 0) {
#ifdef DIAGNOSTIC
		if (vnx->vx_pending != 0)
			panic("sw_reg_iodone: vnx pending: %d",
			    vnx->vx_pending);
#endif

		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  pbp=%p iodone error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			putvndxfer(vnx);
			biodone(pbp);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_tab.b_active--;
	sw_reg_start(sdp);

	splx(s);
}
#endif /* SWAP_TO_FILES */

/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
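/*
 * illustrative note: the "rotate" above is what spreads the load across
 * devices of equal priority.  e.g. with devices A and B at the same
 * priority, a successful allocation from A moves A to the tail of the
 * circleq, so the next allocation tries B first, then A, and so on,
 * roughly interleaving use of the two devices.
 */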
int
uvm_swap_alloc(nslots, lessok)
	int *nslots;	/* IN/OUT */
	boolean_t lessok;
{
	struct swapdev *sdp;
	struct swappri *spp;
	u_long result;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	simple_lock(&swap_data_lock);

ReTry:	/* XXXMRG */
	for (spp = swap_priority.lh_first; spp != NULL;
	     spp = spp->spi_swappri.le_next) {
		for (sdp = spp->spi_swapdev.cqh_first;
		     sdp != (void *)&spp->spi_swapdev;
		     sdp = sdp->swd_next.cqe_next) {
			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
			    EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
			    &result) != 0) {
				continue;
			}

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			simple_unlock(&swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
#if 0
			{
				struct swapdev *sdp2;

				sdp2 = swapdrum_getsdp(result +
				    sdp->swd_drumoffset);
				if (sdp2 == NULL) {
					printf("uvm_swap_alloc: nslots=%d, "
					    "dev=%x, drumoff=%d, result=%ld",
					    *nslots, sdp->swd_dev,
					    sdp->swd_drumoffset, result);
					panic("uvm_swap_alloc: allocating "
					    "unmapped swap block!");
				}
			}
#endif
			return(result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		goto ReTry;	/* XXXMRG: ugh! extent should support this for us */
	}
	/* XXXMRG: END HACK */

	simple_unlock(&swap_data_lock);
	return 0;		/* failed */
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock swap_data_lock
 */
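/*
 * usage sketch (hypothetical caller, not from this file):
 *
 *	int nslots = 4;
 *	int slot = uvm_swap_alloc(&nslots, TRUE);
 *	...
 *	uvm_swap_free(slot + 2, 1);	...frees just the 3rd slot...
 *
 * i.e. freeing part of a cluster returned by uvm_swap_alloc is legal,
 * as noted above.
 */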
void
uvm_swap_free(startslot, nslots)
	int startslot;
	int nslots;
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);
	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.   must hold pri lock to do
	 * lookup and access the extent.
	 */
	simple_lock(&swap_data_lock);
	sdp = swapdrum_getsdp(startslot);

#ifdef DIAGNOSTIC
	if (uvmexp.nswapdev < 1)
		panic("uvm_swap_free: uvmexp.nswapdev < 1");
	if (sdp == NULL) {
		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
		    nslots);
		panic("uvm_swap_free: unmapped address");
	}
#endif
	extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
	    EX_MALLOCOK|EX_NOWAIT);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
#ifdef DIAGNOSTIC
	if (sdp->swd_npginuse < 0)
		panic("uvm_swap_free: inuse < 0");
#endif
	simple_unlock(&swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_put(swslot, ppsp, npages, flags)
	int swslot;
	struct vm_page **ppsp;
	int npages;
	int flags;
{
	int result;

#if 0
	flags |= PGO_SYNCIO;	/* XXXMRG: tmp, force sync */
#endif

	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 * => XXXMRG: consider making it an inline or macro
 */
int
uvm_swap_get(page, swslot, flags)
	struct vm_page *page;
	int swslot, flags;
{
	int result;

	uvmexp.nswget++;
#ifdef DIAGNOSTIC
	if ((flags & PGO_SYNCIO) == 0)
		printf("uvm_swap_get: ASYNC get requested?\n");
#endif

	result = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));

	return (result);
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(pps, startslot, npages, flags)
	struct vm_page **pps;
	int startslot, npages, flags;
{
	daddr_t startblk;
	struct swapbuf *sbp;
	struct buf *bp;
	vm_offset_t kva;
	int result, s, waitf;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);
	/*
	 * convert starting drum slot to block number
	 */
	startblk = btodb(startslot * PAGE_SIZE);

	/*
	 * first, map the pages into the kernel (XXX: currently required
	 * by buffer system).   note that we don't let pagermapin alloc
	 * an aiodesc structure because we don't want to chance a malloc.
	 * we've got our own pool of aiodesc structures (in swapbuf).
	 */
	waitf = (flags & B_ASYNC) ? M_NOWAIT : M_WAITOK;
	kva = uvm_pagermapin(pps, npages, NULL, waitf);
	if (kva == 0)
		return (VM_PAGER_AGAIN);

	/*
	 * now allocate a swap buffer off of freesbufs
	 * [make sure we don't put the pagedaemon to sleep...]
	 */
	s = splbio();
	simple_lock(&swap_buf_lock);

	/* never put the pagedaemon to sleep! */
	if ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc) {

		sbp = freesbufs.sqh_first;

	} else {

		/* we can sleep for a sbuf if needed */
		while (freesbufs.sqh_first == NULL) {

			sbufs_wanted = 1;
			UVM_UNLOCK_AND_WAIT(&freesbufs, &swap_buf_lock, 0,
			    "uvmswiobuf", 0);

			simple_lock(&swap_buf_lock);	/* relock */
		}
		sbp = freesbufs.sqh_first;
	}

	if (sbp)
		SIMPLEQ_REMOVE_HEAD(&freesbufs, sbp, sw_sq);
	simple_unlock(&swap_buf_lock);
	splx(s);		/* drop splbio */

	/*
	 * if we failed to get a swapbuf, undo the pager mapping and
	 * return "try again"
	 */
	if (sbp == NULL) {
		uvm_pagermapout(kva, npages);
		return (VM_PAGER_AGAIN);
	}

	/*
	 * fill in the bp/sbp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */
	bp = &sbp->sw_buf;
	bp->b_flags = B_BUSY | (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_data = (caddr_t)kva;
	bp->b_blkno = startblk;
	VHOLD(swapdev_vp);
	bp->b_vp = swapdev_vp;
	/* XXXCDC: isn't swapdev_vp always a VCHR? */
	/* XXXMRG: probably -- this is obviously something inherited... */
	if (swapdev_vp->v_type == VBLK)
		bp->b_dev = swapdev_vp->v_rdev;
	bp->b_bcount = npages * PAGE_SIZE;

	/*
	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
	 * and we bump v_numoutput (counter of number of active outputs).
	 */
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = npages * PAGE_SIZE;
		s = splbio();
		swapdev_vp->v_numoutput++;
		splx(s);
	}

	/*
	 * for async ops we must set up the aiodesc and setup the callback
	 * XXX: we expect no async-reads, but we don't prevent it here.
	 */
	if (flags & B_ASYNC) {
		sbp->sw_aio.aiodone = uvm_swap_aiodone;
		sbp->sw_aio.kva = kva;
		sbp->sw_aio.npages = npages;
		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
		bp->b_flags |= B_CALL;		/* set callback */
		bp->b_iodone = uvm_swap_bufdone; /* "buf" iodone function */
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */
	VOP_STRATEGY(bp);
	if (flags & B_ASYNC)
		return (VM_PAGER_PEND);

	/*
	 * must be sync i/o.   wait for it to finish
	 */
	bp->b_error = biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

	/*
	 * kill the pager mapping
	 */
	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the swap buffer
	 */
	s = splbio();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	if (bp->b_vp)
		brelvp(bp);

	simple_lock(&swap_buf_lock);
	SIMPLEQ_INSERT_HEAD(&freesbufs, sbp, sw_sq);
	if (sbufs_wanted) {
		sbufs_wanted = 0;
		thread_wakeup(&freesbufs);
	}
	simple_unlock(&swap_buf_lock);
	splx(s);

	/*
	 * finally return.
	 */
	UVMHIST_LOG(pdhist, "<- done (sync)  result=%d", result, 0, 0, 0);
	return (result);
}

/*
 * uvm_swap_bufdone: called from the buffer system when the i/o is done
 */
static void
uvm_swap_bufdone(bp)
	struct buf *bp;
{
	struct swapbuf *sbp = (struct swapbuf *) bp;
	int s = splbio();
	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "cleaning buf %p", bp, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check: swapbufs are private, so they shouldn't be wanted
	 */
	if (bp->b_flags & B_WANTED)
		panic("uvm_swap_bufdone: private buf wanted");
#endif

	/*
	 * drop the buffer's reference to the vnode and clear its flags.
	 */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	if (bp->b_vp)
		brelvp(bp);

	/*
	 * now put the aio on the uvm.aio_done list and wake the
	 * pagedaemon (which will finish up our job in its context).
	 */
	simple_lock(&uvm.pagedaemon_lock);	/* locks uvm.aio_done */
	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
	simple_unlock(&uvm.pagedaemon_lock);

	thread_wakeup(&uvm.pagedaemon);
	splx(s);
}

/*
 * uvm_swap_aiodone: aiodone function for anonymous memory
 *
 * => this is called in the context of the pagedaemon (but with the
 *	page queues unlocked!)
 * => our "aio" structure must be part of a "swapbuf"
 */
static void
uvm_swap_aiodone(aio)
	struct uvm_aiodesc *aio;
{
	struct swapbuf *sbp = aio->pd_ptr;
	/* XXXMRG: does this work if PAGE_SIZE is a variable, eg SUN4C&&SUN4 */
	/* XXX it does with GCC */
	struct vm_page *pps[MAXBSIZE/PAGE_SIZE];
	int lcv, s;
	vm_offset_t addr;
	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
#ifdef DIAGNOSTIC
	/*
	 * sanity check
	 */
	if (aio->npages > (MAXBSIZE/PAGE_SIZE))
		panic("uvm_swap_aiodone: aio too big!");
#endif

	/*
	 * first, we have to recover the page pointers (pps) by poking in the
	 * kernel pmap (XXX: should be saved in the buf structure).
	 */
	for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ;
	     addr += PAGE_SIZE, lcv++) {
		pps[lcv] = uvm_pageratop(addr);
	}

	/*
	 * now we can dispose of the kernel mappings of the buffer
	 */
	uvm_pagermapout(aio->kva, aio->npages);

	/*
	 * now we can dispose of the pages by using the dropcluster function
	 * [note that we have no "page of interest" so we pass in null]
	 */
	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
	    PGO_PDFREECLUST, 0);

	/*
	 * finally, we can dispose of the swapbuf
	 */
	s = splbio();
	simple_lock(&swap_buf_lock);
	SIMPLEQ_INSERT_HEAD(&freesbufs, sbp, sw_sq);
	if (sbufs_wanted) {
		sbufs_wanted = 0;
		thread_wakeup(&freesbufs);
	}
	simple_unlock(&swap_buf_lock);
	splx(s);

	/*
	 * done!
	 */
}