1 /* $NetBSD: uvm_swap.c,v 1.215 2026/02/13 19:16:41 kre Exp $ */ 2 3 /* 4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp 29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.215 2026/02/13 19:16:41 kre Exp $"); 34 35 #include "opt_uvmhist.h" 36 #include "opt_compat_netbsd.h" 37 #include "opt_ddb.h" 38 #include "opt_vmswap.h" 39 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/atomic.h> 43 #include <sys/buf.h> 44 #include <sys/bufq.h> 45 #include <sys/conf.h> 46 #include <sys/cprng.h> 47 #include <sys/proc.h> 48 #include <sys/namei.h> 49 #include <sys/disklabel.h> 50 #include <sys/errno.h> 51 #include <sys/kernel.h> 52 #include <sys/vnode.h> 53 #include <sys/file.h> 54 #include <sys/vmem.h> 55 #include <sys/blist.h> 56 #include <sys/mount.h> 57 #include <sys/pool.h> 58 #include <sys/kmem.h> 59 #include <sys/syscallargs.h> 60 #include <sys/swap.h> 61 #include <sys/kauth.h> 62 #include <sys/sysctl.h> 63 #include <sys/workqueue.h> 64 65 #include <uvm/uvm.h> 66 67 #include <miscfs/specfs/specdev.h> 68 69 #include <crypto/aes/aes.h> 70 #include <crypto/aes/aes_cbc.h> 71 72 /* 73 * uvm_swap.c: manage configuration and i/o to swap space. 74 */ 75 76 /* 77 * swap space is managed in the following way: 78 * 79 * each swap partition or file is described by a "swapdev" structure. 80 * each "swapdev" structure contains a "swapent" structure which contains 81 * information that is passed up to the user (via system calls). 82 * 83 * each swap partition is assigned a "priority" (int) which controls 84 * swap partition usage. 85 * 86 * the system maintains a global data structure describing all swap 87 * partitions/files. there is a sorted LIST of "swappri" structures 88 * which describe "swapdev"'s at that priority. this LIST is headed 89 * by the "swap_priority" global var. each "swappri" contains a 90 * TAILQ of "swapdev" structures at that priority. 
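 *
 * as a minimal sketch of how the two levels compose (try_device() is
 * a hypothetical placeholder, not part of this file), a scan visits
 * priorities in ascending order and the devices queued at each
 * priority in turn, which is the shape of the allocation loop in
 * uvm_swap_alloc() below:
 *
 *	struct swappri *spp;
 *	struct swapdev *sdp;
 *
 *	LIST_FOREACH(spp, &swap_priority, spi_swappri)
 *		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)
 *			if (try_device(sdp))
 *				return;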
91 * 92 * locking: 93 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl 94 * system call and prevents the swap priority list from changing 95 * while we are in the middle of a system call (e.g. SWAP_STATS). 96 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data 97 * structures including the priority list, the swapdev structures, 98 * and the swapmap arena. 99 * 100 * each swap device has the following info: 101 * - swap device in use (could be disabled, preventing future use) 102 * - swap enabled (allows new allocations on swap) 103 * - map info in /dev/drum 104 * - vnode pointer 105 * for swap files only: 106 * - block size 107 * - max byte count in buffer 108 * - buffer 109 * 110 * userland controls and configures swap with the swapctl(2) system call. 111 * the sys_swapctl performs the following operations: 112 * [1] SWAP_NSWAP: returns the number of swap devices currently configured 113 * [2] SWAP_STATS: given a pointer to an array of swapent structures 114 * (passed in via "arg") of a size passed in via "misc" ... we load 115 * the current swap config into the array. The actual work is done 116 * in the uvm_swap_stats() function. 117 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a 118 * priority in "misc", start swapping on it. 119 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device 120 * [5] SWAP_CTL: changes the priority of a swap device (new priority in 121 * "misc") 122 */ 123 124 /* 125 * swapdev: describes a single swap partition/file 126 * 127 * note the following should be true: 128 * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks] 129 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel] 130 */ 131 struct swapdev { 132 dev_t swd_dev; /* device id */ 133 int swd_flags; /* flags:inuse/enable/fake */ 134 int swd_priority; /* our priority */ 135 int swd_nblks; /* blocks in this device */ 136 char *swd_path; /* saved pathname of device */ 137 int swd_pathlen; /* length of pathname */ 138 int swd_npages; /* #pages we can use */ 139 int swd_npginuse; /* #pages in use */ 140 int swd_npgbad; /* #pages bad */ 141 int swd_drumoffset; /* page0 offset in drum */ 142 int swd_drumsize; /* #pages in drum */ 143 blist_t swd_blist; /* blist for this swapdev */ 144 struct vnode *swd_vp; /* backing vnode */ 145 TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */ 146 147 int swd_bsize; /* blocksize (bytes) */ 148 int swd_maxactive; /* max active i/o reqs */ 149 struct bufq_state *swd_tab; /* buffer list */ 150 int swd_active; /* number of active buffers */ 151 152 volatile uint32_t *swd_encmap; /* bitmap of encrypted slots */ 153 struct aesenc swd_enckey; /* AES key expanded for enc */ 154 struct aesdec swd_deckey; /* AES key expanded for dec */ 155 bool swd_encinit; /* true if keys initialized */ 156 }; 157 158 /* 159 * swap device priority entry; the list is kept sorted on `spi_priority'. 160 */ 161 struct swappri { 162 int spi_priority; /* priority */ 163 TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev; 164 /* tailq of swapdevs at this priority */ 165 LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */ 166 }; 167 168 /* 169 * The following two structures are used to keep track of data transfers 170 * on swap devices associated with regular files. 171 * NOTE: this code is more or less a copy of vnd.c; we use the same 172 * structure names here to ease porting.. 
173 */ 174 struct vndxfer { 175 struct buf *vx_bp; /* Pointer to parent buffer */ 176 struct swapdev *vx_sdp; 177 int vx_error; 178 int vx_pending; /* # of pending aux buffers */ 179 int vx_flags; 180 #define VX_BUSY 1 181 #define VX_DEAD 2 182 }; 183 184 struct vndbuf { 185 struct buf vb_buf; 186 struct vndxfer *vb_xfer; 187 }; 188 189 /* 190 * We keep a pool of vndbuf's and vndxfer structures. 191 */ 192 static struct pool vndxfer_pool, vndbuf_pool; 193 194 /* 195 * local variables 196 */ 197 static vmem_t *swapmap; /* controls the mapping of /dev/drum */ 198 199 /* list of all active swap devices [by priority] */ 200 LIST_HEAD(swap_priority, swappri); 201 static struct swap_priority swap_priority; 202 203 /* locks */ 204 static kmutex_t uvm_swap_data_lock __cacheline_aligned; 205 static krwlock_t swap_syscall_lock; 206 bool uvm_swap_init_done = false; 207 208 /* workqueue and use counter for swap to regular files */ 209 static int sw_reg_count = 0; 210 static struct workqueue *sw_reg_workqueue; 211 212 /* tuneables */ 213 u_int uvm_swapisfull_factor = 99; 214 #if VMSWAP_DEFAULT_PLAINTEXT 215 bool uvm_swap_encrypt = false; 216 #else 217 bool uvm_swap_encrypt = true; 218 #endif 219 220 /* 221 * prototypes 222 */ 223 static struct swapdev *swapdrum_getsdp(int); 224 225 static struct swapdev *swaplist_find(struct vnode *, bool); 226 static void swaplist_insert(struct swapdev *, 227 struct swappri *, int); 228 static void swaplist_trim(void); 229 230 static int swap_on(struct lwp *, struct swapdev *); 231 static int swap_off(struct lwp *, struct swapdev *); 232 233 static void sw_reg_strategy(struct swapdev *, struct buf *, int); 234 static void sw_reg_biodone(struct buf *); 235 static void sw_reg_iodone(struct work *wk, void *dummy); 236 static void sw_reg_start(struct swapdev *); 237 238 static int uvm_swap_io(struct vm_page **, int, int, int); 239 240 static void uvm_swap_genkey(struct swapdev *); 241 static void uvm_swap_encryptpage(struct swapdev *, void *, int); 242 static void uvm_swap_decryptpage(struct swapdev *, void *, int); 243 244 static size_t 245 encmap_size(size_t npages) 246 { 247 struct swapdev *sdp; 248 const size_t bytesperword = sizeof(sdp->swd_encmap[0]); 249 const size_t bitsperword = NBBY * bytesperword; 250 const size_t nbits = npages; /* one bit for each page */ 251 const size_t nwords = howmany(nbits, bitsperword); 252 const size_t nbytes = nwords * bytesperword; 253 254 return nbytes; 255 } 256 257 /* 258 * uvm_swap_init: init the swap system data structures and locks 259 * 260 * => called at boot time from init_main.c after the filesystems 261 * are brought up (which happens after uvm_init()) 262 */ 263 void 264 uvm_swap_init(void) 265 { 266 UVMHIST_FUNC(__func__); 267 268 UVMHIST_CALLED(pdhist); 269 /* 270 * first, init the swap list, its counter, and its lock. 271 * then get a handle on the vnode for /dev/drum by using 272 * its dev_t number ("swapdev", from MD conf.c). 273 */ 274 275 LIST_INIT(&swap_priority); 276 uvmexp.nswapdev = 0; 277 rw_init(&swap_syscall_lock); 278 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE); 279 280 if (bdevvp(swapdev, &swapdev_vp)) 281 panic("%s: can't get vnode for swap device", __func__); 282 if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY)) 283 panic("%s: can't lock swap device", __func__); 284 if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED)) 285 panic("%s: can't open swap device", __func__); 286 VOP_UNLOCK(swapdev_vp); 287 288 /* 289 * create swap block resource map to map /dev/drum.
the range 290 * from 1 to INT_MAX allows 2 gigablocks of swap space. note 291 * that block 0 is reserved (used to indicate an allocation 292 * failure, or no allocation). 293 */ 294 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0, 295 VM_NOSLEEP, IPL_NONE); 296 if (swapmap == 0) { 297 panic("%s: vmem_create failed", __func__); 298 } 299 300 pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 301 NULL, IPL_BIO); 302 pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 303 NULL, IPL_BIO); 304 305 uvm_swap_init_done = true; 306 307 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0); 308 } 309 310 /* 311 * swaplist functions: functions that operate on the list of swap 312 * devices on the system. 313 */ 314 315 /* 316 * swaplist_insert: insert swap device "sdp" into the global list 317 * 318 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 319 * => caller must provide a newly allocated swappri structure (we will 320 * FREE it if we don't need it... this it to prevent allocation 321 * blocking here while adding swap) 322 */ 323 static void 324 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) 325 { 326 struct swappri *spp, *pspp; 327 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 328 329 KASSERT(rw_write_held(&swap_syscall_lock)); 330 KASSERT(mutex_owned(&uvm_swap_data_lock)); 331 332 /* 333 * find entry at or after which to insert the new device. 334 */ 335 pspp = NULL; 336 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 337 if (priority <= spp->spi_priority) 338 break; 339 pspp = spp; 340 } 341 342 /* 343 * new priority? 344 */ 345 if (spp == NULL || spp->spi_priority != priority) { 346 spp = newspp; /* use newspp! */ 347 UVMHIST_LOG(pdhist, "created new swappri = %jd", 348 priority, 0, 0, 0); 349 350 spp->spi_priority = priority; 351 TAILQ_INIT(&spp->spi_swapdev); 352 353 if (pspp) 354 LIST_INSERT_AFTER(pspp, spp, spi_swappri); 355 else 356 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); 357 } else { 358 /* we don't need a new priority structure, free it */ 359 kmem_free(newspp, sizeof(*newspp)); 360 } 361 362 /* 363 * priority found (or created). now insert on the priority's 364 * tailq list and bump the total number of swapdevs. 365 */ 366 sdp->swd_priority = priority; 367 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 368 uvmexp.nswapdev++; 369 } 370 371 /* 372 * swaplist_find: find and optionally remove a swap device from the 373 * global list. 374 * 375 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 376 * => we return the swapdev we found (and removed) 377 */ 378 static struct swapdev * 379 swaplist_find(struct vnode *vp, bool remove) 380 { 381 struct swapdev *sdp; 382 struct swappri *spp; 383 384 KASSERT(rw_lock_held(&swap_syscall_lock)); 385 KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1); 386 KASSERT(mutex_owned(&uvm_swap_data_lock)); 387 388 /* 389 * search the lists for the requested vp 390 */ 391 392 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 393 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 394 if (sdp->swd_vp == vp) { 395 if (remove) { 396 TAILQ_REMOVE(&spp->spi_swapdev, 397 sdp, swd_next); 398 uvmexp.nswapdev--; 399 } 400 return(sdp); 401 } 402 } 403 } 404 return (NULL); 405 } 406 407 /* 408 * swaplist_trim: scan priority list for empty priority entries and kill 409 * them. 
410 * 411 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 412 */ 413 static void 414 swaplist_trim(void) 415 { 416 struct swappri *spp, *nextspp; 417 418 KASSERT(rw_write_held(&swap_syscall_lock)); 419 KASSERT(mutex_owned(&uvm_swap_data_lock)); 420 421 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) { 422 if (!TAILQ_EMPTY(&spp->spi_swapdev)) 423 continue; 424 LIST_REMOVE(spp, spi_swappri); 425 kmem_free(spp, sizeof(*spp)); 426 } 427 } 428 429 /* 430 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back 431 * to the "swapdev" that maps that section of the drum. 432 * 433 * => each swapdev takes one big contig chunk of the drum 434 * => caller must hold uvm_swap_data_lock 435 */ 436 static struct swapdev * 437 swapdrum_getsdp(int pgno) 438 { 439 struct swapdev *sdp; 440 struct swappri *spp; 441 442 KASSERT(mutex_owned(&uvm_swap_data_lock)); 443 444 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 445 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 446 if (sdp->swd_flags & SWF_FAKE) 447 continue; 448 if (pgno >= sdp->swd_drumoffset && 449 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 450 return sdp; 451 } 452 } 453 } 454 return NULL; 455 } 456 457 /* 458 * swapdrum_sdp_is: true iff the swap device for pgno is sdp 459 * 460 * => for use in positive assertions only; result is not stable 461 */ 462 static bool __debugused 463 swapdrum_sdp_is(int pgno, struct swapdev *sdp) 464 { 465 bool result; 466 467 mutex_enter(&uvm_swap_data_lock); 468 result = swapdrum_getsdp(pgno) == sdp; 469 mutex_exit(&uvm_swap_data_lock); 470 471 return result; 472 } 473 474 void swapsys_lock(krw_t op) 475 { 476 rw_enter(&swap_syscall_lock, op); 477 } 478 479 void swapsys_unlock(void) 480 { 481 rw_exit(&swap_syscall_lock); 482 } 483 484 static void 485 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse) 486 { 487 se->se_dev = sdp->swd_dev; 488 se->se_flags = sdp->swd_flags; 489 se->se_nblks = sdp->swd_nblks; 490 se->se_inuse = inuse; 491 se->se_priority = sdp->swd_priority; 492 KASSERT(sdp->swd_pathlen < sizeof(se->se_path)); 493 strcpy(se->se_path, sdp->swd_path); 494 } 495 496 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) = 497 (void *)enosys; 498 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) = 499 (void *)enosys; 500 501 /* 502 * sys_swapctl: main entry point for swapctl(2) system call 503 * [with three helper functions: swap_on, swap_off and uvm_swap_stats] 504 */ 505 int 506 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, 507 register_t *retval) 508 { 509 /* { 510 syscallarg(int) cmd; 511 syscallarg(void *) arg; 512 syscallarg(int) misc; 513 } */ 514 struct vnode *vp; 515 struct nameidata nd; 516 struct swappri *spp; 517 struct swapdev *sdp; 518 #define SWAP_PATH_MAX (PATH_MAX + 1) 519 char *userpath; 520 size_t len = 0; 521 int error; 522 int priority; 523 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 524 525 /* 526 * we handle the non-priv NSWAP and STATS request first. 
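 *
 * for reference, this is roughly how userland (e.g. swapctl(8))
 * issues these two requests; illustrative only, not part of this
 * file:
 *
 *	#include <sys/swap.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct swapent *sep;
 *		int i, n;
 *
 *		if ((n = swapctl(SWAP_NSWAP, NULL, 0)) == -1)
 *			err(1, "SWAP_NSWAP");
 *		if (n == 0)
 *			return 0;
 *		if ((sep = calloc(n, sizeof(*sep))) == NULL)
 *			err(1, "calloc");
 *		if ((n = swapctl(SWAP_STATS, sep, n)) == -1)
 *			err(1, "SWAP_STATS");
 *		for (i = 0; i < n; i++)
 *			printf("%s pri %d: %d blocks, %d in use\n",
 *			    sep[i].se_path, sep[i].se_priority,
 *			    sep[i].se_nblks, sep[i].se_inuse);
 *		free(sep);
 *		return 0;
 *	}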
527 * 528 * SWAP_NSWAP: return number of config'd swap devices 529 * [can also be obtained with uvmexp sysctl] 530 */ 531 if (SCARG(uap, cmd) == SWAP_NSWAP) { 532 const int nswapdev = uvmexp.nswapdev; 533 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev, 534 0, 0, 0); 535 *retval = nswapdev; 536 return 0; 537 } 538 539 userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP); 540 541 /* 542 * ensure serialized syscall access by grabbing the swap_syscall_lock 543 */ 544 rw_enter(&swap_syscall_lock, RW_WRITER); 545 546 /* 547 * SWAP_STATS: get stats on current # of configured swap devs 548 * 549 * note that the swap_priority list can't change as long 550 * as we are holding the swap_syscall_lock. we don't want 551 * to grab the uvm_swap_data_lock because we may fault&sleep during 552 * copyout() and we don't want to be holding that lock then! 553 */ 554 switch (SCARG(uap, cmd)) { 555 case SWAP_STATS13: 556 error = (*uvm_swap_stats13)(uap, retval); 557 goto out; 558 case SWAP_STATS50: 559 error = (*uvm_swap_stats50)(uap, retval); 560 goto out; 561 case SWAP_STATS: 562 error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc), 563 NULL, sizeof(struct swapent), retval); 564 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); 565 goto out; 566 567 case SWAP_GETDUMPDEV: 568 error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev)); 569 goto out; 570 default: 571 break; 572 } 573 574 /* 575 * all other requests require superuser privs. verify. 576 */ 577 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL, 578 0, NULL, NULL, NULL))) 579 goto out; 580 581 if (SCARG(uap, cmd) == SWAP_DUMPOFF) { 582 /* drop the current dump device */ 583 dumpdev = NODEV; 584 dumpcdev = NODEV; 585 cpu_dumpconf(); 586 goto out; 587 } 588 589 /* 590 * at this point we expect a path name in arg. we will 591 * use namei() to gain a vnode reference (vref), and lock 592 * the vnode (VOP_LOCK). 593 * 594 * XXX: a NULL arg means use the root vnode pointer (e.g. for 595 * miniroot) 596 */ 597 if (SCARG(uap, arg) == NULL) { 598 vp = rootvp; /* miniroot */ 599 vref(vp); 600 if (vn_lock(vp, LK_EXCLUSIVE)) { 601 vrele(vp); 602 error = EBUSY; 603 goto out; 604 } 605 if (SCARG(uap, cmd) == SWAP_ON && 606 copystr("miniroot", userpath, SWAP_PATH_MAX, &len)) 607 panic("swapctl: miniroot copy failed"); 608 } else { 609 struct pathbuf *pb; 610 611 /* 612 * This used to allow copying in one extra byte 613 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON. 614 * This was completely pointless because if anyone 615 * used that extra byte namei would fail with 616 * ENAMETOOLONG anyway, so I've removed the excess 617 * logic. 
- dholland 20100215 618 */ 619 620 error = pathbuf_copyin(SCARG(uap, arg), &pb); 621 if (error) { 622 goto out; 623 } 624 if (SCARG(uap, cmd) == SWAP_ON) { 625 /* get a copy of the string */ 626 pathbuf_copystring(pb, userpath, SWAP_PATH_MAX); 627 len = strlen(userpath) + 1; 628 } 629 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); 630 if ((error = namei(&nd))) { 631 pathbuf_destroy(pb); 632 goto out; 633 } 634 vp = nd.ni_vp; 635 pathbuf_destroy(pb); 636 } 637 /* note: "vp" is referenced and locked */ 638 639 error = 0; /* assume no error */ 640 switch(SCARG(uap, cmd)) { 641 642 case SWAP_DUMPDEV: 643 if (vp->v_type != VBLK) { 644 error = ENOTBLK; 645 break; 646 } 647 if (bdevsw_lookup(vp->v_rdev)) { 648 dumpdev = vp->v_rdev; 649 dumpcdev = devsw_blk2chr(dumpdev); 650 } else 651 dumpdev = NODEV; 652 cpu_dumpconf(); 653 break; 654 655 case SWAP_CTL: 656 /* 657 * get new priority, remove old entry (if any) and then 658 * reinsert it in the correct place. finally, prune out 659 * any empty priority structures. 660 */ 661 priority = SCARG(uap, misc); 662 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 663 mutex_enter(&uvm_swap_data_lock); 664 if ((sdp = swaplist_find(vp, true)) == NULL) { 665 error = ENOENT; 666 } else { 667 swaplist_insert(sdp, spp, priority); 668 swaplist_trim(); 669 } 670 mutex_exit(&uvm_swap_data_lock); 671 if (error) 672 kmem_free(spp, sizeof(*spp)); 673 break; 674 675 case SWAP_ON: 676 677 /* 678 * check for duplicates. if none found, then insert a 679 * dummy entry on the list to prevent someone else from 680 * trying to enable this device while we are working on 681 * it. 682 */ 683 684 priority = SCARG(uap, misc); 685 sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP); 686 spp = kmem_alloc(sizeof(*spp), KM_SLEEP); 687 sdp->swd_flags = SWF_FAKE; 688 sdp->swd_vp = vp; 689 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV; 690 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK); 691 mutex_enter(&uvm_swap_data_lock); 692 if (swaplist_find(vp, false) != NULL) { 693 error = EBUSY; 694 mutex_exit(&uvm_swap_data_lock); 695 bufq_free(sdp->swd_tab); 696 kmem_free(sdp, sizeof(*sdp)); 697 kmem_free(spp, sizeof(*spp)); 698 break; 699 } 700 swaplist_insert(sdp, spp, priority); 701 mutex_exit(&uvm_swap_data_lock); 702 703 KASSERT(len > 0); 704 sdp->swd_pathlen = len; 705 sdp->swd_path = kmem_alloc(len, KM_SLEEP); 706 if (copystr(userpath, sdp->swd_path, len, 0) != 0) 707 panic("swapctl: copystr"); 708 709 /* 710 * we've now got a FAKE placeholder in the swap list. 711 * now attempt to enable swap on it. if we fail, undo 712 * what we've done and kill the fake entry we just inserted. 713 * if swap_on is a success, it will clear the SWF_FAKE flag 714 */ 715 716 if ((error = swap_on(l, sdp)) != 0) { 717 mutex_enter(&uvm_swap_data_lock); 718 (void) swaplist_find(vp, true); /* kill fake entry */ 719 swaplist_trim(); 720 mutex_exit(&uvm_swap_data_lock); 721 bufq_free(sdp->swd_tab); 722 kmem_free(sdp->swd_path, sdp->swd_pathlen); 723 kmem_free(sdp, sizeof(*sdp)); 724 break; 725 } 726 break; 727 728 case SWAP_OFF: 729 mutex_enter(&uvm_swap_data_lock); 730 if ((sdp = swaplist_find(vp, false)) == NULL) { 731 mutex_exit(&uvm_swap_data_lock); 732 error = ENXIO; 733 break; 734 } 735 736 /* 737 * If a device isn't in use or enabled, we 738 * can't stop swapping from it (again). 739 */ 740 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { 741 mutex_exit(&uvm_swap_data_lock); 742 error = EBUSY; 743 break; 744 } 745 746 /* 747 * do the real work. 
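 *
 * for reference, a userland sketch of how the SWAP_ON, SWAP_CTL and
 * SWAP_OFF cases in this switch are driven ("/swapfile" and the
 * priorities are hypothetical values, not part of this file);
 * SWAP_ON and SWAP_CTL take the priority in "misc", SWAP_OFF
 * ignores it:
 *
 *	#include <sys/swap.h>
 *	#include <err.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		char path[] = "/swapfile";
 *
 *		if (swapctl(SWAP_ON, path, 5) == -1)
 *			err(1, "SWAP_ON");
 *		if (swapctl(SWAP_CTL, path, 1) == -1)
 *			err(1, "SWAP_CTL");
 *		if (swapctl(SWAP_OFF, path, 0) == -1)
 *			err(1, "SWAP_OFF");
 *		return 0;
 *	}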
748 */ 749 error = swap_off(l, sdp); 750 break; 751 752 default: 753 error = EINVAL; 754 } 755 756 /* 757 * done! release the ref gained by namei() and unlock. 758 */ 759 vput(vp); 760 out: 761 rw_exit(&swap_syscall_lock); 762 kmem_free(userpath, SWAP_PATH_MAX); 763 764 UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0); 765 return (error); 766 } 767 768 /* 769 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept 770 * away from sys_swapctl() in order to allow COMPAT_* swapctl() 771 * emulation to use it directly without going through sys_swapctl(). 772 * The problem with using sys_swapctl() there is that it involves 773 * copying the swapent array to the stackgap, and this array's size 774 * is not known at build time. Hence it would not be possible to 775 * ensure it would fit in the stackgap in any case. 776 */ 777 int 778 uvm_swap_stats(char *ptr, int misc, 779 void (*f)(void *, const struct swapent *), size_t len, 780 register_t *retval) 781 { 782 struct swappri *spp; 783 struct swapdev *sdp, **sdps, **sp; 784 struct swapent sep; 785 size_t sdpsize = 0; 786 struct swapdev *stackbuf[8]; /* magic 8, any number >1 will do */ 787 int count, slots; 788 int error; 789 790 KASSERT(len <= sizeof(sep)); 791 if (len == 0) 792 return ENOSYS; 793 794 if (misc < 0) 795 return EINVAL; 796 797 if (misc == 0 || uvmexp.nswapdev == 0) 798 return 0; 799 800 KASSERT(rw_lock_held(&swap_syscall_lock)); 801 802 /* 803 * Allocate space (slots) for pointers to all swapdevs 804 * 805 * This needs to be done here (not earlier) (and so needs 806 * the unlock/lock dance) because of the way the various 807 * compat functions work. 808 */ 809 sdps = NULL; 810 slots = uvmexp.nswapdev; 811 812 if (slots > misc) /* we never need more than requested */ 813 slots = misc; 814 815 /* 816 * Nb: do not limit misc to <= uvmexp.nswapdev yet, 817 * as the latter might get bigger (or smaller) 818 */ 819 820 if ((SIZE_T_MAX / sizeof sdp) <= misc) /* unlikely */ 821 return E2BIG; 822 823 /* 824 * One slot for each currently existing swap device, but 825 * limited (above) to no more than the request wants (misc). 826 * Each slot needs space for a pointer to a swapdev. 827 */ 828 sdpsize = (size_t)slots * sizeof sdp; 829 830 /* 831 * Borrow from kmem_tmpbuf_alloc(9) but don't use that 832 * so we don't need to do the unlock dance unnecessarily 833 */ 834 if (sdpsize <= sizeof stackbuf) { 835 /* Should be the common case */ 836 sdps = stackbuf; 837 } else { 838 rw_exit(&swap_syscall_lock); 839 840 sdps = kmem_alloc(sdpsize, KM_SLEEP); 841 842 rw_enter(&swap_syscall_lock, RW_READER); 843 844 /* 845 * At this point, 3 possibilities. 846 * 847 * 1. uvmexp.nswapdev has increased. 848 * 849 * A new swap device got added. That's OK, just ignore the 850 * excess device(s), and return the first N (the number that 851 * were there when we started). 852 * 853 * 2. uvmexp.nswapdev has decreased. 854 * 855 * A swap device was deleted. In this case we will return 856 * less devices than requested but that's OK. We will have 857 * more slot memory than is needed to save them all, but just 858 * a little more, and it gets freed just below. 859 * 860 * 3. uvmexp.nswapdev hasn't changed. 861 * 862 * This will be the usual case; no swapctl operations occurred 863 * while the lock was released, or possibly a device was 864 * deleted and another added - that's irrelevant. At this 865 * point all that matters is the number of devices, we haven't 866 * looked at the lists yet. 
867 * 868 * So we never need to adjust this allocation. 869 * 870 * And we don't need to look at uvmexp.nswapdev again! 871 */ 872 } 873 874 KASSERT(rw_lock_held(&swap_syscall_lock)); 875 876 /* 877 * Collect all of the swap descriptors, while holding the data lock, 878 * so the lists cannot change. Then they can be used safely. 879 * 880 * Entries cannot be deleted, because swap_syscall_lock is held, 881 * but the lists holding them can be reordered except in this small 882 * loop where we lock out that kind of activity. No processing 883 * happens here, this is fast, with no func calls, or anything which 884 * might perform operations which might need the lock. 885 */ 886 mutex_enter(&uvm_swap_data_lock); 887 sp = sdps; 888 count = 0; 889 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 890 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 891 if (++count <= slots) 892 *sp++ = sdp; 893 /* 894 * don't bother with exiting the loops early, 895 * the lists tend to be very short, and not 896 * exhausting them is a very rare occurrence. 897 * So just loop and do nothing (but count) in 898 * the odd case we could have broken out early. 899 */ 900 } 901 } 902 mutex_exit(&uvm_swap_data_lock); 903 904 /* 905 * Now we have a stable list of devices which cannot change, 906 * even if the swapping lists are reordered. 907 */ 908 909 if (misc > slots) /* the number of storage slots */ 910 misc = slots; 911 if (misc > count) /* the number of devices now */ 912 misc = count; 913 914 /* 915 * This is the actual work of uvm_swap_stats() - above was bookkeeping. 916 */ 917 error = 0; 918 count = 0; 919 sp = sdps; 920 while (misc-- > 0) { 921 int inuse; 922 923 sdp = *sp++; /* The next swapdev, from the next slot */ 924 925 inuse = btodb((uint64_t)sdp->swd_npginuse << 926 PAGE_SHIFT); 927 928 memset(&sep, 0, sizeof(sep)); 929 swapent_cvt(&sep, sdp, inuse); 930 if (f) 931 (*f)(&sep, &sep); 932 if ((error = copyout(&sep, ptr, len)) != 0) 933 goto out; 934 ptr += len; 935 count++; 936 } 937 *retval = count; 938 out:; 939 if (sdps != stackbuf) { 940 /* 941 * XXX should unlock & lock again here probably, 942 * but for now, no... 943 */ 944 kmem_free(sdps, sdpsize); 945 } 946 return error; 947 } 948 949 /* 950 * swap_on: attempt to enable a swapdev for swapping. note that the 951 * swapdev is already on the global list, but disabled (marked 952 * SWF_FAKE). 953 * 954 * => we avoid the start of the disk (to protect disk labels) 955 * => we also avoid the miniroot, if we are swapping to root. 956 * => caller should leave uvm_swap_data_lock unlocked, we may lock it 957 * if needed. 958 */ 959 static int 960 swap_on(struct lwp *l, struct swapdev *sdp) 961 { 962 struct vnode *vp; 963 int error, npages, nblocks, size; 964 long addr; 965 vmem_addr_t result; 966 struct vattr va; 967 dev_t dev; 968 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 969 970 /* 971 * we want to enable swapping on sdp. the swd_vp contains 972 * the vnode we want (locked and ref'd), and the swd_dev 973 * contains the dev_t of the file, if it a block device. 974 */ 975 976 vp = sdp->swd_vp; 977 dev = sdp->swd_dev; 978 979 /* 980 * open the swap file (mostly useful for block device files to 981 * let device driver know what is up). 982 * 983 * we skip the open/close for root on swap because the root 984 * has already been opened when root was mounted (mountroot). 
985 */ 986 if (vp != rootvp) { 987 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred))) 988 return (error); 989 } 990 991 /* XXX this only works for block devices */ 992 UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0); 993 994 /* 995 * we now need to determine the size of the swap area. for 996 * block specials we can call the d_psize function. 997 * for normal files, we must stat [get attrs]. 998 * 999 * we put the result in nblks. 1000 * for normal files, we also want the filesystem block size 1001 * (which we get with statfs). 1002 */ 1003 switch (vp->v_type) { 1004 case VBLK: 1005 if ((nblocks = bdev_size(dev)) == -1) { 1006 error = ENXIO; 1007 goto bad; 1008 } 1009 break; 1010 1011 case VREG: 1012 if ((error = VOP_GETATTR(vp, &va, l->l_cred))) 1013 goto bad; 1014 nblocks = (int)btodb(va.va_size); 1015 sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift; 1016 /* 1017 * limit the max # of outstanding I/O requests we issue 1018 * at any one time. take it easy on NFS servers. 1019 */ 1020 if (vp->v_tag == VT_NFS) 1021 sdp->swd_maxactive = 2; /* XXX */ 1022 else 1023 sdp->swd_maxactive = 8; /* XXX */ 1024 break; 1025 1026 default: 1027 error = ENXIO; 1028 goto bad; 1029 } 1030 1031 /* 1032 * save nblocks in a safe place and convert to pages. 1033 */ 1034 1035 sdp->swd_nblks = nblocks; 1036 npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT; 1037 1038 /* 1039 * for block special files, we want to make sure that we leave 1040 * the disklabel and bootblocks alone, so we arrange to skip 1041 * over them (arbitrarily choosing to skip PAGE_SIZE bytes). 1042 * note that because of this the "size" can be less than the 1043 * actual number of blocks on the device. 1044 */ 1045 if (vp->v_type == VBLK) { 1046 /* we use pages 1 to (size - 1) [inclusive] */ 1047 size = npages - 1; 1048 addr = 1; 1049 } else { 1050 /* we use pages 0 to (size - 1) [inclusive] */ 1051 size = npages; 1052 addr = 0; 1053 } 1054 1055 /* 1056 * make sure we have enough blocks for a reasonably sized swap 1057 * area. we want at least one page. 1058 */ 1059 1060 if (size < 1) { 1061 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0); 1062 error = EINVAL; 1063 goto bad; 1064 } 1065 1066 UVMHIST_LOG(pdhist," dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0); 1067 1068 /* 1069 * now we need to allocate a blist to manage this swap device 1070 */ 1071 1072 sdp->swd_blist = blist_create(npages); 1073 /* mark all except the `saved' region free. */ 1074 blist_free(sdp->swd_blist, addr, size); 1075 1076 /* 1077 * allocate space for swap encryption state and mark the 1078 * keys uninitialized so we generate them lazily 1079 */ 1080 sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP); 1081 sdp->swd_encinit = false; 1082 1083 /* 1084 * if the vnode we are swapping to is the root vnode 1085 * (i.e. we are swapping to the miniroot) then we want 1086 * to make sure we don't overwrite it. do a statfs to 1087 * find its size and skip over it. 1088 */ 1089 if (vp == rootvp) { 1090 struct mount *mp; 1091 struct statvfs *sp; 1092 int rootblocks, rootpages; 1093 1094 mp = rootvnode->v_mount; 1095 sp = &mp->mnt_stat; 1096 rootblocks = sp->f_blocks * btodb(sp->f_frsize); 1097 /* 1098 * XXX: sp->f_blocks isn't the total number of 1099 * blocks in the filesystem, it's the number of 1100 * data blocks. so, our rootblocks almost 1101 * definitely underestimates the total size 1102 * of the filesystem - how badly depends on the 1103 * details of the filesystem type.
there isn't 1104 * an obvious way to deal with this cleanly 1105 * and perfectly, so for now we just pad our 1106 * rootblocks estimate with an extra 5 percent. 1107 */ 1108 rootblocks += (rootblocks >> 5) + 1109 (rootblocks >> 6) + 1110 (rootblocks >> 7); 1111 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; 1112 if (rootpages > size) 1113 panic("swap_on: miniroot larger than swap?"); 1114 1115 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { 1116 panic("swap_on: unable to preserve miniroot"); 1117 } 1118 1119 size -= rootpages; 1120 printf("Preserved %d pages of miniroot ", rootpages); 1121 printf("leaving %d pages of swap\n", size); 1122 } 1123 1124 /* 1125 * add a ref to vp to reflect usage as a swap device. 1126 */ 1127 vref(vp); 1128 1129 /* 1130 * now add the new swapdev to the drum and enable. 1131 */ 1132 error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result); 1133 if (error != 0) 1134 panic("swapdrum_add"); 1135 /* 1136 * If this is the first regular swap create the workqueue. 1137 * => Protected by swap_syscall_lock. 1138 */ 1139 if (vp->v_type != VBLK) { 1140 if (sw_reg_count++ == 0) { 1141 KASSERT(sw_reg_workqueue == NULL); 1142 if (workqueue_create(&sw_reg_workqueue, "swapiod", 1143 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0) 1144 panic("%s: workqueue_create failed", __func__); 1145 } 1146 } 1147 1148 sdp->swd_drumoffset = (int)result; 1149 sdp->swd_drumsize = npages; 1150 sdp->swd_npages = size; 1151 mutex_enter(&uvm_swap_data_lock); 1152 sdp->swd_flags &= ~SWF_FAKE; /* going live */ 1153 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); 1154 uvmexp.swpages += size; 1155 uvmexp.swpgavail += size; 1156 mutex_exit(&uvm_swap_data_lock); 1157 return (0); 1158 1159 /* 1160 * failure: clean up and return error. 1161 */ 1162 1163 bad: 1164 if (sdp->swd_blist) { 1165 blist_destroy(sdp->swd_blist); 1166 } 1167 if (vp != rootvp) { 1168 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred); 1169 } 1170 return (error); 1171 } 1172 1173 /* 1174 * swap_off: stop swapping on swapdev 1175 * 1176 * => swap data should be locked, we will unlock. 1177 */ 1178 static int 1179 swap_off(struct lwp *l, struct swapdev *sdp) 1180 { 1181 int npages = sdp->swd_npages; 1182 int error = 0; 1183 1184 UVMHIST_FUNC(__func__); 1185 UVMHIST_CALLARGS(pdhist, 1186 " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0); 1187 1188 KASSERT(rw_write_held(&swap_syscall_lock)); 1189 KASSERT(mutex_owned(&uvm_swap_data_lock)); 1190 1191 /* disable the swap area being removed */ 1192 sdp->swd_flags &= ~SWF_ENABLE; 1193 uvmexp.swpgavail -= npages; 1194 mutex_exit(&uvm_swap_data_lock); 1195 1196 /* 1197 * the idea is to find all the pages that are paged out to this 1198 * device, and page them all in. in uvm, swap-backed pageable 1199 * memory can take two forms: aobjs and anons. call the 1200 * swapoff hook for each subsystem to bring in pages. 1201 */ 1202 1203 if (uao_swap_off(sdp->swd_drumoffset, 1204 sdp->swd_drumoffset + sdp->swd_drumsize) || 1205 amap_swap_off(sdp->swd_drumoffset, 1206 sdp->swd_drumoffset + sdp->swd_drumsize)) { 1207 error = ENOMEM; 1208 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 1209 error = EBUSY; 1210 } 1211 1212 if (error) { 1213 mutex_enter(&uvm_swap_data_lock); 1214 sdp->swd_flags |= SWF_ENABLE; 1215 uvmexp.swpgavail += npages; 1216 mutex_exit(&uvm_swap_data_lock); 1217 1218 return error; 1219 } 1220 1221 /* 1222 * If this is the last regular swap destroy the workqueue. 1223 * => Protected by swap_syscall_lock. 
1224 */ 1225 if (sdp->swd_vp->v_type != VBLK) { 1226 KASSERT(sw_reg_count > 0); 1227 KASSERT(sw_reg_workqueue != NULL); 1228 if (--sw_reg_count == 0) { 1229 workqueue_destroy(sw_reg_workqueue); 1230 sw_reg_workqueue = NULL; 1231 } 1232 } 1233 1234 /* 1235 * done with the vnode. 1236 * drop our ref on the vnode before calling VOP_CLOSE() 1237 * so that spec_close() can tell if this is the last close. 1238 */ 1239 vrele(sdp->swd_vp); 1240 if (sdp->swd_vp != rootvp) { 1241 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); 1242 } 1243 1244 mutex_enter(&uvm_swap_data_lock); 1245 uvmexp.swpages -= npages; 1246 KASSERTMSG(uvmexp.swpginuse >= sdp->swd_npgbad, 1247 "swpginuse %d sdp->swd_npgbad %d", 1248 uvmexp.swpginuse, sdp->swd_npgbad); 1249 uvmexp.swpginuse -= sdp->swd_npgbad; 1250 1251 if (swaplist_find(sdp->swd_vp, true) == NULL) 1252 panic("%s: swapdev not in list", __func__); 1253 swaplist_trim(); 1254 mutex_exit(&uvm_swap_data_lock); 1255 1256 /* 1257 * free all resources! 1258 */ 1259 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); 1260 blist_destroy(sdp->swd_blist); 1261 bufq_free(sdp->swd_tab); 1262 kmem_free(__UNVOLATILE(sdp->swd_encmap), 1263 encmap_size(sdp->swd_drumsize)); 1264 explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey); 1265 explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey); 1266 kmem_free(sdp, sizeof(*sdp)); 1267 return (0); 1268 } 1269 1270 void 1271 uvm_swap_shutdown(struct lwp *l) 1272 { 1273 struct swapdev *sdp; 1274 struct swappri *spp; 1275 struct vnode *vp; 1276 int error; 1277 1278 if (!uvm_swap_init_done || uvmexp.nswapdev == 0) 1279 return; 1280 printf("turning off swap..."); 1281 rw_enter(&swap_syscall_lock, RW_WRITER); 1282 mutex_enter(&uvm_swap_data_lock); 1283 again: 1284 LIST_FOREACH(spp, &swap_priority, spi_swappri) 1285 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1286 if (sdp->swd_flags & SWF_FAKE) 1287 continue; 1288 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) 1289 continue; 1290 #ifdef DEBUG 1291 printf("\nturning off swap on %s...", sdp->swd_path); 1292 #endif 1293 /* Have to lock and reference vnode for swap_off(). */ 1294 vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY); 1295 vref(vp); 1296 error = swap_off(l, sdp); 1297 vput(vp); 1298 mutex_enter(&uvm_swap_data_lock); 1299 if (error) { 1300 printf("stopping swap on %s failed " 1301 "with error %d\n", sdp->swd_path, error); 1302 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1303 uvmexp.nswapdev--; 1304 swaplist_trim(); 1305 } 1306 goto again; 1307 } 1308 printf(" done\n"); 1309 mutex_exit(&uvm_swap_data_lock); 1310 rw_exit(&swap_syscall_lock); 1311 } 1312 1313 1314 /* 1315 * /dev/drum interface and i/o functions 1316 */ 1317 1318 /* 1319 * swopen: allow the initial open from uvm_swap_init() and reject all others. 1320 */ 1321 1322 static int 1323 swopen(dev_t dev, int flag, int mode, struct lwp *l) 1324 { 1325 static bool inited = false; 1326 1327 if (!inited) { 1328 inited = true; 1329 return 0; 1330 } 1331 return ENODEV; 1332 } 1333 1334 /* 1335 * swstrategy: perform I/O on the drum 1336 * 1337 * => we must map the i/o request from the drum to the correct swapdev. 1338 */ 1339 static void 1340 swstrategy(struct buf *bp) 1341 { 1342 struct swapdev *sdp; 1343 struct vnode *vp; 1344 int pageno, bn; 1345 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1346 1347 /* 1348 * convert block number to swapdev. note that swapdev can't 1349 * be yanked out from under us because we are holding resources 1350 * in it (i.e. the blocks we are doing I/O on). 
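 *
 * for clarity, the address arithmetic used below collected into one
 * hypothetical helper (not part of this file): a drum block number
 * is scaled up to a drum page, the owning swapdev covers drum pages
 * [swd_drumoffset, swd_drumoffset + swd_drumsize), and the
 * device-relative page is scaled back down to a disk block.  e.g.
 * with 4096-byte pages and DEV_BSIZE 512, drum page N occupies drum
 * blocks [8N, 8N + 8).
 *
 *	static daddr_t
 *	drum_to_devblk(const struct swapdev *sdp, daddr_t drumblk)
 *	{
 *		int pg = dbtob((int64_t)drumblk) >> PAGE_SHIFT;
 *
 *		KASSERT(pg >= sdp->swd_drumoffset);
 *		KASSERT(pg < sdp->swd_drumoffset + sdp->swd_drumsize);
 *		return btodb((uint64_t)(pg - sdp->swd_drumoffset)
 *		    << PAGE_SHIFT);
 *	}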
1351 */ 1352 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1353 mutex_enter(&uvm_swap_data_lock); 1354 sdp = swapdrum_getsdp(pageno); 1355 mutex_exit(&uvm_swap_data_lock); 1356 if (sdp == NULL) { 1357 bp->b_error = EINVAL; 1358 bp->b_resid = bp->b_bcount; 1359 biodone(bp); 1360 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); 1361 return; 1362 } 1363 1364 /* 1365 * convert drum page number to block number on this swapdev. 1366 */ 1367 1368 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1369 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1370 1371 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd", 1372 ((bp->b_flags & B_READ) == 0) ? 1 : 0, 1373 sdp->swd_drumoffset, bn, bp->b_bcount); 1374 1375 /* 1376 * for block devices we finish up here. 1377 * for regular files we have to do more work which we delegate 1378 * to sw_reg_strategy(). 1379 */ 1380 1381 vp = sdp->swd_vp; /* swapdev vnode pointer */ 1382 switch (vp->v_type) { 1383 default: 1384 panic("%s: vnode type 0x%x", __func__, vp->v_type); 1385 1386 case VBLK: 1387 1388 /* 1389 * must convert "bp" from an I/O on /dev/drum to an I/O 1390 * on the swapdev (sdp). 1391 */ 1392 bp->b_blkno = bn; /* swapdev block number */ 1393 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ 1394 1395 /* 1396 * if we are doing a write, we have to redirect the i/o on 1397 * drum's v_numoutput counter to the swapdevs. 1398 */ 1399 if ((bp->b_flags & B_READ) == 0) { 1400 mutex_enter(bp->b_objlock); 1401 vwakeup(bp); /* kills one 'v_numoutput' on drum */ 1402 mutex_exit(bp->b_objlock); 1403 mutex_enter(vp->v_interlock); 1404 vp->v_numoutput++; /* put it on swapdev */ 1405 mutex_exit(vp->v_interlock); 1406 } 1407 1408 /* 1409 * finally plug in swapdev vnode and start I/O 1410 */ 1411 bp->b_vp = vp; 1412 bp->b_objlock = vp->v_interlock; 1413 VOP_STRATEGY(vp, bp); 1414 return; 1415 1416 case VREG: 1417 /* 1418 * delegate to sw_reg_strategy function. 
1419 */ 1420 sw_reg_strategy(sdp, bp, bn); 1421 return; 1422 } 1423 /* NOTREACHED */ 1424 } 1425 1426 /* 1427 * swread: the read function for the drum (just a call to physio) 1428 */ 1429 /*ARGSUSED*/ 1430 static int 1431 swread(dev_t dev, struct uio *uio, int ioflag) 1432 { 1433 UVMHIST_FUNC(__func__); 1434 UVMHIST_CALLARGS(pdhist, 1435 " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1436 1437 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); 1438 } 1439 1440 /* 1441 * swwrite: the write function for the drum (just a call to physio) 1442 */ 1443 /*ARGSUSED*/ 1444 static int 1445 swwrite(dev_t dev, struct uio *uio, int ioflag) 1446 { 1447 UVMHIST_FUNC(__func__); 1448 UVMHIST_CALLARGS(pdhist, 1449 " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1450 1451 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); 1452 } 1453 1454 const struct bdevsw swap_bdevsw = { 1455 .d_open = swopen, 1456 .d_close = noclose, 1457 .d_strategy = swstrategy, 1458 .d_ioctl = noioctl, 1459 .d_dump = nodump, 1460 .d_psize = nosize, 1461 .d_discard = nodiscard, 1462 .d_flag = D_OTHER 1463 }; 1464 1465 const struct cdevsw swap_cdevsw = { 1466 .d_open = nullopen, 1467 .d_close = nullclose, 1468 .d_read = swread, 1469 .d_write = swwrite, 1470 .d_ioctl = noioctl, 1471 .d_stop = nostop, 1472 .d_tty = notty, 1473 .d_poll = nopoll, 1474 .d_mmap = nommap, 1475 .d_kqfilter = nokqfilter, 1476 .d_discard = nodiscard, 1477 .d_flag = D_OTHER, 1478 }; 1479 1480 /* 1481 * sw_reg_strategy: handle swap i/o to regular files 1482 */ 1483 static void 1484 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1485 { 1486 struct vnode *vp; 1487 struct vndxfer *vnx; 1488 daddr_t nbn; 1489 char *addr; 1490 off_t byteoff; 1491 int s, off, nra, error, sz, resid; 1492 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1493 1494 /* 1495 * allocate a vndxfer head for this transfer and point it to 1496 * our buffer. 1497 */ 1498 vnx = pool_get(&vndxfer_pool, PR_WAITOK); 1499 vnx->vx_flags = VX_BUSY; 1500 vnx->vx_error = 0; 1501 vnx->vx_pending = 0; 1502 vnx->vx_bp = bp; 1503 vnx->vx_sdp = sdp; 1504 1505 /* 1506 * setup for main loop where we read filesystem blocks into 1507 * our buffer. 1508 */ 1509 error = 0; 1510 bp->b_resid = bp->b_bcount; /* nothing transferred yet! */ 1511 addr = bp->b_data; /* current position in buffer */ 1512 byteoff = dbtob((uint64_t)bn); 1513 1514 for (resid = bp->b_resid; resid; resid -= sz) { 1515 struct vndbuf *nbp; 1516 1517 /* 1518 * translate byteoffset into block number. return values: 1519 * vp = vnode of underlying device 1520 * nbn = new block number (on underlying vnode dev) 1521 * nra = num blocks we can read-ahead (excludes requested 1522 * block) 1523 */ 1524 nra = 0; 1525 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, 1526 &vp, &nbn, &nra); 1527 1528 if (error == 0 && nbn == (daddr_t)-1) { 1529 /* 1530 * this used to just set error, but that doesn't 1531 * do the right thing. Instead, it causes random 1532 * memory errors. The panic() should remain until 1533 * this condition doesn't destabilize the system. 1534 */ 1535 #if 1 1536 panic("%s: swap to sparse file", __func__); 1537 #else 1538 error = EIO; /* failure */ 1539 #endif 1540 } 1541 1542 /* 1543 * punt if there was an error or a hole in the file. 1544 * we must wait for any i/o ops we have already started 1545 * to finish before returning. 1546 * 1547 * XXX we could deal with holes here but it would be 1548 * a hassle (in the write case). 
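 *
 * the size computation just below works out like this: with a
 * hypothetical swd_bsize of 8192 and byteoff = 20480, VOP_BMAP is
 * asked about filesystem block 20480 / 8192 = 2; if it reports one
 * block of read-ahead (nra = 1), then off = 20480 % 8192 = 4096 and
 * sz = (1 + 1) * 8192 - 4096 = 12288, i.e. this chunk covers the
 * rest of block 2 plus all of block 3, clamped to the bytes still
 * remaining in the caller's buffer.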
1549 */ 1550 if (error) { 1551 s = splbio(); 1552 vnx->vx_error = error; /* pass error up */ 1553 goto out; 1554 } 1555 1556 /* 1557 * compute the size ("sz") of this transfer (in bytes). 1558 */ 1559 off = byteoff % sdp->swd_bsize; 1560 sz = (1 + nra) * sdp->swd_bsize - off; 1561 if (sz > resid) 1562 sz = resid; 1563 1564 UVMHIST_LOG(pdhist, "sw_reg_strategy: " 1565 "vp %#jx/%#jx offset %#jx/%#jx", 1566 (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn); 1567 1568 /* 1569 * now get a buf structure. note that the vb_buf is 1570 * at the front of the nbp structure so that you can 1571 * cast pointers between the two structure easily. 1572 */ 1573 nbp = pool_get(&vndbuf_pool, PR_WAITOK); 1574 buf_init(&nbp->vb_buf); 1575 nbp->vb_buf.b_flags = bp->b_flags; 1576 nbp->vb_buf.b_cflags = bp->b_cflags; 1577 nbp->vb_buf.b_oflags = bp->b_oflags; 1578 nbp->vb_buf.b_bcount = sz; 1579 nbp->vb_buf.b_bufsize = sz; 1580 nbp->vb_buf.b_error = 0; 1581 nbp->vb_buf.b_data = addr; 1582 nbp->vb_buf.b_lblkno = 0; 1583 nbp->vb_buf.b_blkno = nbn + btodb(off); 1584 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; 1585 nbp->vb_buf.b_iodone = sw_reg_biodone; 1586 nbp->vb_buf.b_vp = vp; 1587 nbp->vb_buf.b_objlock = vp->v_interlock; 1588 if (vp->v_type == VBLK) { 1589 nbp->vb_buf.b_dev = vp->v_rdev; 1590 } 1591 1592 nbp->vb_xfer = vnx; /* patch it back in to vnx */ 1593 1594 /* 1595 * Just sort by block number 1596 */ 1597 s = splbio(); 1598 if (vnx->vx_error != 0) { 1599 buf_destroy(&nbp->vb_buf); 1600 pool_put(&vndbuf_pool, nbp); 1601 goto out; 1602 } 1603 vnx->vx_pending++; 1604 1605 /* sort it in and start I/O if we are not over our limit */ 1606 /* XXXAD locking */ 1607 bufq_put(sdp->swd_tab, &nbp->vb_buf); 1608 sw_reg_start(sdp); 1609 splx(s); 1610 1611 /* 1612 * advance to the next I/O 1613 */ 1614 byteoff += sz; 1615 addr += sz; 1616 } 1617 1618 s = splbio(); 1619 1620 out: /* Arrive here at splbio */ 1621 vnx->vx_flags &= ~VX_BUSY; 1622 if (vnx->vx_pending == 0) { 1623 error = vnx->vx_error; 1624 pool_put(&vndxfer_pool, vnx); 1625 if (error) { 1626 bp->b_resid = bp->b_bcount; 1627 bp->b_error = error; 1628 } 1629 biodone(bp); 1630 } 1631 splx(s); 1632 } 1633 1634 /* 1635 * sw_reg_start: start an I/O request on the requested swapdev 1636 * 1637 * => reqs are sorted by b_rawblkno (above) 1638 */ 1639 static void 1640 sw_reg_start(struct swapdev *sdp) 1641 { 1642 struct buf *bp; 1643 struct vnode *vp; 1644 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1645 1646 /* recursion control */ 1647 if ((sdp->swd_flags & SWF_BUSY) != 0) 1648 return; 1649 1650 sdp->swd_flags |= SWF_BUSY; 1651 1652 while (sdp->swd_active < sdp->swd_maxactive) { 1653 bp = bufq_get(sdp->swd_tab); 1654 if (bp == NULL) 1655 break; 1656 sdp->swd_active++; 1657 1658 UVMHIST_LOG(pdhist, 1659 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx", 1660 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno, 1661 bp->b_bcount); 1662 vp = bp->b_vp; 1663 KASSERT(bp->b_objlock == vp->v_interlock); 1664 if ((bp->b_flags & B_READ) == 0) { 1665 mutex_enter(vp->v_interlock); 1666 vp->v_numoutput++; 1667 mutex_exit(vp->v_interlock); 1668 } 1669 VOP_STRATEGY(vp, bp); 1670 } 1671 sdp->swd_flags &= ~SWF_BUSY; 1672 } 1673 1674 /* 1675 * sw_reg_biodone: one of our i/o's has completed 1676 */ 1677 static void 1678 sw_reg_biodone(struct buf *bp) 1679 { 1680 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); 1681 } 1682 1683 /* 1684 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1685 * 1686 * => note that we can recover the vndbuf 
struct by casting the buf ptr 1687 */ 1688 static void 1689 sw_reg_iodone(struct work *wk, void *dummy) 1690 { 1691 struct vndbuf *vbp = (void *)wk; 1692 struct vndxfer *vnx = vbp->vb_xfer; 1693 struct buf *pbp = vnx->vx_bp; /* parent buffer */ 1694 struct swapdev *sdp = vnx->vx_sdp; 1695 int s, resid, error; 1696 KASSERT(&vbp->vb_buf.b_work == wk); 1697 UVMHIST_FUNC(__func__); 1698 UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx", 1699 (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, 1700 (uintptr_t)vbp->vb_buf.b_data); 1701 UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx", 1702 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); 1703 1704 /* 1705 * protect vbp at splbio and update. 1706 */ 1707 1708 s = splbio(); 1709 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; 1710 pbp->b_resid -= resid; 1711 vnx->vx_pending--; 1712 1713 if (vbp->vb_buf.b_error != 0) { 1714 /* pass error upward */ 1715 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO; 1716 UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0); 1717 vnx->vx_error = error; 1718 } 1719 1720 /* 1721 * kill vbp structure 1722 */ 1723 buf_destroy(&vbp->vb_buf); 1724 pool_put(&vndbuf_pool, vbp); 1725 1726 /* 1727 * wrap up this transaction if it has run to completion or, in 1728 * case of an error, when all auxiliary buffers have returned. 1729 */ 1730 if (vnx->vx_error != 0) { 1731 /* pass error upward */ 1732 error = vnx->vx_error; 1733 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { 1734 pbp->b_error = error; 1735 pbp->b_resid = pbp->b_bcount; 1736 biodone(pbp); 1737 pool_put(&vndxfer_pool, vnx); 1738 } 1739 } else if (pbp->b_resid == 0) { 1740 KASSERT(vnx->vx_pending == 0); 1741 if ((vnx->vx_flags & VX_BUSY) == 0) { 1742 UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !", 1743 (uintptr_t)pbp, vnx->vx_error, 0, 0); 1744 biodone(pbp); 1745 pool_put(&vndxfer_pool, vnx); 1746 } 1747 } 1748 1749 /* 1750 * done! start next swapdev I/O if one is pending 1751 */ 1752 sdp->swd_active--; 1753 sw_reg_start(sdp); 1754 splx(s); 1755 } 1756 1757 1758 /* 1759 * uvm_swap_alloc: allocate space on swap 1760 * 1761 * => allocation is done "round robin" down the priority list, as we 1762 * allocate in a priority we "rotate" the circle queue. 1763 * => space can be freed with uvm_swap_free 1764 * => we return the page slot number in /dev/drum (0 == invalid slot) 1765 * => we lock uvm_swap_data_lock 1766 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1767 */ 1768 int 1769 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) 1770 { 1771 struct swapdev *sdp; 1772 struct swappri *spp; 1773 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1774 1775 /* 1776 * no swap devices configured yet? definite failure. 1777 */ 1778 if (uvmexp.nswapdev < 1) 1779 return 0; 1780 1781 /* 1782 * XXXJAK: BEGIN HACK 1783 * 1784 * blist_alloc() in subr_blist.c will panic if we try to allocate 1785 * too many slots. 
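 *
 * for reference, the blist(9) lifecycle used for swap slots,
 * gathered into one hypothetical function (a sketch only, mirroring
 * swap_on(), uvm_swap_free() and swap_off() elsewhere in this file);
 * blist_create() hands back a map with every slot allocated, so the
 * usable range must be blist_free()'d before anything can be
 * allocated from it:
 *
 *	static void
 *	blist_example(int npages, int addr, int size, int nslots)
 *	{
 *		blist_t bl;
 *		uint64_t slot;
 *
 *		bl = blist_create(npages);
 *		blist_free(bl, addr, size);
 *		slot = blist_alloc(bl, nslots);
 *		if (slot != BLIST_NONE)
 *			blist_free(bl, slot, nslots);
 *		blist_destroy(bl);
 *	}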
1786 */ 1787 if (*nslots > BLIST_MAX_ALLOC) { 1788 if (__predict_false(lessok == false)) 1789 return 0; 1790 *nslots = BLIST_MAX_ALLOC; 1791 } 1792 /* XXXJAK: END HACK */ 1793 1794 /* 1795 * lock data lock, convert slots into blocks, and enter loop 1796 */ 1797 mutex_enter(&uvm_swap_data_lock); 1798 1799 ReTry: /* XXXMRG */ 1800 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1801 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1802 uint64_t result; 1803 1804 /* if it's not enabled, then we can't swap from it */ 1805 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1806 continue; 1807 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1808 continue; 1809 result = blist_alloc(sdp->swd_blist, *nslots); 1810 if (result == BLIST_NONE) { 1811 continue; 1812 } 1813 KASSERT(result < sdp->swd_drumsize); 1814 1815 /* 1816 * successful allocation! now rotate the tailq. 1817 */ 1818 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1819 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1820 sdp->swd_npginuse += *nslots; 1821 uvmexp.swpginuse += *nslots; 1822 mutex_exit(&uvm_swap_data_lock); 1823 /* done! return drum slot number */ 1824 UVMHIST_LOG(pdhist, 1825 "success! returning %jd slots starting at %jd", 1826 *nslots, result + sdp->swd_drumoffset, 0, 0); 1827 return (result + sdp->swd_drumoffset); 1828 } 1829 } 1830 1831 /* XXXMRG: BEGIN HACK */ 1832 if (*nslots > 1 && lessok) { 1833 *nslots = 1; 1834 /* XXXMRG: ugh! blist should support this for us */ 1835 goto ReTry; 1836 } 1837 /* XXXMRG: END HACK */ 1838 1839 mutex_exit(&uvm_swap_data_lock); 1840 return 0; 1841 } 1842 1843 /* 1844 * uvm_swapisfull: return true if most of available swap is allocated 1845 * and in use. we don't count some small portion as it may be inaccessible 1846 * to us at any given moment, for example if there is lock contention or if 1847 * pages are busy. 1848 */ 1849 bool 1850 uvm_swapisfull(void) 1851 { 1852 int swpgonly; 1853 bool rv; 1854 1855 if (uvmexp.swpages == 0) { 1856 return true; 1857 } 1858 1859 mutex_enter(&uvm_swap_data_lock); 1860 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1861 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / 1862 uvm_swapisfull_factor); 1863 rv = (swpgonly >= uvmexp.swpgavail); 1864 mutex_exit(&uvm_swap_data_lock); 1865 1866 return (rv); 1867 } 1868 1869 /* 1870 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1871 * 1872 * => we lock uvm_swap_data_lock 1873 */ 1874 void 1875 uvm_swap_markbad(int startslot, int nslots) 1876 { 1877 struct swapdev *sdp; 1878 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1879 1880 mutex_enter(&uvm_swap_data_lock); 1881 sdp = swapdrum_getsdp(startslot); 1882 KASSERT(sdp != NULL); 1883 1884 /* 1885 * we just keep track of how many pages have been marked bad 1886 * in this device, to make everything add up in swap_off(). 1887 * we assume here that the range of slots will all be within 1888 * one swap device. 
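 *
 * (a worked example of the uvm_swapisfull() test above: with the
 * default uvm_swapisfull_factor of 99 and swpgavail = 100000, the
 * value swpgonly * 100 / 99 first reaches swpgavail when swpgonly
 * hits 99000, i.e. swap is reported full once roughly 99% of the
 * available pages hold the only copy of their data.)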
1889 */ 1890 1891 KASSERT(uvmexp.swpgonly >= nslots); 1892 atomic_add_int(&uvmexp.swpgonly, -nslots); 1893 sdp->swd_npgbad += nslots; 1894 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0); 1895 mutex_exit(&uvm_swap_data_lock); 1896 } 1897 1898 /* 1899 * uvm_swap_free: free swap slots 1900 * 1901 * => this can be all or part of an allocation made by uvm_swap_alloc 1902 * => we lock uvm_swap_data_lock 1903 */ 1904 void 1905 uvm_swap_free(int startslot, int nslots) 1906 { 1907 struct swapdev *sdp; 1908 UVMHIST_FUNC(__func__); 1909 UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots, 1910 startslot, 0, 0); 1911 1912 /* 1913 * ignore attempts to free the "bad" slot. 1914 */ 1915 1916 if (startslot == SWSLOT_BAD) { 1917 return; 1918 } 1919 1920 /* 1921 * convert drum slot offset back to sdp, free the blocks 1922 * in the extent, and return. must hold pri lock to do 1923 * lookup and access the extent. 1924 */ 1925 1926 mutex_enter(&uvm_swap_data_lock); 1927 sdp = swapdrum_getsdp(startslot); 1928 KASSERT(uvmexp.nswapdev >= 1); 1929 KASSERT(sdp != NULL); 1930 KASSERT(sdp->swd_npginuse >= nslots); 1931 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); 1932 sdp->swd_npginuse -= nslots; 1933 KASSERTMSG(uvmexp.swpginuse >= nslots, "swpginuse %d nslots %d", 1934 uvmexp.swpginuse, nslots); 1935 uvmexp.swpginuse -= nslots; 1936 mutex_exit(&uvm_swap_data_lock); 1937 } 1938 1939 /* 1940 * uvm_swap_put: put any number of pages into a contig place on swap 1941 * 1942 * => can be sync or async 1943 */ 1944 1945 int 1946 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1947 { 1948 int error; 1949 1950 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1951 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1952 return error; 1953 } 1954 1955 /* 1956 * uvm_swap_get: get a single page from swap 1957 * 1958 * => usually a sync op (from fault) 1959 */ 1960 1961 int 1962 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1963 { 1964 int error; 1965 1966 atomic_inc_uint(&uvmexp.nswget); 1967 KASSERT(flags & PGO_SYNCIO); 1968 if (swslot == SWSLOT_BAD) { 1969 return EIO; 1970 } 1971 1972 error = uvm_swap_io(&page, swslot, 1, B_READ | 1973 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1974 if (error == 0) { 1975 1976 /* 1977 * this page is no longer only in swap. 1978 */ 1979 1980 KASSERT(uvmexp.swpgonly > 0); 1981 atomic_dec_uint(&uvmexp.swpgonly); 1982 } 1983 return error; 1984 } 1985 1986 /* 1987 * uvm_swap_io: do an i/o operation to swap 1988 */ 1989 1990 static int 1991 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 1992 { 1993 daddr_t startblk; 1994 struct buf *bp; 1995 vaddr_t kva; 1996 int error, mapinflags; 1997 bool write, async, swap_encrypt; 1998 UVMHIST_FUNC(__func__); 1999 UVMHIST_CALLARGS(pdhist, 2000 "<- called, startslot=%jd, npages=%jd, flags=%#jx", 2001 startslot, npages, flags, 0); 2002 2003 write = (flags & B_READ) == 0; 2004 async = (flags & B_ASYNC) != 0; 2005 swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt); 2006 2007 /* 2008 * allocate a buf for the i/o. 
	 */

	KASSERT(curlwp != uvm.pagedaemon_lwp || write);
	KASSERT(curlwp != uvm.pagedaemon_lwp || async);
	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
	if (bp == NULL) {
		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
		return ENOMEM;
	}

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
	    UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	if (write && swap_encrypt)	/* need to encrypt in-place */
		mapinflags |= UVMPAGER_MAPIN_READ;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * encrypt writes in place if requested
	 */

	if (write) do {
		struct swapdev *sdp;
		int i;

		/*
		 * Get the swapdev so we can discriminate on the
		 * encryption state.  There may or may not be an
		 * encryption key generated; we may or may not be asked
		 * to encrypt swap.
		 *
		 * 1. NO KEY, NO ENCRYPTION: Nothing to do.
		 *
		 * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
		 *    and mark the slots encrypted.
		 *
		 * 3. KEY, BUT NO ENCRYPTION: The slots may already be
		 *    marked encrypted from a past life.  Mark them not
		 *    encrypted.
		 *
		 * 4. KEY, ENCRYPTION: Encrypt and mark the slots
		 *    encrypted.
		 */
		mutex_enter(&uvm_swap_data_lock);
		sdp = swapdrum_getsdp(startslot);
		if (!sdp->swd_encinit) {
			if (!swap_encrypt) {
				mutex_exit(&uvm_swap_data_lock);
				break;
			}
			uvm_swap_genkey(sdp);
		}
		KASSERT(sdp->swd_encinit);
		mutex_exit(&uvm_swap_data_lock);

		for (i = 0; i < npages; i++) {
			int s = startslot + i;
			KDASSERT(swapdrum_sdp_is(s, sdp));
			KASSERT(s >= sdp->swd_drumoffset);
			s -= sdp->swd_drumoffset;
			KASSERT(s < sdp->swd_drumsize);

			if (swap_encrypt) {
				uvm_swap_encryptpage(sdp,
				    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
				atomic_or_32(&sdp->swd_encmap[s/32],
				    __BIT(s%32));
			} else {
				atomic_and_32(&sdp->swd_encmap[s/32],
				    ~__BIT(s%32));
			}
		}
	} while (0);

	/*
	 * fill in the bp.  we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_flags = (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (void *)kva;
	bp->b_blkno = startblk;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		mutex_enter(swapdev_vp->v_interlock);
		swapdev_vp->v_numoutput++;
		mutex_exit(swapdev_vp->v_interlock);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_iodone = uvm_aio_aiodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
		if (curlwp == uvm.pagedaemon_lwp)
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	} else {
		bp->b_iodone = NULL;
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %#jx blkno = %#jx, bcount = %jd",
	    (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
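	 * once VOP_STRATEGY() has been called the buf belongs to the
	 * driver.  for async writes, completion runs through b_iodone
	 * (uvm_aio_aiodone), which is expected to unmap the pager
	 * mapping and release the buf, so on that path we must not
	 * touch bp or kva again after handing off the i/o.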
	 */

	VOP_STRATEGY(swapdev_vp, bp);
	if (async) {
		/*
		 * Reads are always synchronous; if this changes, we
		 * need to add an asynchronous path for decryption.
		 */
		KASSERT(write);
		return 0;
	}

	/*
	 * must be sync i/o.  wait for it to finish
	 */

	error = biowait(bp);
	if (error)
		goto out;

	/*
	 * decrypt reads in place if needed
	 */

	if (!write) do {
		struct swapdev *sdp;
		bool encinit;
		int i;

		/*
		 * Get the sdp.  Everything about it except the encinit
		 * bit, saying whether the encryption key is
		 * initialized or not, and the encrypted bit for each
		 * page, is stable until all swap pages have been
		 * released and the device is removed.
		 */
		mutex_enter(&uvm_swap_data_lock);
		sdp = swapdrum_getsdp(startslot);
		encinit = sdp->swd_encinit;
		mutex_exit(&uvm_swap_data_lock);

		if (!encinit)
			/*
			 * If there's no encryption key, there's no way
			 * any of these slots can be encrypted, so
			 * nothing to do here.
			 */
			break;
		for (i = 0; i < npages; i++) {
			int s = startslot + i;
			KDASSERT(swapdrum_sdp_is(s, sdp));
			KASSERT(s >= sdp->swd_drumoffset);
			s -= sdp->swd_drumoffset;
			KASSERT(s < sdp->swd_drumsize);
			if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
				__BIT(s%32)) == 0)
				continue;
			uvm_swap_decryptpage(sdp,
			    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
		}
	} while (0);
out:
	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	if (write) {
		mutex_enter(swapdev_vp->v_interlock);
		vwakeup(bp);
		mutex_exit(swapdev_vp->v_interlock);
	}
	putiobuf(bp);
	UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0);

	return (error);
}

/*
 * uvm_swap_genkey(sdp)
 *
 *	Generate a key for swap encryption.
 */
static void
uvm_swap_genkey(struct swapdev *sdp)
{
	uint8_t key[32];

	KASSERT(!sdp->swd_encinit);

	cprng_strong(kern_cprng, key, sizeof key, 0);
	aes_setenckey256(&sdp->swd_enckey, key);
	aes_setdeckey256(&sdp->swd_deckey, key);
	explicit_memset(key, 0, sizeof key);

	sdp->swd_encinit = true;
}

/*
 * uvm_swap_encryptpage(sdp, kva, slot)
 *
 *	Encrypt one page of data at kva for the specified slot number
 *	in the swap device.
 */
static void
uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
{
	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);

	/* iv := AES_k(le32enc(slot) || 0^96) */
	le32enc(preiv, slot);
	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);

	/* *kva := AES-CBC_k(iv, *kva) */
	aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
	    AES_256_NROUNDS);

	explicit_memset(&iv, 0, sizeof iv);
}

/*
 * uvm_swap_decryptpage(sdp, kva, slot)
 *
 *	Decrypt one page of data at kva for the specified slot number
 *	in the swap device.
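 *
 *	The IV is recomputed here from the slot number and the device
 *	key in the same way as in uvm_swap_encryptpage(), so no per-page
 *	IV needs to be stored alongside the swapped-out data.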
 */
static void
uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
{
	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);

	/* iv := AES_k(le32enc(slot) || 0^96) */
	le32enc(preiv, slot);
	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);

	/* *kva := AES-CBC^{-1}_k(iv, *kva) */
	aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
	    AES_256_NROUNDS);

	explicit_memset(&iv, 0, sizeof iv);
}

SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
	    SYSCTL_DESCR("Encrypt data when swapped out to disk"),
	    NULL, 0, &uvm_swap_encrypt, 0,
	    CTL_VM, CTL_CREATE, CTL_EOL);
}
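/*
 * Example usage (illustrative): the boolean node registered above
 * appears to userland as vm.swap_encrypt, so encryption of newly
 * swapped-out pages can be toggled at run time, e.g.
 *
 *	sysctl -w vm.swap_encrypt=1
 *
 * Only pages written to swap after the toggle are affected; the
 * per-slot bits in swd_encmap record which slots actually hold
 * ciphertext, and swap-ins consult that bitmap rather than the
 * current sysctl setting.
 */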