1 /* $NetBSD: uvm_swap.c,v 1.232 2026/05/03 16:02:37 thorpej Exp $ */ 2 3 /* 4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
27 * 28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp 29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.232 2026/05/03 16:02:37 thorpej Exp $"); 34 35 #include "opt_uvmhist.h" 36 #include "opt_compat_netbsd.h" 37 #include "opt_ddb.h" 38 #include "opt_vmswap.h" 39 40 #include <sys/param.h> 41 #include <sys/systm.h> 42 #include <sys/atomic.h> 43 #include <sys/buf.h> 44 #include <sys/bufq.h> 45 #include <sys/conf.h> 46 #include <sys/cprng.h> 47 #include <sys/proc.h> 48 #include <sys/namei.h> 49 #include <sys/disklabel.h> 50 #include <sys/errno.h> 51 #include <sys/kernel.h> 52 #include <sys/vnode.h> 53 #include <sys/file.h> 54 #include <sys/vmem.h> 55 #include <sys/blist.h> 56 #include <sys/mount.h> 57 #include <sys/pool.h> 58 #include <sys/kmem.h> 59 #include <sys/syscallargs.h> 60 #include <sys/swap.h> 61 #include <sys/kauth.h> 62 #include <sys/sysctl.h> 63 #include <sys/workqueue.h> 64 65 #include <uvm/uvm.h> 66 67 #include <miscfs/specfs/specdev.h> 68 69 #include <crypto/aes/aes.h> 70 #include <crypto/aes/aes_cbc.h> 71 72 /* 73 * uvm_swap.c: manage configuration and i/o to swap space. 74 */ 75 76 /* 77 * swap space is managed in the following way: 78 * 79 * each swap partition or file is described by a "swapdev" structure. 80 * each "swapdev" structure contains a "swapent" structure which contains 81 * information that is passed up to the user (via system calls). 82 * 83 * each swap partition is assigned a "priority" (int) which controls 84 * swap partition usage. 85 * 86 * the system maintains a global data structure describing all swap 87 * partitions/files. there is a sorted LIST of "swappri" structures 88 * which describe "swapdev"'s at that priority. this LIST is headed 89 * by the "swap_priority" global var. each "swappri" contains a 90 * TAILQ of "swapdev" structures at that priority. 
91 * 92 * locking: 93 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl 94 * system call and prevents the swap priority list from changing 95 * while we are in the middle of a system call (e.g. SWAP_STATS). 96 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data 97 * structures including the priority list, the swapdev structures, 98 * and the swapmap arena. 99 * 100 * each swap device has the following info: 101 * - swap device in use (could be disabled, preventing future use) 102 * - swap enabled (allows new allocations on swap) 103 * - map info in /dev/drum 104 * - vnode pointer 105 * for swap files only: 106 * - block size 107 * - max byte count in buffer 108 * - buffer 109 * 110 * userland controls and configures swap with the swapctl(2) system call. 111 * the sys_swapctl performs the following operations: 112 * [1] SWAP_NSWAP: returns the number of swap devices currently configured 113 * [2] SWAP_STATS: given a pointer to an array of swapent structures 114 * (passed in via "arg") of a size passed in via "misc" ... we load 115 * the current swap config into the array. The actual work is done 116 * in the uvm_swap_stats() function. 117 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a 118 * priority in "misc", start swapping on it. 
*	[4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *	[5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	    "misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 *
 * NOTE(review): "swd_inuse" and "swd_mapsize" are not members of the
 * structure below; the note above looks historical.  Presumably the
 * page-based equivalent is
 *	swd_npginuse <= swd_npages <= swd_drumsize
 * (swap_on() establishes swd_npages <= swd_drumsize) -- confirm before
 * relying on it.
 */
struct swapdev {
	dev_t			swd_dev;	/* device id */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname
						   (SWAP_ON stores strlen + 1,
						   i.e. including the NUL) */
	int			swd_npages;	/* #pages we can use
						   (excludes skipped label page
						   and preserved miniroot) */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum
						   (whole device, in pages) */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	struct aesenc		swd_enckey;	/* AES key expanded for enc */
	struct aesdec		swd_deckey;	/* AES key expanded for dec */
	bool			swd_encinit;	/* true if keys initialized */

	/*
	 * the following members are only used for swap on VREG file.
	 * swd_lock protects swd_active and swd_tab.
	 */
	kmutex_t		swd_lock;
	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
*/
struct swappri {
	int			spi_priority;     /* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
};

/*
 * local variables
 */
static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;
/* set to true at the end of uvm_swap_init() */
bool uvm_swap_init_done = false;

/*
 * workqueue and use counter for swap to regular files
 * (swap_on() updates these under swap_syscall_lock)
 */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;
/* default for swap encryption; plaintext only if the kernel option says so */
#if VMSWAP_DEFAULT_PLAINTEXT
bool uvm_swap_encrypt = false;
#else
bool uvm_swap_encrypt = true;
#endif

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

static void uvm_swap_genkey(struct swapdev *);
static void uvm_swap_encryptpage(struct swapdev *, void *, int);
static void uvm_swap_decryptpage(struct swapdev *, void *, int);

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
226 * are brought up (which happens after uvm_init()) 227 */ 228 void 229 uvm_swap_init(void) 230 { 231 UVMHIST_FUNC(__func__); 232 233 UVMHIST_CALLED(pdhist); 234 /* 235 * first, init the swap list, its counter, and its lock. 236 * then get a handle on the vnode for /dev/drum by using 237 * the its dev_t number ("swapdev", from MD conf.c). 238 */ 239 240 LIST_INIT(&swap_priority); 241 uvmexp.nswapdev = 0; 242 rw_init(&swap_syscall_lock); 243 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE); 244 245 if (bdevvp(swapdev, &swapdev_vp)) 246 panic("%s: can't get vnode for swap device", __func__); 247 if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY)) 248 panic("%s: can't lock swap device", __func__); 249 if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED)) 250 panic("%s: can't open swap device", __func__); 251 VOP_UNLOCK(swapdev_vp); 252 253 /* 254 * create swap block resource map to map /dev/drum. the range 255 * from 1 to INT_MAX allows 2 gigablocks of swap space. note 256 * that block 0 is reserved (used to indicate an allocation 257 * failure, or no allocation). 258 */ 259 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0, 260 VM_NOSLEEP, IPL_NONE); 261 if (swapmap == 0) { 262 panic("%s: vmem_create failed", __func__); 263 } 264 265 uvm_swap_init_done = true; 266 267 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0); 268 } 269 270 /* 271 * swaplist functions: functions that operate on the list of swap 272 * devices on the system. 273 */ 274 275 /* 276 * swaplist_insert: insert swap device "sdp" into the global list 277 * 278 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 279 * => caller must provide a newly allocated swappri structure (we will 280 * FREE it if we don't need it... 
this it to prevent allocation 281 * blocking here while adding swap) 282 */ 283 static void 284 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) 285 { 286 struct swappri *spp, *pspp; 287 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 288 289 KASSERT(rw_write_held(&swap_syscall_lock)); 290 KASSERT(mutex_owned(&uvm_swap_data_lock)); 291 292 if (LIST_EMPTY(&swap_priority)) { 293 KASSERT(uvmexp.swpginuse == 0); 294 KASSERT(uvmexp.swpgonly == 0); 295 KASSERT(uvmexp.swpages == 0); 296 KASSERT(uvmexp.swpgavail == 0); 297 } 298 299 /* 300 * find entry at or after which to insert the new device. 301 */ 302 pspp = NULL; 303 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 304 if (priority <= spp->spi_priority) 305 break; 306 pspp = spp; 307 } 308 309 /* 310 * new priority? 311 */ 312 if (spp == NULL || spp->spi_priority != priority) { 313 spp = newspp; /* use newspp! */ 314 UVMHIST_LOG(pdhist, "created new swappri = %jd", 315 priority, 0, 0, 0); 316 317 spp->spi_priority = priority; 318 TAILQ_INIT(&spp->spi_swapdev); 319 320 if (pspp) 321 LIST_INSERT_AFTER(pspp, spp, spi_swappri); 322 else 323 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); 324 } else { 325 /* we don't need a new priority structure, free it */ 326 kmem_free(newspp, sizeof(*newspp)); 327 } 328 329 /* 330 * priority found (or created). now insert on the priority's 331 * tailq list and bump the total number of swapdevs. 332 */ 333 sdp->swd_priority = priority; 334 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 335 uvmexp.nswapdev++; 336 } 337 338 /* 339 * swaplist_find: find and optionally remove a swap device from the 340 * global list. 341 * 342 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 343 * => we return the swapdev we found (and removed) 344 */ 345 static struct swapdev * 346 swaplist_find(struct vnode *vp, bool remove) 347 { 348 struct swapdev *sdp; 349 struct swappri *spp; 350 351 KASSERT(rw_lock_held(&swap_syscall_lock)); 352 KASSERT(remove ? 
rw_write_held(&swap_syscall_lock) : 1); 353 KASSERT(mutex_owned(&uvm_swap_data_lock)); 354 355 /* 356 * search the lists for the requested vp 357 */ 358 359 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 360 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 361 if (sdp->swd_vp == vp) { 362 if (remove) { 363 TAILQ_REMOVE(&spp->spi_swapdev, 364 sdp, swd_next); 365 uvmexp.nswapdev--; 366 } 367 return(sdp); 368 } 369 } 370 } 371 return (NULL); 372 } 373 374 /* 375 * swaplist_trim: scan priority list for empty priority entries and kill 376 * them. 377 * 378 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 379 */ 380 static void 381 swaplist_trim(void) 382 { 383 struct swappri *spp, *nextspp; 384 385 KASSERT(rw_write_held(&swap_syscall_lock)); 386 KASSERT(mutex_owned(&uvm_swap_data_lock)); 387 388 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) { 389 if (!TAILQ_EMPTY(&spp->spi_swapdev)) 390 continue; 391 LIST_REMOVE(spp, spi_swappri); 392 kmem_free(spp, sizeof(*spp)); 393 } 394 395 if (LIST_EMPTY(&swap_priority)) { 396 KASSERT(uvmexp.swpginuse == 0); 397 KASSERT(uvmexp.swpgonly == 0); 398 KASSERT(uvmexp.swpages == 0); 399 KASSERT(uvmexp.swpgavail == 0); 400 } 401 } 402 403 /* 404 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back 405 * to the "swapdev" that maps that section of the drum. 
406 * 407 * => each swapdev takes one big contig chunk of the drum 408 * => caller must hold uvm_swap_data_lock 409 */ 410 static struct swapdev * 411 swapdrum_getsdp(int pgno) 412 { 413 struct swapdev *sdp; 414 struct swappri *spp; 415 416 KASSERT(mutex_owned(&uvm_swap_data_lock)); 417 418 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 419 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 420 if (sdp->swd_flags & SWF_FAKE) 421 continue; 422 if (pgno >= sdp->swd_drumoffset && 423 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 424 return sdp; 425 } 426 } 427 } 428 return NULL; 429 } 430 431 /* 432 * swapdrum_sdp_is: true iff the swap device for pgno is sdp 433 * 434 * => for use in positive assertions only; result is not stable 435 */ 436 static bool __debugused 437 swapdrum_sdp_is(int pgno, struct swapdev *sdp) 438 { 439 bool result; 440 441 mutex_enter(&uvm_swap_data_lock); 442 result = swapdrum_getsdp(pgno) == sdp; 443 mutex_exit(&uvm_swap_data_lock); 444 445 return result; 446 } 447 448 void swapsys_lock(krw_t op) 449 { 450 rw_enter(&swap_syscall_lock, op); 451 } 452 453 void swapsys_unlock(void) 454 { 455 rw_exit(&swap_syscall_lock); 456 } 457 458 static void 459 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse) 460 { 461 se->se_dev = sdp->swd_dev; 462 se->se_flags = sdp->swd_flags; 463 se->se_nblks = sdp->swd_nblks; 464 se->se_npgbad = sdp->swd_npgbad; 465 se->se_inuse = inuse; 466 se->se_priority = sdp->swd_priority; 467 KASSERT(sdp->swd_pathlen < sizeof(se->se_path)); 468 strcpy(se->se_path, sdp->swd_path); 469 } 470 471 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) = 472 (void *)enosys; 473 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) = 474 (void *)enosys; 475 int (*uvm_swap_stats110)(const struct sys_swapctl_args *, register_t *) = 476 (void *)enosys; 477 478 /* 479 * sys_swapctl: main entry point for swapctl(2) system call 480 * [with three helper functions: swap_on, swap_off 
and uvm_swap_stats]
 */
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char	*userpath;
	size_t	len = 0;	/* length of userpath incl. NUL (SWAP_ON only) */
	int	error;
	int	priority;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		const int nswapdev = uvmexp.nswapdev;
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
		    0, 0, 0);
		*retval = nswapdev;
		return 0;
	}

	/* scratch copy of the pathname; freed at "out" on every path */
	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	switch (SCARG(uap, cmd)) {
	case SWAP_STATS13:
		error = (*uvm_swap_stats13)(uap, retval);
		goto out;
	case SWAP_STATS50:
		error = (*uvm_swap_stats50)(uap, retval);
		goto out;
	case SWAP_STATS110:
		error = (*uvm_swap_stats110)(uap, retval);
		goto out;
	case SWAP_STATS:
		error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
		    NULL, sizeof(struct swapent), retval);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;

	case SWAP_GETDUMPDEV:
		error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
		goto out;
	default:
		break;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		dumpcdev = NODEV;
		cpu_dumpconf();
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		vref(vp);
		if (vn_lock(vp, LK_EXCLUSIVE)) {
			vrele(vp);
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		struct pathbuf *pb;

		/*
		 * This used to allow copying in one extra byte
		 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
		 * This was completely pointless because if anyone
		 * used that extra byte namei would fail with
		 * ENAMETOOLONG anyway, so I've removed the excess
		 * logic. - dholland 20100215
		 */

		error = pathbuf_copyin(SCARG(uap, arg), &pb);
		if (error) {
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON) {
			/* get a copy of the string */
			pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
			len = strlen(userpath) + 1;
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
		if ((error = namei(&nd))) {
			pathbuf_destroy(pb);
			goto out;
		}
		vp = nd.ni_vp;
		pathbuf_destroy(pb);
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		/*
		 * NOTE(review): when the lookup fails only dumpdev is
		 * reset; dumpcdev keeps its previous value, whereas
		 * SWAP_DUMPOFF above resets both -- confirm intended.
		 */
		if (bdevsw_lookup(vp->v_rdev)) {
			dumpdev = vp->v_rdev;
			dumpcdev = devsw_blk2chr(dumpdev);
		} else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		/* allocate before taking the mutex; KM_SLEEP may block */
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			/* swaplist_insert() consumes or frees spp */
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			kmem_free(spp, sizeof(*spp));
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		mutex_init(&sdp->swd_lock, MUTEX_DEFAULT, IPL_NONE);
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			/* already configured: undo everything set up above */
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			mutex_destroy(&sdp->swd_lock);
			kmem_free(sdp, sizeof(*sdp));
			kmem_free(spp, sizeof(*spp));
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		/* keep a private copy of the pathname in the swapdev */
		KASSERT(len > 0);
		sdp->swd_pathlen = len;
		sdp->swd_path = kmem_alloc(len, KM_SLEEP);
		if (copystr(userpath, sdp->swd_path, len, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp->swd_path, sdp->swd_pathlen);
			mutex_destroy(&sdp->swd_lock);
			kmem_free(sdp, sizeof(*sdp));
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 *
		 * NOTE(review): the test below only rejects the device
		 * when NEITHER flag is set; a device with just one of
		 * the two flags still proceeds -- confirm intended.
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 * (swap_off() is entered with uvm_swap_data_lock held
		 * and releases it itself.)
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);
out:
	rw_exit(&swap_syscall_lock);
	kmem_free(userpath, SWAP_PATH_MAX);

	UVMHIST_LOG(pdhist, "<- done!  error=%jd", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time. Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 *
 * => ptr:  user address of the output array
 * => misc: maximum number of entries the caller wants
 * => f:    optional in-place conversion hook applied to each entry
 *          before it is copied out (NULL for the native format)
 * => len:  number of bytes copied out per entry, and the stride of
 *          the output array; must be <= sizeof(struct swapent)
 * => on success *retval is the number of entries copied out
 * => swap_syscall_lock must be held on entry (unless we return early);
 *    it may be dropped and re-taken as a reader in here
 */
int
uvm_swap_stats(char *ptr, int misc,
    void (*f)(void *, const struct swapent *), size_t len,
    register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp, **sdps, **sp;
	struct swapent sep;
	size_t sdpsize = 0;
	struct swapdev *stackbuf[8];	/* magic 8, any number >1 will do */
	int count, slots;
	int error;

	KASSERT(len <= sizeof(sep));
	if (len == 0)
		return ENOSYS;

	if (misc < 0)
		return EINVAL;

	if (misc == 0 || uvmexp.nswapdev == 0)
		return 0;

	KASSERT(rw_lock_held(&swap_syscall_lock));

	/*
	 * Allocate space (slots) for pointers to all swapdevs
	 *
	 * This needs to be done here (not earlier) (and so needs
	 * the unlock/lock dance) because of the way the various
	 * compat functions work.
	 */
	sdps = NULL;
	slots = uvmexp.nswapdev;

	if (slots > misc)	/* we never need more than requested */
		slots = misc;

	/*
	 * Nb: do not limit misc to <= uvmexp.nswapdev yet,
	 * as the latter might get bigger (or smaller)
	 */

	if ((SIZE_T_MAX / sizeof sdp) <= misc)		/* unlikely */
		return E2BIG;

	/*
	 * One slot for each currently existing swap device, but
	 * limited (above) to no more than the request wants (misc).
	 * Each slot needs space for a pointer to a swapdev.
	 */
	sdpsize = (size_t)slots * sizeof sdp;

	/*
	 * Borrow from kmem_tmpbuf_alloc(9) but don't use that
	 * so we don't need to do the unlock dance unnecessarily
	 */
	if (sdpsize <= sizeof stackbuf) {
		/* Should be the common case */
		sdps = stackbuf;
	} else {
		rw_exit(&swap_syscall_lock);

		sdps = kmem_alloc(sdpsize, KM_SLEEP);

		rw_enter(&swap_syscall_lock, RW_READER);

		/*
		 * At this point, 3 possibilities.
		 *
		 * 1. uvmexp.nswapdev has increased.
		 *
		 * A new swap device got added.  That's OK, just ignore the
		 * excess device(s), and return the first N (the number that
		 * were there when we started).
		 *
		 * 2. uvmexp.nswapdev has decreased.
		 *
		 * A swap device was deleted.  In this case we will return
		 * less devices than requested but that's OK.  We will have
		 * more slot memory than is needed to save them all, but just
		 * a little more, and it gets freed just below.
		 *
		 * 3. uvmexp.nswapdev hasn't changed.
		 *
		 * This will be the usual case; no swapctl operations occurred
		 * while the lock was released, or possibly a device was
		 * deleted and another added - that's irrelevant.  At this
		 * point all that matters is the number of devices, we haven't
		 * looked at the lists yet.
		 *
		 * So we never need to adjust this allocation.
		 *
		 * And we don't need to look at uvmexp.nswapdev again!
		 */
	}

	KASSERT(rw_lock_held(&swap_syscall_lock));

	/*
	 * Collect all of the swap descriptors, while holding the data lock,
	 * so the lists cannot change.   Then they can be used safely.
	 *
	 * Entries cannot be deleted, because swap_syscall_lock is held,
	 * but the lists holding them can be reordered except in this small
	 * loop where we lock out that kind of activity.   No processing
	 * happens here, this is fast, with no func calls, or anything which
	 * might perform operations which might need the lock.
	 */
	mutex_enter(&uvm_swap_data_lock);
	sp = sdps;
	count = 0;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (++count <= slots)
				*sp++ = sdp;
			/*
			 * don't bother with exiting the loops early,
			 * the lists tend to be very short, and not
			 * exhausting them is a very rare occurrence.
			 * So just loop and do nothing (but count) in
			 * the odd case we could have broken out early.
			 */
		}
	}
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * Now we have a stable list of devices which cannot change,
	 * even if the swapping lists are reordered.
	 */

	if (misc > slots)		/* the number of storage slots */
		misc = slots;
	if (misc > count)		/* the number of devices now */
		misc = count;

	/*
	 * This is the actual work of uvm_swap_stats() - above was bookkeeping.
	 */
	error = 0;
	count = 0;
	sp = sdps;
	while (misc-- > 0) {
		int inuse;

		sdp = *sp++;	/* The next swapdev, from the next slot */

		/* pages in use, expressed in disk blocks */
		inuse = btodb((uint64_t)sdp->swd_npginuse <<
		    PAGE_SHIFT);

		/* zero first so no stale stack bytes are copied out */
		memset(&sep, 0, sizeof(sep));
		swapent_cvt(&sep, sdp, inuse);
		if (f)
			(*f)(&sep, &sep);
		if ((error = copyout(&sep, ptr, len)) != 0)
			goto out;
		ptr += len;
		count++;
	}
	*retval = count;
 out:;
	if (sdps != stackbuf) {
		/*
		 * XXX should unlock & lock again here probably,
		 * but for now, no...
		 */
		kmem_free(sdps, sdpsize);
	}
	return error;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
* => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 * => returns 0 on success (SWF_FAKE cleared, SWF_INUSE|SWF_ENABLE set),
 *	or an errno after closing the vnode again
 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	vmem_addr_t result;
	struct vattr va;
	dev_t dev;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblocks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if ((nblocks = bdev_size(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
		if (vp->v_tag == VT_NFS)
			sdp->swd_maxactive = 2; /* XXX */
		else
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		/*
		 * NOTE(review): the message text says "<= 1" while the
		 * test is "< 1"; left as is since it is a runtime string.
		 */
		UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist," dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * mark the keys uninitialized so we generate them lazily.
	 *
	 * we defer the key generation to help to maximize the amount
	 * of data fed into the entropy pool before generating a key,
	 * for the benefit of machines without HWRNG.
	 */
	sdp->swd_encinit = false;

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
			(rootblocks >> 6) +
			(rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		/* reserve the miniroot's pages so they are never handed out */
		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
	if (error != 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap create the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_SOFTBIO,
			    WQ_MPSAFE) != 0)
				panic("%s: workqueue_create failed", __func__);
		}
	}

	/*
	 * record the drum placement, then publish the device: the flag
	 * change and the global page counters are updated together
	 * under uvm_swap_data_lock.
	 */
	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	/*
	 * every "goto bad" above precedes blist_create(); sdp comes
	 * zero-filled from the SWAP_ON caller, so this is only a guard.
	 */
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
	}
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	UVMHIST_FUNC(__func__);
	UVMHIST_CALLARGS(pdhist,
	    " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0);

	KASSERT(rw_write_held(&swap_syscall_lock));
	KASSERT(mutex_owned(&uvm_swap_data_lock));

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	uvmexp.swpgavail -= npages;
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
1187 */ 1188 1189 if (uao_swap_off(sdp->swd_drumoffset, 1190 sdp->swd_drumoffset + sdp->swd_drumsize) || 1191 amap_swap_off(sdp->swd_drumoffset, 1192 sdp->swd_drumoffset + sdp->swd_drumsize)) { 1193 error = ENOMEM; 1194 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 1195 error = EBUSY; 1196 } 1197 1198 if (error) { 1199 mutex_enter(&uvm_swap_data_lock); 1200 sdp->swd_flags |= SWF_ENABLE; 1201 uvmexp.swpgavail += npages; 1202 mutex_exit(&uvm_swap_data_lock); 1203 1204 return error; 1205 } 1206 1207 /* 1208 * If this is the last regular swap destroy the workqueue. 1209 * => Protected by swap_syscall_lock. 1210 */ 1211 if (sdp->swd_vp->v_type != VBLK) { 1212 KASSERT(sw_reg_count > 0); 1213 KASSERT(sw_reg_workqueue != NULL); 1214 if (--sw_reg_count == 0) { 1215 workqueue_destroy(sw_reg_workqueue); 1216 sw_reg_workqueue = NULL; 1217 } 1218 } 1219 1220 /* 1221 * done with the vnode. 1222 * drop our ref on the vnode before calling VOP_CLOSE() 1223 * so that spec_close() can tell if this is the last close. 1224 */ 1225 vrele(sdp->swd_vp); 1226 if (sdp->swd_vp != rootvp) { 1227 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); 1228 } 1229 1230 mutex_enter(&uvm_swap_data_lock); 1231 uvmexp.swpages -= npages; 1232 KASSERTMSG(uvmexp.swpginuse >= sdp->swd_npgbad, 1233 "swpginuse %d sdp->swd_npgbad %d", 1234 uvmexp.swpginuse, sdp->swd_npgbad); 1235 uvmexp.swpginuse -= sdp->swd_npgbad; 1236 1237 if (swaplist_find(sdp->swd_vp, true) == NULL) 1238 panic("%s: swapdev not in list", __func__); 1239 swaplist_trim(); 1240 mutex_exit(&uvm_swap_data_lock); 1241 1242 /* 1243 * free all resources! 
1244 */ 1245 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); 1246 blist_destroy(sdp->swd_blist); 1247 bufq_free(sdp->swd_tab); 1248 explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey); 1249 explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey); 1250 mutex_destroy(&sdp->swd_lock); 1251 kmem_free(sdp, sizeof(*sdp)); 1252 return (0); 1253 } 1254 1255 void 1256 uvm_swap_shutdown(struct lwp *l) 1257 { 1258 struct swapdev *sdp; 1259 struct swappri *spp; 1260 struct vnode *vp; 1261 int error; 1262 1263 if (!uvm_swap_init_done || uvmexp.nswapdev == 0) 1264 return; 1265 printf("turning off swap..."); 1266 rw_enter(&swap_syscall_lock, RW_WRITER); 1267 mutex_enter(&uvm_swap_data_lock); 1268 again: 1269 LIST_FOREACH(spp, &swap_priority, spi_swappri) 1270 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1271 if (sdp->swd_flags & SWF_FAKE) 1272 continue; 1273 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) 1274 continue; 1275 #ifdef DEBUG 1276 printf("\nturning off swap on %s...", sdp->swd_path); 1277 #endif 1278 /* Have to lock and reference vnode for swap_off(). */ 1279 vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY); 1280 vref(vp); 1281 error = swap_off(l, sdp); 1282 vput(vp); 1283 mutex_enter(&uvm_swap_data_lock); 1284 if (error) { 1285 printf("stopping swap on %s failed " 1286 "with error %d\n", sdp->swd_path, error); 1287 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1288 uvmexp.nswapdev--; 1289 swaplist_trim(); 1290 } 1291 goto again; 1292 } 1293 printf(" done\n"); 1294 mutex_exit(&uvm_swap_data_lock); 1295 rw_exit(&swap_syscall_lock); 1296 } 1297 1298 1299 /* 1300 * /dev/drum interface and i/o functions 1301 */ 1302 1303 /* 1304 * swopen: allow the initial open from uvm_swap_init() and reject all others. 
1305 */ 1306 1307 static int 1308 swopen(dev_t dev, int flag, int mode, struct lwp *l) 1309 { 1310 static bool inited = false; 1311 1312 if (!inited) { 1313 inited = true; 1314 return 0; 1315 } 1316 return ENODEV; 1317 } 1318 1319 static void 1320 iobuf_redirect(struct buf *bp, struct vnode *vp) 1321 { 1322 if ((bp->b_flags & B_READ) == 0) { 1323 mutex_enter(bp->b_objlock); 1324 vwakeup(bp); 1325 mutex_exit(bp->b_objlock); 1326 mutex_enter(vp->v_interlock); 1327 vp->v_numoutput++; 1328 mutex_exit(vp->v_interlock); 1329 } 1330 1331 /* 1332 * finally plug in swapdev vnode and start I/O 1333 */ 1334 bp->b_vp = vp; 1335 bp->b_objlock = vp->v_interlock; 1336 } 1337 1338 struct sw_physio_decrypt_context { 1339 void *orig_buf; 1340 void *orig_private; 1341 void (*orig_iodone)(struct buf *); 1342 int swslot; 1343 }; 1344 1345 static void 1346 sw_physio_decrypt_iodone(struct buf *bp) 1347 { 1348 struct sw_physio_decrypt_context *ctx = bp->b_private; 1349 void (*cb)(struct buf *bp) = ctx->orig_iodone; 1350 size_t npages = bp->b_bcount >> PAGE_SHIFT; 1351 1352 KASSERT(ctx->swslot > 0); 1353 KASSERT(npages << PAGE_SHIFT == bp->b_bcount); 1354 if (bp->b_error == 0) { 1355 if (bp->b_resid == 0) { 1356 uvm_swap_decrypt_pages(ctx->swslot, bp->b_data, 1357 npages); 1358 memcpy(ctx->orig_buf, (uint8_t *)bp->b_data, 1359 bp->b_bcount); 1360 } else { 1361 bp->b_error = EIO; 1362 } 1363 } 1364 kmem_intr_free(bp->b_data, bp->b_bcount); 1365 bp->b_data = ctx->orig_buf; 1366 if (bp->b_error != 0) { 1367 bp->b_resid = bp->b_bcount; 1368 } 1369 bp->b_private = ctx->orig_private; 1370 kmem_intr_free(ctx, sizeof(*ctx)); 1371 (cb)(bp); /* call the original b_iodone callback */ 1372 } 1373 1374 /* 1375 * swstrategy: perform I/O on the drum 1376 * 1377 * => we must map the i/o request from the drum to the correct swapdev. 
1378 */ 1379 static void 1380 swstrategy(struct buf *bp) 1381 { 1382 struct swapdev *sdp; 1383 struct vnode *vp; 1384 int pageno, bn; 1385 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1386 1387 KASSERT((bp->b_flags & B_RAW) == 0 || 1388 rw_read_held(&swap_syscall_lock)); 1389 1390 /* 1391 * reject non page aligned i/o. 1392 */ 1393 if ((dbtob((int64_t)bp->b_blkno) & PAGE_MASK) != 0 || 1394 (bp->b_bcount & PAGE_MASK) != 0) { 1395 bp->b_error = ENOTSUP; 1396 bp->b_resid = bp->b_bcount; 1397 biodone(bp); 1398 return; 1399 } 1400 1401 /* 1402 * convert block number to swapdev. note that swapdev can't 1403 * be yanked out from under us because we are holding resources 1404 * in it (i.e. the blocks we are doing I/O on) or read lock on 1405 * swap_syscall_lock. 1406 */ 1407 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1408 mutex_enter(&uvm_swap_data_lock); 1409 sdp = swapdrum_getsdp(pageno); 1410 KASSERT((bp->b_flags & B_RAW) != 0 || 1411 (sdp != NULL && sdp->swd_npginuse > 0)); 1412 mutex_exit(&uvm_swap_data_lock); 1413 if (sdp == NULL) { 1414 bp->b_error = EINVAL; 1415 bp->b_resid = bp->b_bcount; 1416 biodone(bp); 1417 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); 1418 return; 1419 } 1420 1421 /* 1422 * B_RAW here implies user i/o on /dev/drum, for which we need 1423 * to handle encryption/decryption here. 1424 * for swap in/out, it's handled by the caller. 1425 */ 1426 if ((bp->b_flags & B_RAW) != 0 && 1427 atomic_load_relaxed(&uvm_swap_encrypt)) { 1428 struct sw_physio_decrypt_context *ctx; 1429 1430 /* 1431 * we only implement B_READ for now. 1432 * 1433 * REVISIT: what kind of apps needs to write to /dev/drum? 1434 */ 1435 if ((bp->b_flags & B_READ) == 0) { 1436 bp->b_error = ENOTSUP; 1437 bp->b_resid = bp->b_bcount; 1438 biodone(bp); 1439 return; 1440 } 1441 1442 /* 1443 * in-place decryption in the userland buffer might 1444 * have non-trivial implications. for simplicity, 1445 * we use a bounce buffer. 
1446 */ 1447 ctx = kmem_intr_alloc(sizeof(*ctx), KM_SLEEP); 1448 ctx->swslot = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1449 KASSERT(ctx->swslot > 0); 1450 ctx->orig_buf = bp->b_data; 1451 ctx->orig_private = bp->b_private; 1452 ctx->orig_iodone = bp->b_iodone; 1453 bp->b_data = kmem_intr_alloc(bp->b_bcount, KM_SLEEP); 1454 bp->b_private = ctx; 1455 bp->b_iodone = sw_physio_decrypt_iodone; 1456 } 1457 1458 /* 1459 * convert drum page number to block number on this swapdev. 1460 */ 1461 1462 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1463 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1464 1465 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd", 1466 ((bp->b_flags & B_READ) == 0) ? 1 : 0, 1467 sdp->swd_drumoffset, bn, bp->b_bcount); 1468 1469 /* 1470 * for block devices we finish up here. 1471 * for regular files we have to do more work which we delegate 1472 * to sw_reg_strategy(). 1473 */ 1474 1475 vp = sdp->swd_vp; /* swapdev vnode pointer */ 1476 switch (vp->v_type) { 1477 default: 1478 panic("%s: vnode type 0x%x", __func__, vp->v_type); 1479 1480 case VBLK: 1481 1482 /* 1483 * must convert "bp" from an I/O on /dev/drum to an I/O 1484 * on the swapdev (sdp). 1485 * 1486 * if we are doing a write, we have to redirect the i/o on 1487 * drum's v_numoutput counter to the swapdev's. 1488 */ 1489 iobuf_redirect(bp, vp); 1490 bp->b_blkno = bn; /* swapdev block number */ 1491 VOP_STRATEGY(vp, bp); 1492 return; 1493 1494 case VREG: 1495 /* 1496 * delegate to sw_reg_strategy function. 
1497 */ 1498 sw_reg_strategy(sdp, bp, bn); 1499 return; 1500 } 1501 /* NOTREACHED */ 1502 } 1503 1504 /* 1505 * swread: the read function for the drum (just a call to physio) 1506 */ 1507 /*ARGSUSED*/ 1508 static int 1509 swread(dev_t dev, struct uio *uio, int ioflag) 1510 { 1511 int ret; 1512 1513 UVMHIST_FUNC(__func__); 1514 UVMHIST_CALLARGS(pdhist, 1515 " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1516 1517 rw_enter(&swap_syscall_lock, RW_READER); 1518 ret = physio(swstrategy, NULL, dev, B_READ, minphys, uio); 1519 rw_exit(&swap_syscall_lock); 1520 return ret; 1521 } 1522 1523 /* 1524 * swwrite: the write function for the drum (just a call to physio) 1525 */ 1526 /*ARGSUSED*/ 1527 static int 1528 swwrite(dev_t dev, struct uio *uio, int ioflag) 1529 { 1530 int ret; 1531 1532 UVMHIST_FUNC(__func__); 1533 UVMHIST_CALLARGS(pdhist, 1534 " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0); 1535 1536 rw_enter(&swap_syscall_lock, RW_READER); 1537 ret = physio(swstrategy, NULL, dev, B_WRITE, minphys, uio); 1538 rw_exit(&swap_syscall_lock); 1539 return ret; 1540 } 1541 1542 const struct bdevsw swap_bdevsw = { 1543 .d_open = swopen, 1544 .d_close = noclose, 1545 .d_strategy = swstrategy, 1546 .d_ioctl = noioctl, 1547 .d_dump = nodump, 1548 .d_psize = nosize, 1549 .d_discard = nodiscard, 1550 .d_flag = D_OTHER | D_MPSAFE, 1551 }; 1552 1553 const struct cdevsw swap_cdevsw = { 1554 .d_open = nullopen, 1555 .d_close = nullclose, 1556 .d_read = swread, 1557 .d_write = swwrite, 1558 .d_ioctl = noioctl, 1559 .d_stop = nostop, 1560 .d_tty = notty, 1561 .d_poll = nopoll, 1562 .d_mmap = nommap, 1563 .d_kqfilter = nokqfilter, 1564 .d_discard = nodiscard, 1565 .d_flag = D_OTHER | D_MPSAFE, 1566 }; 1567 1568 /* 1569 * sw_reg_strategy: handle swap i/o to regular files 1570 */ 1571 static void 1572 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1573 { 1574 struct vnode *devvp; 1575 daddr_t nbn; 1576 off_t byteoff; 1577 int offset; 1578 int off, nra, error, sz, 
resid; 1579 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1580 1581 /* 1582 * setup for main loop where we read filesystem blocks into 1583 * our buffer. 1584 */ 1585 error = 0; 1586 bp->b_resid = bp->b_bcount; /* nothing transferred yet! */ 1587 byteoff = dbtob((uint64_t)bn); 1588 1589 offset = 0; 1590 for (resid = bp->b_resid; resid; resid -= sz) { 1591 struct buf *nbp; 1592 1593 /* 1594 * translate byteoffset into block number. return values: 1595 * devvp = vnode of underlying device 1596 * nbn = new block number (on underlying vnode dev) 1597 * nra = num blocks we can read-ahead (excludes requested 1598 * block) 1599 */ 1600 nra = 0; 1601 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, 1602 &devvp, &nbn, &nra); 1603 1604 if (error == 0 && nbn == (daddr_t)-1) { 1605 error = EIO; /* failure */ 1606 } 1607 1608 /* 1609 * punt if there was an error or a hole in the file. 1610 * we must wait for any i/o ops we have already started 1611 * to finish before returning. 1612 * 1613 * XXX we could deal with holes here but it would be 1614 * a hassle (in the write case). 1615 */ 1616 if (error) { 1617 break; 1618 } 1619 1620 /* 1621 * compute the size ("sz") of this transfer (in bytes). 
1622 */ 1623 off = byteoff % sdp->swd_bsize; 1624 sz = (1 + nra) * sdp->swd_bsize - off; 1625 if (sz > resid) 1626 sz = resid; 1627 1628 UVMHIST_LOG(pdhist, "sw_reg_strategy: " 1629 "vp %#jx/%#jx offset %#jx/%#jx", 1630 (uintptr_t)sdp->swd_vp, (uintptr_t)devvp, byteoff, nbn); 1631 1632 nbp = getiobuf(devvp, true); 1633 nestiobuf_setup(bp, nbp, offset, sz); 1634 iobuf_redirect(nbp, devvp); 1635 nbp->b_blkno = nbn + btodb(off); 1636 KASSERT(nbp->b_iodone == nestiobuf_iodone); 1637 nbp->b_private2 = sdp; 1638 nbp->b_iodone = sw_reg_biodone; 1639 1640 /* sort it in and start I/O if we are not over our limit */ 1641 mutex_enter(&sdp->swd_lock); 1642 bufq_put(sdp->swd_tab, nbp); 1643 sw_reg_start(sdp); 1644 mutex_exit(&sdp->swd_lock); 1645 1646 /* 1647 * at this point "nbp" might have been freed. 1648 */ 1649 1650 /* 1651 * advance to the next I/O 1652 */ 1653 byteoff += sz; 1654 offset += sz; 1655 } 1656 if (resid > 0) { 1657 KASSERT(error != 0); 1658 nestiobuf_done(bp, resid, error); 1659 } 1660 } 1661 1662 /* 1663 * sw_reg_start: start an I/O request on the requested swapdev 1664 * 1665 * => reqs are sorted by b_rawblkno (above) 1666 */ 1667 static void 1668 sw_reg_start(struct swapdev *sdp) 1669 { 1670 struct buf *bp; 1671 struct vnode *vp; 1672 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1673 1674 KASSERT(mutex_owned(&sdp->swd_lock)); 1675 1676 /* recursion control */ 1677 if ((sdp->swd_flags & SWF_BUSY) != 0) 1678 return; 1679 1680 sdp->swd_flags |= SWF_BUSY; 1681 1682 while (sdp->swd_active < sdp->swd_maxactive) { 1683 bp = bufq_get(sdp->swd_tab); 1684 if (bp == NULL) 1685 break; 1686 sdp->swd_active++; 1687 1688 UVMHIST_LOG(pdhist, 1689 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx", 1690 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno, 1691 bp->b_bcount); 1692 vp = bp->b_vp; 1693 VOP_STRATEGY(vp, bp); 1694 } 1695 sdp->swd_flags &= ~SWF_BUSY; 1696 } 1697 1698 /* 1699 * sw_reg_biodone: one of our i/o's has completed 1700 */ 1701 static void 
1702 sw_reg_biodone(struct buf *bp) 1703 { 1704 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); 1705 } 1706 1707 /* 1708 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1709 * 1710 * => note that we can recover the vndbuf struct by casting the buf ptr 1711 */ 1712 static void 1713 sw_reg_iodone(struct work *wk, void *dummy) 1714 { 1715 struct buf *nbp = (void *)wk; 1716 struct swapdev *sdp = nbp->b_private2; 1717 1718 KASSERT(&nbp->b_work == wk); 1719 UVMHIST_FUNC(__func__); 1720 UVMHIST_CALLARGS(pdhist, " bp=%#jx vp=%#jx blkno=%#jx addr=%#jx", 1721 (uintptr_t)nbp, (uintptr_t)nbp->b_vp, nbp->b_blkno, 1722 (uintptr_t)nbp->b_data); 1723 UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx", 1724 nbp->b_bcount, nbp->b_resid, 0, 0); 1725 1726 /* 1727 * start next swapdev I/O if one is pending 1728 */ 1729 mutex_enter(&sdp->swd_lock); 1730 KASSERT(sdp->swd_active > 0); 1731 sdp->swd_active--; 1732 sw_reg_start(sdp); 1733 mutex_exit(&sdp->swd_lock); 1734 1735 nestiobuf_iodone(nbp); 1736 } 1737 1738 1739 /* 1740 * uvm_swap_alloc: allocate space on swap 1741 * 1742 * => allocation is done "round robin" down the priority list, as we 1743 * allocate in a priority we "rotate" the circle queue. 1744 * => space can be freed with uvm_swap_free 1745 * => we return the page slot number in /dev/drum (0 == invalid slot) 1746 * => we lock uvm_swap_data_lock 1747 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1748 */ 1749 int 1750 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) 1751 { 1752 struct swapdev *sdp; 1753 struct swappri *spp; 1754 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1755 1756 /* 1757 * no swap devices configured yet? definite failure. 1758 */ 1759 if (uvmexp.nswapdev < 1) 1760 return 0; 1761 1762 /* 1763 * XXXJAK: BEGIN HACK 1764 * 1765 * blist_alloc() in subr_blist.c will panic if we try to allocate 1766 * too many slots. 
1767 */ 1768 if (*nslots > BLIST_MAX_ALLOC) { 1769 if (__predict_false(lessok == false)) 1770 return 0; 1771 *nslots = BLIST_MAX_ALLOC; 1772 } 1773 /* XXXJAK: END HACK */ 1774 1775 /* 1776 * lock data lock, convert slots into blocks, and enter loop 1777 */ 1778 mutex_enter(&uvm_swap_data_lock); 1779 1780 ReTry: /* XXXMRG */ 1781 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1782 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1783 uint64_t result; 1784 1785 /* if it's not enabled, then we can't swap from it */ 1786 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1787 continue; 1788 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1789 continue; 1790 result = blist_alloc(sdp->swd_blist, *nslots); 1791 if (result == BLIST_NONE) { 1792 continue; 1793 } 1794 KASSERT(result < sdp->swd_drumsize); 1795 1796 /* 1797 * successful allocation! now rotate the tailq. 1798 */ 1799 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1800 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1801 sdp->swd_npginuse += *nslots; 1802 uvmexp.swpginuse += *nslots; 1803 mutex_exit(&uvm_swap_data_lock); 1804 /* done! return drum slot number */ 1805 UVMHIST_LOG(pdhist, 1806 "success! returning %jd slots starting at %jd", 1807 *nslots, result + sdp->swd_drumoffset, 0, 0); 1808 return (result + sdp->swd_drumoffset); 1809 } 1810 } 1811 1812 /* XXXMRG: BEGIN HACK */ 1813 if (*nslots > 1 && lessok) { 1814 *nslots = 1; 1815 /* XXXMRG: ugh! blist should support this for us */ 1816 goto ReTry; 1817 } 1818 /* XXXMRG: END HACK */ 1819 1820 mutex_exit(&uvm_swap_data_lock); 1821 return 0; 1822 } 1823 1824 /* 1825 * uvm_swapisfull: return true if most of available swap is allocated 1826 * and in use. we don't count some small portion as it may be inaccessible 1827 * to us at any given moment, for example if there is lock contention or if 1828 * pages are busy. 
1829 */ 1830 bool 1831 uvm_swapisfull(void) 1832 { 1833 int swpgonly; 1834 bool rv; 1835 1836 if (uvmexp.swpages == 0) { 1837 return true; 1838 } 1839 1840 mutex_enter(&uvm_swap_data_lock); 1841 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1842 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / 1843 uvm_swapisfull_factor); 1844 rv = (swpgonly >= uvmexp.swpgavail); 1845 mutex_exit(&uvm_swap_data_lock); 1846 1847 return (rv); 1848 } 1849 1850 /* 1851 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1852 * 1853 * => we lock uvm_swap_data_lock 1854 */ 1855 void 1856 uvm_swap_markbad(int startslot, int nslots) 1857 { 1858 struct swapdev *sdp; 1859 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist); 1860 1861 mutex_enter(&uvm_swap_data_lock); 1862 sdp = swapdrum_getsdp(startslot); 1863 KASSERT(sdp != NULL); 1864 1865 /* 1866 * we just keep track of how many pages have been marked bad 1867 * in this device, to make everything add up in swap_off(). 1868 * we assume here that the range of slots will all be within 1869 * one swap device. 1870 */ 1871 1872 KASSERT(uvmexp.swpgonly >= nslots); 1873 atomic_add_int(&uvmexp.swpgonly, -nslots); 1874 sdp->swd_npgbad += nslots; 1875 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0); 1876 mutex_exit(&uvm_swap_data_lock); 1877 } 1878 1879 /* 1880 * uvm_swap_free: free swap slots 1881 * 1882 * => this can be all or part of an allocation made by uvm_swap_alloc 1883 * => we lock uvm_swap_data_lock 1884 */ 1885 void 1886 uvm_swap_free(int startslot, int nslots) 1887 { 1888 struct swapdev *sdp; 1889 UVMHIST_FUNC(__func__); 1890 UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots, 1891 startslot, 0, 0); 1892 1893 /* 1894 * ignore attempts to free the "bad" slot. 1895 */ 1896 1897 if (startslot == SWSLOT_BAD) { 1898 return; 1899 } 1900 1901 /* 1902 * convert drum slot offset back to sdp, free the blocks 1903 * in the extent, and return. must hold pri lock to do 1904 * lookup and access the extent. 
1905 */ 1906 1907 mutex_enter(&uvm_swap_data_lock); 1908 sdp = swapdrum_getsdp(startslot); 1909 KASSERT(uvmexp.nswapdev >= 1); 1910 KASSERT(sdp != NULL); 1911 KASSERT(sdp->swd_npginuse >= nslots); 1912 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); 1913 sdp->swd_npginuse -= nslots; 1914 KASSERTMSG(uvmexp.swpginuse >= nslots, "swpginuse %d nslots %d", 1915 uvmexp.swpginuse, nslots); 1916 uvmexp.swpginuse -= nslots; 1917 mutex_exit(&uvm_swap_data_lock); 1918 } 1919 1920 /* 1921 * uvm_swap_put: put any number of pages into a contig place on swap 1922 * 1923 * => can be sync or async 1924 */ 1925 1926 int 1927 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1928 { 1929 int error; 1930 1931 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1932 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1933 return error; 1934 } 1935 1936 /* 1937 * uvm_swap_get: get a single page from swap 1938 * 1939 * => usually a sync op (from fault) 1940 */ 1941 1942 int 1943 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1944 { 1945 int error; 1946 1947 atomic_inc_uint(&uvmexp.nswget); 1948 KASSERT(flags & PGO_SYNCIO); 1949 if (swslot == SWSLOT_BAD) { 1950 return EIO; 1951 } 1952 1953 error = uvm_swap_io(&page, swslot, 1, B_READ | 1954 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1955 if (error == 0) { 1956 1957 /* 1958 * this page is no longer only in swap. 1959 */ 1960 1961 KASSERT(uvmexp.swpgonly > 0); 1962 atomic_dec_uint(&uvmexp.swpgonly); 1963 } 1964 return error; 1965 } 1966 1967 static void 1968 uvm_swap_encrypt_pages(int startslot, void *p, int npages) 1969 { 1970 struct swapdev *sdp; 1971 int i; 1972 1973 if (!atomic_load_relaxed(&uvm_swap_encrypt)) { 1974 return; 1975 } 1976 1977 /* 1978 * Get the swapdev so we can discriminate on the 1979 * encryption state. There may or may not be an 1980 * encryption key generated; we may or may not be asked 1981 * to encrypt swap. 1982 * 1983 * 1. NO KEY, NO ENCRYPTION: Nothing to do. 1984 * 1985 * 2. 
NO KEY, BUT ENCRYPTION: Generate a key, encrypt, 1986 * and mark the slots encrypted. 1987 * 1988 * 3. KEY, BUT NO ENCRYPTION: The slots may already be 1989 * marked encrypted from a past life. Mark them not 1990 * encrypted. 1991 * 1992 * 4. KEY, ENCRYPTION: Encrypt and mark the slots 1993 * encrypted. 1994 */ 1995 mutex_enter(&uvm_swap_data_lock); 1996 sdp = swapdrum_getsdp(startslot); 1997 if (!sdp->swd_encinit) { 1998 uvm_swap_genkey(sdp); 1999 } 2000 KASSERT(sdp->swd_encinit); 2001 mutex_exit(&uvm_swap_data_lock); 2002 2003 for (i = 0; i < npages; i++) { 2004 int s = startslot + i; 2005 KDASSERT(swapdrum_sdp_is(s, sdp)); 2006 KASSERT(s >= sdp->swd_drumoffset); 2007 s -= sdp->swd_drumoffset; 2008 KASSERT(s < sdp->swd_drumsize); 2009 uvm_swap_encryptpage(sdp, 2010 (void *)((uint8_t *)p + (vsize_t)i*PAGE_SIZE), s); 2011 } 2012 } 2013 2014 void 2015 uvm_swap_decrypt_pages(int startslot, void *p, int npages) 2016 { 2017 struct swapdev *sdp; 2018 bool encinit; 2019 int i; 2020 2021 if (!atomic_load_relaxed(&uvm_swap_encrypt)) { 2022 return; 2023 } 2024 2025 /* 2026 * Get the sdp. Everything about it except the encinit 2027 * bit, saying whether the encryption key is 2028 * initialized or not, and the encrypted bit for each 2029 * page, is stable until all swap pages have been 2030 * released and the device is removed. 2031 */ 2032 mutex_enter(&uvm_swap_data_lock); 2033 sdp = swapdrum_getsdp(startslot); 2034 encinit = sdp->swd_encinit; 2035 mutex_exit(&uvm_swap_data_lock); 2036 2037 /* 2038 * !encinit here means we are reading a swap device which has never 2039 * been written by the swap out process. this should be a user read 2040 * on /dev/drum. 
2041 */ 2042 if (!encinit) { 2043 memset(p, 0, npages * PAGE_SIZE); 2044 return; 2045 } 2046 for (i = 0; i < npages; i++) { 2047 int s = startslot + i; 2048 KDASSERT(swapdrum_sdp_is(s, sdp)); 2049 KASSERT(s >= sdp->swd_drumoffset); 2050 s -= sdp->swd_drumoffset; 2051 KASSERT(s < sdp->swd_drumsize); 2052 uvm_swap_decryptpage(sdp, 2053 (void *)((uint8_t *)p + (vsize_t)i*PAGE_SIZE), s); 2054 } 2055 } 2056 2057 /* 2058 * uvm_swap_io: do an i/o operation to swap 2059 */ 2060 2061 static int 2062 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 2063 { 2064 daddr_t startblk; 2065 struct buf *bp; 2066 vaddr_t kva; 2067 int error, mapinflags; 2068 bool write, async, swap_encrypt; 2069 UVMHIST_FUNC(__func__); 2070 UVMHIST_CALLARGS(pdhist, 2071 "<- called, startslot=%jd, npages=%jd, flags=%#jx", 2072 startslot, npages, flags, 0); 2073 2074 write = (flags & B_READ) == 0; 2075 async = (flags & B_ASYNC) != 0; 2076 swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt); 2077 2078 /* 2079 * allocate a buf for the i/o. 2080 */ 2081 2082 KASSERT(!uvm_lwp_is_pagedaemon(curlwp) || write); 2083 KASSERT(!uvm_lwp_is_pagedaemon(curlwp) || async); 2084 bp = getiobuf(swapdev_vp, !uvm_lwp_is_pagedaemon(curlwp)); 2085 if (bp == NULL) { 2086 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM); 2087 return ENOMEM; 2088 } 2089 2090 /* 2091 * convert starting drum slot to block number 2092 */ 2093 2094 startblk = btodb((uint64_t)startslot << PAGE_SHIFT); 2095 2096 /* 2097 * first, map the pages into the kernel. 2098 */ 2099 2100 mapinflags = !write ? 
2101 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : 2102 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; 2103 if (write && swap_encrypt) /* need to encrypt in-place */ 2104 mapinflags |= UVMPAGER_MAPIN_READ; 2105 kva = uvm_pagermapin(pps, npages, mapinflags); 2106 2107 /* 2108 * encrypt writes in place if requested 2109 */ 2110 2111 if (write) { 2112 uvm_swap_encrypt_pages(startslot, (void *)kva, npages); 2113 } 2114 2115 /* 2116 * fill in the bp/sbp. we currently route our i/o through 2117 * /dev/drum's vnode [swapdev_vp]. 2118 */ 2119 2120 bp->b_cflags = BC_BUSY | BC_NOCACHE; 2121 bp->b_flags = (flags & (B_READ|B_ASYNC)); 2122 bp->b_proc = &proc0; /* XXX */ 2123 bp->b_vnbufs.le_next = NOLIST; 2124 bp->b_data = (void *)kva; 2125 bp->b_blkno = startblk; 2126 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; 2127 2128 /* 2129 * bump v_numoutput (counter of number of active outputs). 2130 */ 2131 2132 if (write) { 2133 mutex_enter(swapdev_vp->v_interlock); 2134 swapdev_vp->v_numoutput++; 2135 mutex_exit(swapdev_vp->v_interlock); 2136 } 2137 2138 /* 2139 * for async ops we must set up the iodone handler. 2140 */ 2141 2142 if (async) { 2143 bp->b_iodone = uvm_aio_aiodone; 2144 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); 2145 if (uvm_lwp_is_pagedaemon(curlwp)) 2146 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 2147 else 2148 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 2149 } else { 2150 bp->b_iodone = NULL; 2151 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 2152 } 2153 UVMHIST_LOG(pdhist, 2154 "about to start io: data = %#jx blkno = %#jx, bcount = %jd", 2155 (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0); 2156 2157 /* 2158 * now we start the I/O, and if async, return. 2159 */ 2160 2161 VOP_STRATEGY(swapdev_vp, bp); 2162 if (async) { 2163 /* 2164 * Reads are always synchronous; if this changes, we 2165 * need to add an asynchronous path for decryption. 2166 */ 2167 KASSERT(write); 2168 return 0; 2169 } 2170 2171 /* 2172 * must be sync i/o. 
wait for it to finish 2173 */ 2174 2175 error = biowait(bp); 2176 if (error) 2177 goto out; 2178 2179 /* 2180 * decrypt reads in place if needed 2181 */ 2182 2183 if (!write) { 2184 uvm_swap_decrypt_pages(startslot, (void *)kva, npages); 2185 } 2186 out: 2187 /* 2188 * kill the pager mapping 2189 */ 2190 2191 uvm_pagermapout(kva, npages); 2192 2193 /* 2194 * now dispose of the buf and we're done. 2195 */ 2196 2197 if (write) { 2198 mutex_enter(swapdev_vp->v_interlock); 2199 vwakeup(bp); 2200 mutex_exit(swapdev_vp->v_interlock); 2201 } 2202 putiobuf(bp); 2203 UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0); 2204 2205 return (error); 2206 } 2207 2208 /* 2209 * uvm_swap_genkey(sdp) 2210 * 2211 * Generate a key for swap encryption. 2212 */ 2213 static void 2214 uvm_swap_genkey(struct swapdev *sdp) 2215 { 2216 uint8_t key[32]; 2217 2218 KASSERT(!sdp->swd_encinit); 2219 2220 cprng_strong(kern_cprng, key, sizeof key, 0); 2221 aes_setenckey256(&sdp->swd_enckey, key); 2222 aes_setdeckey256(&sdp->swd_deckey, key); 2223 explicit_memset(key, 0, sizeof key); 2224 2225 sdp->swd_encinit = true; 2226 } 2227 2228 /* 2229 * uvm_swap_encryptpage(sdp, kva, slot) 2230 * 2231 * Encrypt one page of data at kva for the specified slot number 2232 * in the swap device. 2233 */ 2234 static void 2235 uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot) 2236 { 2237 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); 2238 2239 /* iv := AES_k(le32enc(slot) || 0^96) */ 2240 le32enc(preiv, slot); 2241 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); 2242 2243 /* *kva := AES-CBC_k(iv, *kva) */ 2244 aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv, 2245 AES_256_NROUNDS); 2246 2247 explicit_memset(&iv, 0, sizeof iv); 2248 } 2249 2250 /* 2251 * uvm_swap_decryptpage(sdp, kva, slot) 2252 * 2253 * Decrypt one page of data at kva for the specified slot number 2254 * in the swap device. 
2255 */ 2256 static void 2257 uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot) 2258 { 2259 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16); 2260 2261 /* iv := AES_k(le32enc(slot) || 0^96) */ 2262 le32enc(preiv, slot); 2263 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS); 2264 2265 /* *kva := AES-CBC^{-1}_k(iv, *kva) */ 2266 aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv, 2267 AES_256_NROUNDS); 2268 2269 explicit_memset(&iv, 0, sizeof iv); 2270 } 2271 2272 static int 2273 sysctl_kern_uvm_swap_encrypt(SYSCTLFN_ARGS) 2274 { 2275 struct sysctlnode node; 2276 int swap_encrypt = uvm_swap_encrypt; 2277 int error; 2278 2279 node = *rnode; 2280 node.sysctl_data = &swap_encrypt; 2281 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 2282 if (error != 0 || newp == NULL) { 2283 return error; 2284 } 2285 /* 2286 * allow a change only when no swap is configured to ensure 2287 * that uvm_swap_encrypt is a constant from the POV of 2288 * any swap devices. 2289 */ 2290 error = 0; 2291 mutex_enter(&uvm_swap_data_lock); 2292 if (uvm_swap_encrypt != swap_encrypt) { 2293 if (LIST_EMPTY(&swap_priority)) { 2294 uvm_swap_encrypt = swap_encrypt; 2295 } else { 2296 error = EBUSY; 2297 } 2298 } 2299 mutex_exit(&uvm_swap_data_lock); 2300 2301 return error; 2302 } 2303 2304 SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup") 2305 { 2306 2307 sysctl_createv(clog, 0, NULL, NULL, 2308 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt", 2309 SYSCTL_DESCR("Encrypt data when swapped out to disk"), 2310 sysctl_kern_uvm_swap_encrypt, 0, NULL, 0, 2311 CTL_VM, CTL_CREATE, CTL_EOL); 2312 } 2313