1 /*	$NetBSD: uvm_swap.c,v 1.206 2021/08/23 13:08:18 hannken Exp $	*/
2
3 /*
4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.206 2021/08/23 13:08:18 hannken Exp $");
34
35 #include "opt_uvmhist.h"
36 #include "opt_compat_netbsd.h"
37 #include "opt_ddb.h"
38 #include "opt_vmswap.h"
39
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/atomic.h>
43 #include <sys/buf.h>
44 #include <sys/bufq.h>
45 #include <sys/conf.h>
46 #include <sys/cprng.h>
47 #include <sys/proc.h>
48 #include <sys/namei.h>
49 #include <sys/disklabel.h>
50 #include <sys/errno.h>
51 #include <sys/kernel.h>
52 #include <sys/vnode.h>
53 #include <sys/file.h>
54 #include <sys/vmem.h>
55 #include <sys/blist.h>
56 #include <sys/mount.h>
57 #include <sys/pool.h>
58 #include <sys/kmem.h>
59 #include <sys/syscallargs.h>
60 #include <sys/swap.h>
61 #include <sys/kauth.h>
62 #include <sys/sysctl.h>
63 #include <sys/workqueue.h>
64
65 #include <uvm/uvm.h>
66
67 #include <miscfs/specfs/specdev.h>
68
69 #include <crypto/aes/aes.h>
70 #include <crypto/aes/aes_cbc.h>
71
72 /*
73 * uvm_swap.c: manage configuration and i/o to swap space.
74 */
75
76 /*
77 * swap space is managed in the following way:
78 *
79 * each swap partition or file is described by a "swapdev" structure.
80 * each "swapdev" structure contains a "swapent" structure which contains
81 * information that is passed up to the user (via system calls).
82 *
83 * each swap partition is assigned a "priority" (int) which controls
84 * swap partition usage.
85 *
86 * the system maintains a global data structure describing all swap
87 * partitions/files. there is a sorted LIST of "swappri" structures
88 * which describe "swapdev"'s at that priority. this LIST is headed
89 * by the "swap_priority" global var. each "swappri" contains a
90 * TAILQ of "swapdev" structures at that priority.
91 *
92 * locking:
93 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
94 * system call and prevents the swap priority list from changing
95 * while we are in the middle of a system call (e.g. SWAP_STATS).
96 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
97 * structures including the priority list, the swapdev structures,
98 * and the swapmap arena.
99 *
100 * each swap device has the following info:
101 * - swap device in use (could be disabled, preventing future use)
102 * - swap enabled (allows new allocations on swap)
103 * - map info in /dev/drum
104 * - vnode pointer
105 * for swap files only:
106 * - block size
107 * - max byte count in buffer
108 * - buffer
109 *
110 * userland controls and configures swap with the swapctl(2) system call.
111 * sys_swapctl() performs the following operations:
112 * [1] SWAP_NSWAP: returns the number of swap devices currently configured
113 * [2] SWAP_STATS: given a pointer to an array of swapent structures
114 * (passed in via "arg") of a size passed in via "misc" ... we load
115 * the current swap config into the array. The actual work is done
116 * in the uvm_swap_stats() function.
117 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a
118 * priority in "misc", start swapping on it.
119 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
120 * [5] SWAP_CTL: changes the priority of a swap device (new priority in
121 * "misc")
122 */
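
/*
 * Illustrative sketch (not part of this file, error handling omitted):
 * how userland might query the swap configuration through the swapctl(2)
 * operations listed above.  The prototype comes from <unistd.h> and the
 * SWAP_* commands and struct swapent from <sys/swap.h>.
 *
 *	int nswap = swapctl(SWAP_NSWAP, NULL, 0);
 *	struct swapent *sep = calloc(nswap, sizeof(*sep));
 *	int n = swapctl(SWAP_STATS, sep, nswap);
 *	// sep[0 .. n-1] now describe each device: dev_t, flags,
 *	// total blocks, blocks in use and priority.
 */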
123
124 /*
125 * swapdev: describes a single swap partition/file
126 *
127 * note the following should be true:
128 * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks]
129 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
130 */
131 struct swapdev {
132 dev_t swd_dev; /* device id */
133 int swd_flags; /* flags:inuse/enable/fake */
134 int swd_priority; /* our priority */
135 int swd_nblks; /* blocks in this device */
136 char *swd_path; /* saved pathname of device */
137 int swd_pathlen; /* length of pathname */
138 int swd_npages; /* #pages we can use */
139 int swd_npginuse; /* #pages in use */
140 int swd_npgbad; /* #pages bad */
141 int swd_drumoffset; /* page0 offset in drum */
142 int swd_drumsize; /* #pages in drum */
143 blist_t swd_blist; /* blist for this swapdev */
144 struct vnode *swd_vp; /* backing vnode */
145 TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */
146
147 int swd_bsize; /* blocksize (bytes) */
148 int swd_maxactive; /* max active i/o reqs */
149 struct bufq_state *swd_tab; /* buffer list */
150 int swd_active; /* number of active buffers */
151
152 volatile uint32_t *swd_encmap; /* bitmap of encrypted slots */
153 struct aesenc swd_enckey; /* AES key expanded for enc */
154 struct aesdec swd_deckey; /* AES key expanded for dec */
155 bool swd_encinit; /* true if keys initialized */
156 };
157
158 /*
159 * swap device priority entry; the list is kept sorted on `spi_priority'.
160 */
161 struct swappri {
162 int spi_priority; /* priority */
163 TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
164 /* tailq of swapdevs at this priority */
165 LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */
166 };
167
168 /*
169 * The following two structures are used to keep track of data transfers
170 * on swap devices associated with regular files.
171 * NOTE: this code is more or less a copy of vnd.c; we use the same
172 * structure names here to ease porting.
173 */
174 struct vndxfer {
175 struct buf *vx_bp; /* Pointer to parent buffer */
176 struct swapdev *vx_sdp;
177 int vx_error;
178 int vx_pending; /* # of pending aux buffers */
179 int vx_flags;
180 #define VX_BUSY 1
181 #define VX_DEAD 2
182 };
183
184 struct vndbuf {
185 struct buf vb_buf;
186 struct vndxfer *vb_xfer;
187 };
188
189 /*
190 * We keep a pool of vndbuf's and vndxfer structures.
191 */
192 static struct pool vndxfer_pool, vndbuf_pool;
193
194 /*
195 * local variables
196 */
197 static vmem_t *swapmap; /* controls the mapping of /dev/drum */
198
199 /* list of all active swap devices [by priority] */
200 LIST_HEAD(swap_priority, swappri);
201 static struct swap_priority swap_priority;
202
203 /* locks */
204 static kmutex_t uvm_swap_data_lock __cacheline_aligned;
205 static krwlock_t swap_syscall_lock;
206 bool uvm_swap_init_done = false;
207
208 /* workqueue and use counter for swap to regular files */
209 static int sw_reg_count = 0;
210 static struct workqueue *sw_reg_workqueue;
211
212 /* tuneables */
213 u_int uvm_swapisfull_factor = 99;
214 #if VMSWAP_DEFAULT_PLAINTEXT
215 bool uvm_swap_encrypt = false;
216 #else
217 bool uvm_swap_encrypt = true;
218 #endif
219
220 /*
221 * prototypes
222 */
223 static struct swapdev *swapdrum_getsdp(int);
224
225 static struct swapdev *swaplist_find(struct vnode *, bool);
226 static void swaplist_insert(struct swapdev *,
227 struct swappri *, int);
228 static void swaplist_trim(void);
229
230 static int swap_on(struct lwp *, struct swapdev *);
231 static int swap_off(struct lwp *, struct swapdev *);
232
233 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
234 static void sw_reg_biodone(struct buf *);
235 static void sw_reg_iodone(struct work *wk, void *dummy);
236 static void sw_reg_start(struct swapdev *);
237
238 static int uvm_swap_io(struct vm_page **, int, int, int);
239
240 static void uvm_swap_genkey(struct swapdev *);
241 static void uvm_swap_encryptpage(struct swapdev *, void *, int);
242 static void uvm_swap_decryptpage(struct swapdev *, void *, int);
243
244 static size_t
245 encmap_size(size_t npages)
246 {
247 struct swapdev *sdp;
248 const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
249 const size_t bitsperword = NBBY * bytesperword;
250 const size_t nbits = npages; /* one bit for each page */
251 const size_t nwords = howmany(nbits, bitsperword);
252 const size_t nbytes = nwords * bytesperword;
253
254 return nbytes;
255 }
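
/*
 * Example (assuming 32-bit words, i.e. bytesperword == 4): a device with
 * npages == 100 needs howmany(100, 32) == 4 words, so encmap_size()
 * returns 16 bytes -- one bit per page, rounded up to a whole word.
 */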
256
257 /*
258 * uvm_swap_init: init the swap system data structures and locks
259 *
260 * => called at boot time from init_main.c after the filesystems
261 * are brought up (which happens after uvm_init())
262 */
263 void
264 uvm_swap_init(void)
265 {
266 UVMHIST_FUNC(__func__);
267
268 UVMHIST_CALLED(pdhist);
269 /*
270 * first, init the swap list, its counter, and its lock.
271 * then get a handle on the vnode for /dev/drum by using
272 * its dev_t number ("swapdev", from MD conf.c).
273 */
274
275 LIST_INIT(&swap_priority);
276 uvmexp.nswapdev = 0;
277 rw_init(&swap_syscall_lock);
278 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
279
280 if (bdevvp(swapdev, &swapdev_vp))
281 panic("%s: can't get vnode for swap device", __func__);
282 if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
283 panic("%s: can't lock swap device", __func__);
284 if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
285 panic("%s: can't open swap device", __func__);
286 VOP_UNLOCK(swapdev_vp);
287
288 /*
289 * create swap block resource map to map /dev/drum. the range
290 * from 1 to INT_MAX allows 2 gigablocks of swap space. note
291 * that block 0 is reserved (used to indicate an allocation
292 * failure, or no allocation).
293 */
294 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
295 VM_NOSLEEP, IPL_NONE);
296 if (swapmap == 0) {
297 panic("%s: vmem_create failed", __func__);
298 }
299
300 pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
301 NULL, IPL_BIO);
302 pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
303 NULL, IPL_BIO);
304
305 uvm_swap_init_done = true;
306
307 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
308 }
309
310 /*
311 * swaplist functions: functions that operate on the list of swap
312 * devices on the system.
313 */
314
315 /*
316 * swaplist_insert: insert swap device "sdp" into the global list
317 *
318 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
319 * => caller must provide a newly allocated swappri structure (we will
320 *	FREE it if we don't need it... this is to prevent allocation
321 * blocking here while adding swap)
322 */
323 static void
324 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
325 {
326 struct swappri *spp, *pspp;
327 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
328
329 KASSERT(rw_write_held(&swap_syscall_lock));
330 KASSERT(mutex_owned(&uvm_swap_data_lock));
331
332 /*
333 * find entry at or after which to insert the new device.
334 */
335 pspp = NULL;
336 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
337 if (priority <= spp->spi_priority)
338 break;
339 pspp = spp;
340 }
341
342 /*
343 * new priority?
344 */
345 if (spp == NULL || spp->spi_priority != priority) {
346 spp = newspp; /* use newspp! */
347 UVMHIST_LOG(pdhist, "created new swappri = %jd",
348 priority, 0, 0, 0);
349
350 spp->spi_priority = priority;
351 TAILQ_INIT(&spp->spi_swapdev);
352
353 if (pspp)
354 LIST_INSERT_AFTER(pspp, spp, spi_swappri);
355 else
356 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
357 } else {
358 /* we don't need a new priority structure, free it */
359 kmem_free(newspp, sizeof(*newspp));
360 }
361
362 /*
363 * priority found (or created). now insert on the priority's
364 * tailq list and bump the total number of swapdevs.
365 */
366 sdp->swd_priority = priority;
367 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
368 uvmexp.nswapdev++;
369 }
370
371 /*
372 * swaplist_find: find and optionally remove a swap device from the
373 * global list.
374 *
375 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
376 * => we return the swapdev we found (and removed)
377 */
378 static struct swapdev *
379 swaplist_find(struct vnode *vp, bool remove)
380 {
381 struct swapdev *sdp;
382 struct swappri *spp;
383
384 KASSERT(rw_lock_held(&swap_syscall_lock));
385 KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
386 KASSERT(mutex_owned(&uvm_swap_data_lock));
387
388 /*
389 * search the lists for the requested vp
390 */
391
392 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
393 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
394 if (sdp->swd_vp == vp) {
395 if (remove) {
396 TAILQ_REMOVE(&spp->spi_swapdev,
397 sdp, swd_next);
398 uvmexp.nswapdev--;
399 }
400 return(sdp);
401 }
402 }
403 }
404 return (NULL);
405 }
406
407 /*
408 * swaplist_trim: scan priority list for empty priority entries and kill
409 * them.
410 *
411 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
412 */
413 static void
414 swaplist_trim(void)
415 {
416 struct swappri *spp, *nextspp;
417
418 KASSERT(rw_write_held(&swap_syscall_lock));
419 KASSERT(mutex_owned(&uvm_swap_data_lock));
420
421 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
422 if (!TAILQ_EMPTY(&spp->spi_swapdev))
423 continue;
424 LIST_REMOVE(spp, spi_swappri);
425 kmem_free(spp, sizeof(*spp));
426 }
427 }
428
429 /*
430 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
431 * to the "swapdev" that maps that section of the drum.
432 *
433 * => each swapdev takes one big contig chunk of the drum
434 * => caller must hold uvm_swap_data_lock
435 */
436 static struct swapdev *
437 swapdrum_getsdp(int pgno)
438 {
439 struct swapdev *sdp;
440 struct swappri *spp;
441
442 KASSERT(mutex_owned(&uvm_swap_data_lock));
443
444 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
445 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
446 if (sdp->swd_flags & SWF_FAKE)
447 continue;
448 if (pgno >= sdp->swd_drumoffset &&
449 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
450 return sdp;
451 }
452 }
453 }
454 return NULL;
455 }
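
/*
 * Example (illustrative numbers only): a swapdev with swd_drumoffset 1024
 * and swd_drumsize 2048 owns drum pages [1024, 3072), so
 * swapdrum_getsdp(2000) returns it while swapdrum_getsdp(3072) does not.
 */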
456
457 /*
458 * swapdrum_sdp_is: true iff the swap device for pgno is sdp
459 *
460 * => for use in positive assertions only; result is not stable
461 */
462 static bool __debugused
463 swapdrum_sdp_is(int pgno, struct swapdev *sdp)
464 {
465 bool result;
466
467 mutex_enter(&uvm_swap_data_lock);
468 result = swapdrum_getsdp(pgno) == sdp;
469 mutex_exit(&uvm_swap_data_lock);
470
471 return result;
472 }
473
474 void swapsys_lock(krw_t op)
475 {
476 rw_enter(&swap_syscall_lock, op);
477 }
478
479 void swapsys_unlock(void)
480 {
481 rw_exit(&swap_syscall_lock);
482 }
483
484 static void
485 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
486 {
487 se->se_dev = sdp->swd_dev;
488 se->se_flags = sdp->swd_flags;
489 se->se_nblks = sdp->swd_nblks;
490 se->se_inuse = inuse;
491 se->se_priority = sdp->swd_priority;
492 KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
493 strcpy(se->se_path, sdp->swd_path);
494 }
495
496 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
497 (void *)enosys;
498 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
499 (void *)enosys;
500
501 /*
502 * sys_swapctl: main entry point for swapctl(2) system call
503 * [with two helper functions: swap_on and swap_off]
504 */
505 int
506 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
507 {
508 /* {
509 syscallarg(int) cmd;
510 syscallarg(void *) arg;
511 syscallarg(int) misc;
512 } */
513 struct vnode *vp;
514 struct nameidata nd;
515 struct swappri *spp;
516 struct swapdev *sdp;
517 #define SWAP_PATH_MAX (PATH_MAX + 1)
518 char *userpath;
519 size_t len = 0;
520 int error;
521 int priority;
522 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
523
524 /*
525 * we handle the non-privileged NSWAP and STATS requests first.
526 *
527 * SWAP_NSWAP: return number of config'd swap devices
528 * [can also be obtained with uvmexp sysctl]
529 */
530 if (SCARG(uap, cmd) == SWAP_NSWAP) {
531 const int nswapdev = uvmexp.nswapdev;
532 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
533 0, 0, 0);
534 *retval = nswapdev;
535 return 0;
536 }
537
538 userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
539
540 /*
541 * ensure serialized syscall access by grabbing the swap_syscall_lock
542 */
543 rw_enter(&swap_syscall_lock, RW_WRITER);
544
545 /*
546 * SWAP_STATS: get stats on current # of configured swap devs
547 *
548 * note that the swap_priority list can't change as long
549 * as we are holding the swap_syscall_lock. we don't want
550 * to grab the uvm_swap_data_lock because we may fault&sleep during
551 * copyout() and we don't want to be holding that lock then!
552 */
553 switch (SCARG(uap, cmd)) {
554 case SWAP_STATS13:
555 error = (*uvm_swap_stats13)(uap, retval);
556 goto out;
557 case SWAP_STATS50:
558 error = (*uvm_swap_stats50)(uap, retval);
559 goto out;
560 case SWAP_STATS:
561 error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
562 NULL, sizeof(struct swapent), retval);
563 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
564 goto out;
565
566 case SWAP_GETDUMPDEV:
567 error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
568 goto out;
569 default:
570 break;
571 }
572
573 /*
574 * all other requests require superuser privs. verify.
575 */
576 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
577 0, NULL, NULL, NULL)))
578 goto out;
579
580 if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
581 /* drop the current dump device */
582 dumpdev = NODEV;
583 dumpcdev = NODEV;
584 cpu_dumpconf();
585 goto out;
586 }
587
588 /*
589 * at this point we expect a path name in arg. we will
590 * use namei() to gain a vnode reference (vref), and lock
591 * the vnode (VOP_LOCK).
592 *
593 * XXX: a NULL arg means use the root vnode pointer (e.g. for
594 * miniroot)
595 */
596 if (SCARG(uap, arg) == NULL) {
597 vp = rootvp; /* miniroot */
598 vref(vp);
599 if (vn_lock(vp, LK_EXCLUSIVE)) {
600 vrele(vp);
601 error = EBUSY;
602 goto out;
603 }
604 if (SCARG(uap, cmd) == SWAP_ON &&
605 copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
606 panic("swapctl: miniroot copy failed");
607 } else {
608 struct pathbuf *pb;
609
610 /*
611 * This used to allow copying in one extra byte
612 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
613 * This was completely pointless because if anyone
614 * used that extra byte namei would fail with
615 * ENAMETOOLONG anyway, so I've removed the excess
616 * logic. - dholland 20100215
617 */
618
619 error = pathbuf_copyin(SCARG(uap, arg), &pb);
620 if (error) {
621 goto out;
622 }
623 if (SCARG(uap, cmd) == SWAP_ON) {
624 /* get a copy of the string */
625 pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
626 len = strlen(userpath) + 1;
627 }
628 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
629 if ((error = namei(&nd))) {
630 pathbuf_destroy(pb);
631 goto out;
632 }
633 vp = nd.ni_vp;
634 pathbuf_destroy(pb);
635 }
636 /* note: "vp" is referenced and locked */
637
638 error = 0; /* assume no error */
639 switch(SCARG(uap, cmd)) {
640
641 case SWAP_DUMPDEV:
642 if (vp->v_type != VBLK) {
643 error = ENOTBLK;
644 break;
645 }
646 if (bdevsw_lookup(vp->v_rdev)) {
647 dumpdev = vp->v_rdev;
648 dumpcdev = devsw_blk2chr(dumpdev);
649 } else
650 dumpdev = NODEV;
651 cpu_dumpconf();
652 break;
653
654 case SWAP_CTL:
655 /*
656 * get new priority, remove old entry (if any) and then
657 * reinsert it in the correct place. finally, prune out
658 * any empty priority structures.
659 */
660 priority = SCARG(uap, misc);
661 spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
662 mutex_enter(&uvm_swap_data_lock);
663 if ((sdp = swaplist_find(vp, true)) == NULL) {
664 error = ENOENT;
665 } else {
666 swaplist_insert(sdp, spp, priority);
667 swaplist_trim();
668 }
669 mutex_exit(&uvm_swap_data_lock);
670 if (error)
671 kmem_free(spp, sizeof(*spp));
672 break;
673
674 case SWAP_ON:
675
676 /*
677 * check for duplicates. if none found, then insert a
678 * dummy entry on the list to prevent someone else from
679 * trying to enable this device while we are working on
680 * it.
681 */
682
683 priority = SCARG(uap, misc);
684 sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
685 spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
686 sdp->swd_flags = SWF_FAKE;
687 sdp->swd_vp = vp;
688 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
689 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
690 mutex_enter(&uvm_swap_data_lock);
691 if (swaplist_find(vp, false) != NULL) {
692 error = EBUSY;
693 mutex_exit(&uvm_swap_data_lock);
694 bufq_free(sdp->swd_tab);
695 kmem_free(sdp, sizeof(*sdp));
696 kmem_free(spp, sizeof(*spp));
697 break;
698 }
699 swaplist_insert(sdp, spp, priority);
700 mutex_exit(&uvm_swap_data_lock);
701
702 KASSERT(len > 0);
703 sdp->swd_pathlen = len;
704 sdp->swd_path = kmem_alloc(len, KM_SLEEP);
705 if (copystr(userpath, sdp->swd_path, len, 0) != 0)
706 panic("swapctl: copystr");
707
708 /*
709 * we've now got a FAKE placeholder in the swap list.
710 * now attempt to enable swap on it. if we fail, undo
711 * what we've done and kill the fake entry we just inserted.
712 * if swap_on is a success, it will clear the SWF_FAKE flag
713 */
714
715 if ((error = swap_on(l, sdp)) != 0) {
716 mutex_enter(&uvm_swap_data_lock);
717 (void) swaplist_find(vp, true); /* kill fake entry */
718 swaplist_trim();
719 mutex_exit(&uvm_swap_data_lock);
720 bufq_free(sdp->swd_tab);
721 kmem_free(sdp->swd_path, sdp->swd_pathlen);
722 kmem_free(sdp, sizeof(*sdp));
723 break;
724 }
725 break;
726
727 case SWAP_OFF:
728 mutex_enter(&uvm_swap_data_lock);
729 if ((sdp = swaplist_find(vp, false)) == NULL) {
730 mutex_exit(&uvm_swap_data_lock);
731 error = ENXIO;
732 break;
733 }
734
735 /*
736 * If a device isn't in use or enabled, we
737 * can't stop swapping from it (again).
738 */
739 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
740 mutex_exit(&uvm_swap_data_lock);
741 error = EBUSY;
742 break;
743 }
744
745 /*
746 * do the real work.
747 */
748 error = swap_off(l, sdp);
749 break;
750
751 default:
752 error = EINVAL;
753 }
754
755 /*
756 * done! release the ref gained by namei() and unlock.
757 */
758 vput(vp);
759 out:
760 rw_exit(&swap_syscall_lock);
761 kmem_free(userpath, SWAP_PATH_MAX);
762
763 UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0);
764 return (error);
765 }
766
767 /*
768 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
769 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
770 * emulation to use it directly without going through sys_swapctl().
771 * The problem with using sys_swapctl() there is that it involves
772 * copying the swapent array to the stackgap, and this array's size
773 * is not known at build time. Hence it would not be possible to
774 * ensure it would fit in the stackgap in any case.
775 */
776 int
777 uvm_swap_stats(char *ptr, int misc,
778 void (*f)(void *, const struct swapent *), size_t len,
779 register_t *retval)
780 {
781 struct swappri *spp;
782 struct swapdev *sdp;
783 struct swapent sep;
784 int count = 0;
785 int error;
786
787 KASSERT(len <= sizeof(sep));
788 if (len == 0)
789 return ENOSYS;
790
791 if (misc < 0)
792 return EINVAL;
793
794 if (misc == 0 || uvmexp.nswapdev == 0)
795 return 0;
796
797 /* Make sure userland cannot exhaust kernel memory */
798 if ((size_t)misc > (size_t)uvmexp.nswapdev)
799 misc = uvmexp.nswapdev;
800
801 KASSERT(rw_lock_held(&swap_syscall_lock));
802
803 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
804 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
805 int inuse;
806
807 if (misc-- <= 0)
808 break;
809
810 inuse = btodb((uint64_t)sdp->swd_npginuse <<
811 PAGE_SHIFT);
812
813 memset(&sep, 0, sizeof(sep));
814 swapent_cvt(&sep, sdp, inuse);
815 if (f)
816 (*f)(&sep, &sep);
817 if ((error = copyout(&sep, ptr, len)) != 0)
818 return error;
819 ptr += len;
820 count++;
821 }
822 }
823 *retval = count;
824 return 0;
825 }
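
/*
 * Illustrative sketch (not part of this file) of how a COMPAT_* wrapper
 * might use the conversion callback: "f" rewrites the native swapent in
 * place into the older layout and "len" is the size of that older
 * structure.  The swapent13 layout and helper name are assumptions here.
 *
 *	static void
 *	swapent_to_13(void *p, const struct swapent *se)
 *	{
 *		struct swapent13 *se13 = p;
 *		...	// narrow/copy the fields, reading se before writing
 *	}
 *
 *	error = uvm_swap_stats(ptr, misc, swapent_to_13,
 *	    sizeof(struct swapent13), retval);
 */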
826
827 /*
828 * swap_on: attempt to enable a swapdev for swapping. note that the
829 * swapdev is already on the global list, but disabled (marked
830 * SWF_FAKE).
831 *
832 * => we avoid the start of the disk (to protect disk labels)
833 * => we also avoid the miniroot, if we are swapping to root.
834 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
835 * if needed.
836 */
837 static int
838 swap_on(struct lwp *l, struct swapdev *sdp)
839 {
840 struct vnode *vp;
841 int error, npages, nblocks, size;
842 long addr;
843 vmem_addr_t result;
844 struct vattr va;
845 dev_t dev;
846 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
847
848 /*
849 * we want to enable swapping on sdp. the swd_vp contains
850 * the vnode we want (locked and ref'd), and the swd_dev
851 * contains the dev_t of the file, if it is a block device.
852 */
853
854 vp = sdp->swd_vp;
855 dev = sdp->swd_dev;
856
857 /*
858 * open the swap file (mostly useful for block device files to
859 * let device driver know what is up).
860 *
861 * we skip the open/close for root on swap because the root
862 * has already been opened when root was mounted (mountroot).
863 */
864 if (vp != rootvp) {
865 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
866 return (error);
867 }
868
869 /* XXX this only works for block devices */
870 UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);
871
872 /*
873 * we now need to determine the size of the swap area. for
874 * block specials we can call the d_psize function.
875 * for normal files, we must stat [get attrs].
876 *
877 * we put the result in nblocks.
878 * for normal files, we also want the filesystem block size
879 * (which we get with statfs).
880 */
881 switch (vp->v_type) {
882 case VBLK:
883 if ((nblocks = bdev_size(dev)) == -1) {
884 error = ENXIO;
885 goto bad;
886 }
887 break;
888
889 case VREG:
890 if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
891 goto bad;
892 nblocks = (int)btodb(va.va_size);
893 sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
894 /*
895 * limit the max # of outstanding I/O requests we issue
896 * at any one time. take it easy on NFS servers.
897 */
898 if (vp->v_tag == VT_NFS)
899 sdp->swd_maxactive = 2; /* XXX */
900 else
901 sdp->swd_maxactive = 8; /* XXX */
902 break;
903
904 default:
905 error = ENXIO;
906 goto bad;
907 }
908
909 /*
910 * save nblocks in a safe place and convert to pages.
911 */
912
913 sdp->swd_nblks = nblocks;
914 npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
915
916 /*
917 * for block special files, we want to make sure that we leave
918 * the disklabel and bootblocks alone, so we arrange to skip
919 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
920 * note that because of this the "size" can be less than the
921 * actual number of blocks on the device.
922 */
923 if (vp->v_type == VBLK) {
924 /* we use pages 1 to (size - 1) [inclusive] */
925 size = npages - 1;
926 addr = 1;
927 } else {
928 /* we use pages 0 to (size - 1) [inclusive] */
929 size = npages;
930 addr = 0;
931 }
932
933 /*
934 * make sure we have enough blocks for a reasonable sized swap
935 * area. we want at least one page.
936 */
937
938 if (size < 1) {
939 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
940 error = EINVAL;
941 goto bad;
942 }
943
944 UVMHIST_LOG(pdhist, " dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);
945
946 /*
947 * now we need to allocate a blist to manage this swap device
948 */
949
950 sdp->swd_blist = blist_create(npages);
951 /* mark all except the `saved' region free. */
952 blist_free(sdp->swd_blist, addr, size);
953
954 /*
955 * allocate space for swap encryption state and mark the
956 * keys uninitialized so we generate them lazily
957 */
958 sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
959 sdp->swd_encinit = false;
960
961 /*
962 * if the vnode we are swapping to is the root vnode
963 * (i.e. we are swapping to the miniroot) then we want
964 * to make sure we don't overwrite it. do a statfs to
965 * find its size and skip over it.
966 */
967 if (vp == rootvp) {
968 struct mount *mp;
969 struct statvfs *sp;
970 int rootblocks, rootpages;
971
972 mp = rootvnode->v_mount;
973 sp = &mp->mnt_stat;
974 rootblocks = sp->f_blocks * btodb(sp->f_frsize);
975 /*
976 * XXX: sp->f_blocks isn't the total number of
977 * blocks in the filesystem, it's the number of
978 * data blocks. so, our rootblocks almost
979 * definitely underestimates the total size
980 * of the filesystem - how badly depends on the
981 * details of the filesystem type. there isn't
982 * an obvious way to deal with this cleanly
983 * and perfectly, so for now we just pad our
984 * rootblocks estimate with an extra 5 percent.
985 */
986 rootblocks += (rootblocks >> 5) +
987 (rootblocks >> 6) +
988 (rootblocks >> 7);
989 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
990 if (rootpages > size)
991 panic("swap_on: miniroot larger than swap?");
992
993 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
994 panic("swap_on: unable to preserve miniroot");
995 }
996
997 size -= rootpages;
998 printf("Preserved %d pages of miniroot ", rootpages);
999 printf("leaving %d pages of swap\n", size);
1000 }
1001
1002 /*
1003 * add a ref to vp to reflect usage as a swap device.
1004 */
1005 vref(vp);
1006
1007 /*
1008 * now add the new swapdev to the drum and enable.
1009 */
1010 error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
1011 if (error != 0)
1012 panic("swapdrum_add");
1013 /*
1014 * If this is the first regular swap create the workqueue.
1015 * => Protected by swap_syscall_lock.
1016 */
1017 if (vp->v_type != VBLK) {
1018 if (sw_reg_count++ == 0) {
1019 KASSERT(sw_reg_workqueue == NULL);
1020 if (workqueue_create(&sw_reg_workqueue, "swapiod",
1021 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
1022 panic("%s: workqueue_create failed", __func__);
1023 }
1024 }
1025
1026 sdp->swd_drumoffset = (int)result;
1027 sdp->swd_drumsize = npages;
1028 sdp->swd_npages = size;
1029 mutex_enter(&uvm_swap_data_lock);
1030 sdp->swd_flags &= ~SWF_FAKE; /* going live */
1031 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
1032 uvmexp.swpages += size;
1033 uvmexp.swpgavail += size;
1034 mutex_exit(&uvm_swap_data_lock);
1035 return (0);
1036
1037 /*
1038 * failure: clean up and return error.
1039 */
1040
1041 bad:
1042 if (sdp->swd_blist) {
1043 blist_destroy(sdp->swd_blist);
1044 }
1045 if (vp != rootvp) {
1046 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
1047 }
1048 return (error);
1049 }
1050
1051 /*
1052 * swap_off: stop swapping on swapdev
1053 *
1054 * => swap data should be locked, we will unlock.
1055 */
1056 static int
1057 swap_off(struct lwp *l, struct swapdev *sdp)
1058 {
1059 int npages = sdp->swd_npages;
1060 int error = 0;
1061
1062 UVMHIST_FUNC(__func__);
1063 UVMHIST_CALLARGS(pdhist, " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0);
1064
1065 KASSERT(rw_write_held(&swap_syscall_lock));
1066 KASSERT(mutex_owned(&uvm_swap_data_lock));
1067
1068 /* disable the swap area being removed */
1069 sdp->swd_flags &= ~SWF_ENABLE;
1070 uvmexp.swpgavail -= npages;
1071 mutex_exit(&uvm_swap_data_lock);
1072
1073 /*
1074 * the idea is to find all the pages that are paged out to this
1075 * device, and page them all in. in uvm, swap-backed pageable
1076 * memory can take two forms: aobjs and anons. call the
1077 * swapoff hook for each subsystem to bring in pages.
1078 */
1079
1080 if (uao_swap_off(sdp->swd_drumoffset,
1081 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1082 amap_swap_off(sdp->swd_drumoffset,
1083 sdp->swd_drumoffset + sdp->swd_drumsize)) {
1084 error = ENOMEM;
1085 } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1086 error = EBUSY;
1087 }
1088
1089 if (error) {
1090 mutex_enter(&uvm_swap_data_lock);
1091 sdp->swd_flags |= SWF_ENABLE;
1092 uvmexp.swpgavail += npages;
1093 mutex_exit(&uvm_swap_data_lock);
1094
1095 return error;
1096 }
1097
1098 /*
1099 * If this is the last regular swap destroy the workqueue.
1100 * => Protected by swap_syscall_lock.
1101 */
1102 if (sdp->swd_vp->v_type != VBLK) {
1103 KASSERT(sw_reg_count > 0);
1104 KASSERT(sw_reg_workqueue != NULL);
1105 if (--sw_reg_count == 0) {
1106 workqueue_destroy(sw_reg_workqueue);
1107 sw_reg_workqueue = NULL;
1108 }
1109 }
1110
1111 /*
1112 * done with the vnode.
1113 * drop our ref on the vnode before calling VOP_CLOSE()
1114 * so that spec_close() can tell if this is the last close.
1115 */
1116 vrele(sdp->swd_vp);
1117 if (sdp->swd_vp != rootvp) {
1118 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
1119 }
1120
1121 mutex_enter(&uvm_swap_data_lock);
1122 uvmexp.swpages -= npages;
1123 uvmexp.swpginuse -= sdp->swd_npgbad;
1124
1125 if (swaplist_find(sdp->swd_vp, true) == NULL)
1126 panic("%s: swapdev not in list", __func__);
1127 swaplist_trim();
1128 mutex_exit(&uvm_swap_data_lock);
1129
1130 /*
1131 * free all resources!
1132 */
1133 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1134 blist_destroy(sdp->swd_blist);
1135 bufq_free(sdp->swd_tab);
1136 kmem_free(__UNVOLATILE(sdp->swd_encmap),
1137 encmap_size(sdp->swd_drumsize));
1138 explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey);
1139 explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey);
1140 kmem_free(sdp, sizeof(*sdp));
1141 return (0);
1142 }
1143
1144 void
1145 uvm_swap_shutdown(struct lwp *l)
1146 {
1147 struct swapdev *sdp;
1148 struct swappri *spp;
1149 struct vnode *vp;
1150 int error;
1151
1152 if (!uvm_swap_init_done || uvmexp.nswapdev == 0)
1153 return;
1154 printf("turning off swap...");
1155 rw_enter(&swap_syscall_lock, RW_WRITER);
1156 mutex_enter(&uvm_swap_data_lock);
1157 again:
1158 LIST_FOREACH(spp, &swap_priority, spi_swappri)
1159 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1160 if (sdp->swd_flags & SWF_FAKE)
1161 continue;
1162 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
1163 continue;
1164 #ifdef DEBUG
1165 printf("\nturning off swap on %s...", sdp->swd_path);
1166 #endif
1167 /* Have to lock and reference vnode for swap_off(). */
1168 vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY);
1169 vref(vp);
1170 error = swap_off(l, sdp);
1171 vput(vp);
1172 mutex_enter(&uvm_swap_data_lock);
1173 if (error) {
1174 printf("stopping swap on %s failed "
1175 "with error %d\n", sdp->swd_path, error);
1176 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1177 uvmexp.nswapdev--;
1178 swaplist_trim();
1179 }
1180 goto again;
1181 }
1182 printf(" done\n");
1183 mutex_exit(&uvm_swap_data_lock);
1184 rw_exit(&swap_syscall_lock);
1185 }
1186
1187
1188 /*
1189 * /dev/drum interface and i/o functions
1190 */
1191
1192 /*
1193 * swstrategy: perform I/O on the drum
1194 *
1195 * => we must map the i/o request from the drum to the correct swapdev.
1196 */
1197 static void
1198 swstrategy(struct buf *bp)
1199 {
1200 struct swapdev *sdp;
1201 struct vnode *vp;
1202 int pageno, bn;
1203 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1204
1205 /*
1206 * convert block number to swapdev. note that swapdev can't
1207 * be yanked out from under us because we are holding resources
1208 * in it (i.e. the blocks we are doing I/O on).
1209 */
1210 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1211 mutex_enter(&uvm_swap_data_lock);
1212 sdp = swapdrum_getsdp(pageno);
1213 mutex_exit(&uvm_swap_data_lock);
1214 if (sdp == NULL) {
1215 bp->b_error = EINVAL;
1216 bp->b_resid = bp->b_bcount;
1217 biodone(bp);
1218 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0);
1219 return;
1220 }
1221
1222 /*
1223 * convert drum page number to block number on this swapdev.
1224 */
1225
1226 pageno -= sdp->swd_drumoffset; /* page # on swapdev */
1227 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
1228
1229 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd",
1230 ((bp->b_flags & B_READ) == 0) ? 1 : 0,
1231 sdp->swd_drumoffset, bn, bp->b_bcount);
1232
1233 /*
1234 * for block devices we finish up here.
1235 * for regular files we have to do more work which we delegate
1236 * to sw_reg_strategy().
1237 */
1238
1239 vp = sdp->swd_vp; /* swapdev vnode pointer */
1240 switch (vp->v_type) {
1241 default:
1242 panic("%s: vnode type 0x%x", __func__, vp->v_type);
1243
1244 case VBLK:
1245
1246 /*
1247 * must convert "bp" from an I/O on /dev/drum to an I/O
1248 * on the swapdev (sdp).
1249 */
1250 bp->b_blkno = bn; /* swapdev block number */
1251 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */
1252
1253 /*
1254 * if we are doing a write, we have to redirect the i/o on
1255 * drum's v_numoutput counter to the swapdevs.
1256 */
1257 if ((bp->b_flags & B_READ) == 0) {
1258 mutex_enter(bp->b_objlock);
1259 vwakeup(bp); /* kills one 'v_numoutput' on drum */
1260 mutex_exit(bp->b_objlock);
1261 mutex_enter(vp->v_interlock);
1262 vp->v_numoutput++; /* put it on swapdev */
1263 mutex_exit(vp->v_interlock);
1264 }
1265
1266 /*
1267 * finally plug in swapdev vnode and start I/O
1268 */
1269 bp->b_vp = vp;
1270 bp->b_objlock = vp->v_interlock;
1271 VOP_STRATEGY(vp, bp);
1272 return;
1273
1274 case VREG:
1275 /*
1276 * delegate to sw_reg_strategy function.
1277 */
1278 sw_reg_strategy(sdp, bp, bn);
1279 return;
1280 }
1281 /* NOTREACHED */
1282 }
1283
1284 /*
1285 * swread: the read function for the drum (just a call to physio)
1286 */
1287 /*ARGSUSED*/
1288 static int
1289 swread(dev_t dev, struct uio *uio, int ioflag)
1290 {
1291 UVMHIST_FUNC(__func__);
1292 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1293
1294 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1295 }
1296
1297 /*
1298 * swwrite: the write function for the drum (just a call to physio)
1299 */
1300 /*ARGSUSED*/
1301 static int
1302 swwrite(dev_t dev, struct uio *uio, int ioflag)
1303 {
1304 UVMHIST_FUNC(__func__);
1305 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1306
1307 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1308 }
1309
1310 const struct bdevsw swap_bdevsw = {
1311 .d_open = nullopen,
1312 .d_close = nullclose,
1313 .d_strategy = swstrategy,
1314 .d_ioctl = noioctl,
1315 .d_dump = nodump,
1316 .d_psize = nosize,
1317 .d_discard = nodiscard,
1318 .d_flag = D_OTHER
1319 };
1320
1321 const struct cdevsw swap_cdevsw = {
1322 .d_open = nullopen,
1323 .d_close = nullclose,
1324 .d_read = swread,
1325 .d_write = swwrite,
1326 .d_ioctl = noioctl,
1327 .d_stop = nostop,
1328 .d_tty = notty,
1329 .d_poll = nopoll,
1330 .d_mmap = nommap,
1331 .d_kqfilter = nokqfilter,
1332 .d_discard = nodiscard,
1333 .d_flag = D_OTHER,
1334 };
1335
1336 /*
1337 * sw_reg_strategy: handle swap i/o to regular files
1338 */
1339 static void
1340 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
1341 {
1342 struct vnode *vp;
1343 struct vndxfer *vnx;
1344 daddr_t nbn;
1345 char *addr;
1346 off_t byteoff;
1347 int s, off, nra, error, sz, resid;
1348 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1349
1350 /*
1351 * allocate a vndxfer head for this transfer and point it to
1352 * our buffer.
1353 */
1354 vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1355 vnx->vx_flags = VX_BUSY;
1356 vnx->vx_error = 0;
1357 vnx->vx_pending = 0;
1358 vnx->vx_bp = bp;
1359 vnx->vx_sdp = sdp;
1360
1361 /*
1362 * setup for main loop where we read filesystem blocks into
1363 * our buffer.
1364 */
1365 error = 0;
1366 bp->b_resid = bp->b_bcount; /* nothing transferred yet! */
1367 addr = bp->b_data; /* current position in buffer */
1368 byteoff = dbtob((uint64_t)bn);
1369
1370 for (resid = bp->b_resid; resid; resid -= sz) {
1371 struct vndbuf *nbp;
1372
1373 /*
1374 * translate byteoffset into block number. return values:
1375 * vp = vnode of underlying device
1376 * nbn = new block number (on underlying vnode dev)
1377 * nra = num blocks we can read-ahead (excludes requested
1378 * block)
1379 */
1380 nra = 0;
1381 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1382 &vp, &nbn, &nra);
1383
1384 if (error == 0 && nbn == (daddr_t)-1) {
1385 /*
1386 * this used to just set error, but that doesn't
1387 * do the right thing. Instead, it causes random
1388 * memory errors. The panic() should remain until
1389 * this condition doesn't destabilize the system.
1390 */
1391 #if 1
1392 panic("%s: swap to sparse file", __func__);
1393 #else
1394 error = EIO; /* failure */
1395 #endif
1396 }
1397
1398 /*
1399 * punt if there was an error or a hole in the file.
1400 * we must wait for any i/o ops we have already started
1401 * to finish before returning.
1402 *
1403 * XXX we could deal with holes here but it would be
1404 * a hassle (in the write case).
1405 */
1406 if (error) {
1407 s = splbio();
1408 vnx->vx_error = error; /* pass error up */
1409 goto out;
1410 }
1411
1412 /*
1413 * compute the size ("sz") of this transfer (in bytes).
1414 */
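		/*
		 * e.g. (illustrative numbers): with swd_bsize == 8192,
		 * byteoff == 12288 and nra == 1, off is 4096 and sz is
		 * 2 * 8192 - 4096 == 12288 bytes, clamped to resid below.
		 */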
1415 off = byteoff % sdp->swd_bsize;
1416 sz = (1 + nra) * sdp->swd_bsize - off;
1417 if (sz > resid)
1418 sz = resid;
1419
1420 UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1421 "vp %#jx/%#jx offset %#jx/%#jx",
1422 (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);
1423
1424 /*
1425 * now get a buf structure. note that the vb_buf is
1426 * at the front of the nbp structure so that you can
1427 * cast pointers between the two structures easily.
1428 */
1429 nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1430 buf_init(&nbp->vb_buf);
1431 nbp->vb_buf.b_flags = bp->b_flags;
1432 nbp->vb_buf.b_cflags = bp->b_cflags;
1433 nbp->vb_buf.b_oflags = bp->b_oflags;
1434 nbp->vb_buf.b_bcount = sz;
1435 nbp->vb_buf.b_bufsize = sz;
1436 nbp->vb_buf.b_error = 0;
1437 nbp->vb_buf.b_data = addr;
1438 nbp->vb_buf.b_lblkno = 0;
1439 nbp->vb_buf.b_blkno = nbn + btodb(off);
1440 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1441 nbp->vb_buf.b_iodone = sw_reg_biodone;
1442 nbp->vb_buf.b_vp = vp;
1443 nbp->vb_buf.b_objlock = vp->v_interlock;
1444 if (vp->v_type == VBLK) {
1445 nbp->vb_buf.b_dev = vp->v_rdev;
1446 }
1447
1448 nbp->vb_xfer = vnx; /* patch it back in to vnx */
1449
1450 /*
1451 * Just sort by block number
1452 */
1453 s = splbio();
1454 if (vnx->vx_error != 0) {
1455 buf_destroy(&nbp->vb_buf);
1456 pool_put(&vndbuf_pool, nbp);
1457 goto out;
1458 }
1459 vnx->vx_pending++;
1460
1461 /* sort it in and start I/O if we are not over our limit */
1462 /* XXXAD locking */
1463 bufq_put(sdp->swd_tab, &nbp->vb_buf);
1464 sw_reg_start(sdp);
1465 splx(s);
1466
1467 /*
1468 * advance to the next I/O
1469 */
1470 byteoff += sz;
1471 addr += sz;
1472 }
1473
1474 s = splbio();
1475
1476 out: /* Arrive here at splbio */
1477 vnx->vx_flags &= ~VX_BUSY;
1478 if (vnx->vx_pending == 0) {
1479 error = vnx->vx_error;
1480 pool_put(&vndxfer_pool, vnx);
1481 bp->b_error = error;
1482 biodone(bp);
1483 }
1484 splx(s);
1485 }
1486
1487 /*
1488 * sw_reg_start: start an I/O request on the requested swapdev
1489 *
1490 * => reqs are sorted by b_rawblkno (above)
1491 */
1492 static void
1493 sw_reg_start(struct swapdev *sdp)
1494 {
1495 struct buf *bp;
1496 struct vnode *vp;
1497 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1498
1499 /* recursion control */
1500 if ((sdp->swd_flags & SWF_BUSY) != 0)
1501 return;
1502
1503 sdp->swd_flags |= SWF_BUSY;
1504
1505 while (sdp->swd_active < sdp->swd_maxactive) {
1506 bp = bufq_get(sdp->swd_tab);
1507 if (bp == NULL)
1508 break;
1509 sdp->swd_active++;
1510
1511 UVMHIST_LOG(pdhist,
1512 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx",
1513 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
1514 bp->b_bcount);
1515 vp = bp->b_vp;
1516 KASSERT(bp->b_objlock == vp->v_interlock);
1517 if ((bp->b_flags & B_READ) == 0) {
1518 mutex_enter(vp->v_interlock);
1519 vp->v_numoutput++;
1520 mutex_exit(vp->v_interlock);
1521 }
1522 VOP_STRATEGY(vp, bp);
1523 }
1524 sdp->swd_flags &= ~SWF_BUSY;
1525 }
1526
1527 /*
1528 * sw_reg_biodone: one of our i/o's has completed
1529 */
1530 static void
1531 sw_reg_biodone(struct buf *bp)
1532 {
1533 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1534 }
1535
1536 /*
1537 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1538 *
1539 * => note that we can recover the vndbuf struct by casting the buf ptr
1540 */
1541 static void
1542 sw_reg_iodone(struct work *wk, void *dummy)
1543 {
1544 struct vndbuf *vbp = (void *)wk;
1545 struct vndxfer *vnx = vbp->vb_xfer;
1546 struct buf *pbp = vnx->vx_bp; /* parent buffer */
1547 struct swapdev *sdp = vnx->vx_sdp;
1548 int s, resid, error;
1549 KASSERT(&vbp->vb_buf.b_work == wk);
1550 UVMHIST_FUNC(__func__);
1551 UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx",
1552 (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
1553 (uintptr_t)vbp->vb_buf.b_data);
1554 UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx",
1555 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1556
1557 /*
1558 * protect vbp at splbio and update.
1559 */
1560
1561 s = splbio();
1562 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1563 pbp->b_resid -= resid;
1564 vnx->vx_pending--;
1565
1566 if (vbp->vb_buf.b_error != 0) {
1567 /* pass error upward */
1568 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1569 UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0);
1570 vnx->vx_error = error;
1571 }
1572
1573 /*
1574 * kill vbp structure
1575 */
1576 buf_destroy(&vbp->vb_buf);
1577 pool_put(&vndbuf_pool, vbp);
1578
1579 /*
1580 * wrap up this transaction if it has run to completion or, in
1581 * case of an error, when all auxiliary buffers have returned.
1582 */
1583 if (vnx->vx_error != 0) {
1584 /* pass error upward */
1585 error = vnx->vx_error;
1586 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1587 pbp->b_error = error;
1588 biodone(pbp);
1589 pool_put(&vndxfer_pool, vnx);
1590 }
1591 } else if (pbp->b_resid == 0) {
1592 KASSERT(vnx->vx_pending == 0);
1593 if ((vnx->vx_flags & VX_BUSY) == 0) {
1594 UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !",
1595 (uintptr_t)pbp, vnx->vx_error, 0, 0);
1596 biodone(pbp);
1597 pool_put(&vndxfer_pool, vnx);
1598 }
1599 }
1600
1601 /*
1602 * done! start next swapdev I/O if one is pending
1603 */
1604 sdp->swd_active--;
1605 sw_reg_start(sdp);
1606 splx(s);
1607 }
1608
1609
1610 /*
1611 * uvm_swap_alloc: allocate space on swap
1612 *
1613 * => allocation is done "round robin" down the priority list, as we
1614 * allocate in a priority we "rotate" the circle queue.
1615 * => space can be freed with uvm_swap_free
1616 * => we return the page slot number in /dev/drum (0 == invalid slot)
1617 * => we lock uvm_swap_data_lock
1618 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1619 */
1620 int
1621 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
1622 {
1623 struct swapdev *sdp;
1624 struct swappri *spp;
1625 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1626
1627 /*
1628 * no swap devices configured yet? definite failure.
1629 */
1630 if (uvmexp.nswapdev < 1)
1631 return 0;
1632
1633 /*
1634 * XXXJAK: BEGIN HACK
1635 *
1636 * blist_alloc() in subr_blist.c will panic if we try to allocate
1637 * too many slots.
1638 */
1639 if (*nslots > BLIST_MAX_ALLOC) {
1640 if (__predict_false(lessok == false))
1641 return 0;
1642 *nslots = BLIST_MAX_ALLOC;
1643 }
1644 /* XXXJAK: END HACK */
1645
1646 /*
1647 * lock data lock, convert slots into blocks, and enter loop
1648 */
1649 mutex_enter(&uvm_swap_data_lock);
1650
1651 ReTry: /* XXXMRG */
1652 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1653 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1654 uint64_t result;
1655
1656 /* if it's not enabled, then we can't swap from it */
1657 if ((sdp->swd_flags & SWF_ENABLE) == 0)
1658 continue;
1659 if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1660 continue;
1661 result = blist_alloc(sdp->swd_blist, *nslots);
1662 if (result == BLIST_NONE) {
1663 continue;
1664 }
1665 KASSERT(result < sdp->swd_drumsize);
1666
1667 /*
1668 * successful allocation! now rotate the tailq.
1669 */
1670 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1671 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1672 sdp->swd_npginuse += *nslots;
1673 uvmexp.swpginuse += *nslots;
1674 mutex_exit(&uvm_swap_data_lock);
1675 /* done! return drum slot number */
1676 UVMHIST_LOG(pdhist,
1677 "success! returning %jd slots starting at %jd",
1678 *nslots, result + sdp->swd_drumoffset, 0, 0);
1679 return (result + sdp->swd_drumoffset);
1680 }
1681 }
1682
1683 /* XXXMRG: BEGIN HACK */
1684 if (*nslots > 1 && lessok) {
1685 *nslots = 1;
1686 /* XXXMRG: ugh! blist should support this for us */
1687 goto ReTry;
1688 }
1689 /* XXXMRG: END HACK */
1690
1691 mutex_exit(&uvm_swap_data_lock);
1692 return 0;
1693 }
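
/*
 * Illustrative sketch (not part of this file) of the usual caller pattern
 * for uvm_swap_alloc() above and uvm_swap_free() below; everything except
 * those two functions is hypothetical.
 *
 *	int nslots = npages;
 *	int slot = uvm_swap_alloc(&nslots, true);  // may shrink nslots
 *	if (slot == 0)
 *		return ENOMEM;			   // slot 0 means failure
 *	...					   // write pages to the slots
 *	uvm_swap_free(slot, nslots);		   // release when done
 */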
1694
1695 /*
1696 * uvm_swapisfull: return true if most of available swap is allocated
1697 * and in use. we don't count some small portion as it may be inaccessible
1698 * to us at any given moment, for example if there is lock contention or if
1699 * pages are busy.
1700 */
1701 bool
1702 uvm_swapisfull(void)
1703 {
1704 int swpgonly;
1705 bool rv;
1706
1707 if (uvmexp.swpages == 0) {
1708 return true;
1709 }
1710
1711 mutex_enter(&uvm_swap_data_lock);
1712 KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
1713 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
1714 uvm_swapisfull_factor);
1715 rv = (swpgonly >= uvmexp.swpgavail);
1716 mutex_exit(&uvm_swap_data_lock);
1717
1718 return (rv);
1719 }
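
/*
 * Example: with the default uvm_swapisfull_factor of 99, swap is reported
 * full once swpgonly * 100 / 99 >= swpgavail, i.e. once roughly 99% of the
 * available swap pages hold the only copy of their data.
 */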
1720
1721 /*
1722 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1723 *
1724 * => we lock uvm_swap_data_lock
1725 */
1726 void
1727 uvm_swap_markbad(int startslot, int nslots)
1728 {
1729 struct swapdev *sdp;
1730 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1731
1732 mutex_enter(&uvm_swap_data_lock);
1733 sdp = swapdrum_getsdp(startslot);
1734 KASSERT(sdp != NULL);
1735
1736 /*
1737 * we just keep track of how many pages have been marked bad
1738 * in this device, to make everything add up in swap_off().
1739 * we assume here that the range of slots will all be within
1740 * one swap device.
1741 */
1742
1743 KASSERT(uvmexp.swpgonly >= nslots);
1744 atomic_add_int(&uvmexp.swpgonly, -nslots);
1745 sdp->swd_npgbad += nslots;
1746 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
1747 mutex_exit(&uvm_swap_data_lock);
1748 }
1749
1750 /*
1751 * uvm_swap_free: free swap slots
1752 *
1753 * => this can be all or part of an allocation made by uvm_swap_alloc
1754 * => we lock uvm_swap_data_lock
1755 */
1756 void
1757 uvm_swap_free(int startslot, int nslots)
1758 {
1759 struct swapdev *sdp;
1760 UVMHIST_FUNC(__func__);
1761 UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots,
1762 startslot, 0, 0);
1763
1764 /*
1765 * ignore attempts to free the "bad" slot.
1766 */
1767
1768 if (startslot == SWSLOT_BAD) {
1769 return;
1770 }
1771
1772 /*
1773 * convert drum slot offset back to sdp, free the blocks
1774 * in the blist, and return.  must hold uvm_swap_data_lock to do the
1775 * lookup and access the blist.
1776 */
1777
1778 mutex_enter(&uvm_swap_data_lock);
1779 sdp = swapdrum_getsdp(startslot);
1780 KASSERT(uvmexp.nswapdev >= 1);
1781 KASSERT(sdp != NULL);
1782 KASSERT(sdp->swd_npginuse >= nslots);
1783 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1784 sdp->swd_npginuse -= nslots;
1785 uvmexp.swpginuse -= nslots;
1786 mutex_exit(&uvm_swap_data_lock);
1787 }
1788
1789 /*
1790 * uvm_swap_put: put any number of pages into a contig place on swap
1791 *
1792 * => can be sync or async
1793 */
1794
1795 int
1796 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
1797 {
1798 int error;
1799
1800 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1801 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1802 return error;
1803 }
1804
1805 /*
1806 * uvm_swap_get: get a single page from swap
1807 *
1808 * => usually a sync op (from fault)
1809 */
1810
1811 int
1812 uvm_swap_get(struct vm_page *page, int swslot, int flags)
1813 {
1814 int error;
1815
1816 atomic_inc_uint(&uvmexp.nswget);
1817 KASSERT(flags & PGO_SYNCIO);
1818 if (swslot == SWSLOT_BAD) {
1819 return EIO;
1820 }
1821
1822 error = uvm_swap_io(&page, swslot, 1, B_READ |
1823 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1824 if (error == 0) {
1825
1826 /*
1827 * this page is no longer only in swap.
1828 */
1829
1830 KASSERT(uvmexp.swpgonly > 0);
1831 atomic_dec_uint(&uvmexp.swpgonly);
1832 }
1833 return error;
1834 }
1835
1836 /*
1837 * uvm_swap_io: do an i/o operation to swap
1838 */
1839
1840 static int
1841 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
1842 {
1843 daddr_t startblk;
1844 struct buf *bp;
1845 vaddr_t kva;
1846 int error, mapinflags;
1847 bool write, async, swap_encrypt;
1848 UVMHIST_FUNC(__func__);
1849 UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx",
1850 startslot, npages, flags, 0);
1851
1852 write = (flags & B_READ) == 0;
1853 async = (flags & B_ASYNC) != 0;
1854 swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt);
1855
1856 /*
1857 * allocate a buf for the i/o.
1858 */
1859
1860 KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
1861 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1862 if (bp == NULL) {
1863 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1864 return ENOMEM;
1865 }
1866
1867 /*
1868 * convert starting drum slot to block number
1869 */
1870
1871 startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
1872
1873 /*
1874 * first, map the pages into the kernel.
1875 */
1876
1877 mapinflags = !write ?
1878 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1879 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1880 if (write && swap_encrypt) /* need to encrypt in-place */
1881 mapinflags |= UVMPAGER_MAPIN_READ;
1882 kva = uvm_pagermapin(pps, npages, mapinflags);
1883
1884 /*
1885 * encrypt writes in place if requested
1886 */
1887
1888 if (write) do {
1889 struct swapdev *sdp;
1890 int i;
1891
1892 /*
1893 * Get the swapdev so we can discriminate on the
1894 * encryption state. There may or may not be an
1895 * encryption key generated; we may or may not be asked
1896 * to encrypt swap.
1897 *
1898 * 1. NO KEY, NO ENCRYPTION: Nothing to do.
1899 *
1900 * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
1901 * and mark the slots encrypted.
1902 *
1903 * 3. KEY, BUT NO ENCRYPTION: The slots may already be
1904 * marked encrypted from a past life. Mark them not
1905 * encrypted.
1906 *
1907 * 4. KEY, ENCRYPTION: Encrypt and mark the slots
1908 * encrypted.
1909 */
1910 mutex_enter(&uvm_swap_data_lock);
1911 sdp = swapdrum_getsdp(startslot);
1912 if (!sdp->swd_encinit) {
1913 if (!swap_encrypt) {
1914 mutex_exit(&uvm_swap_data_lock);
1915 break;
1916 }
1917 uvm_swap_genkey(sdp);
1918 }
1919 KASSERT(sdp->swd_encinit);
1920 mutex_exit(&uvm_swap_data_lock);
1921
1922 for (i = 0; i < npages; i++) {
1923 int s = startslot + i;
1924 KDASSERT(swapdrum_sdp_is(s, sdp));
1925 KASSERT(s >= sdp->swd_drumoffset);
1926 s -= sdp->swd_drumoffset;
1927 KASSERT(s < sdp->swd_drumsize);
1928
1929 if (swap_encrypt) {
1930 uvm_swap_encryptpage(sdp,
1931 (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
1932 atomic_or_32(&sdp->swd_encmap[s/32],
1933 __BIT(s%32));
1934 } else {
1935 atomic_and_32(&sdp->swd_encmap[s/32],
1936 ~__BIT(s%32));
1937 }
1938 }
1939 } while (0);
1940
1941 /*
1942 * fill in the bp/sbp. we currently route our i/o through
1943 * /dev/drum's vnode [swapdev_vp].
1944 */
1945
1946 bp->b_cflags = BC_BUSY | BC_NOCACHE;
1947 bp->b_flags = (flags & (B_READ|B_ASYNC));
1948 bp->b_proc = &proc0; /* XXX */
1949 bp->b_vnbufs.le_next = NOLIST;
1950 bp->b_data = (void *)kva;
1951 bp->b_blkno = startblk;
1952 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1953
1954 /*
1955 * bump v_numoutput (counter of number of active outputs).
1956 */
1957
1958 if (write) {
1959 mutex_enter(swapdev_vp->v_interlock);
1960 swapdev_vp->v_numoutput++;
1961 mutex_exit(swapdev_vp->v_interlock);
1962 }
1963
1964 /*
1965 * for async ops we must set up the iodone handler.
1966 */
1967
1968 if (async) {
1969 bp->b_iodone = uvm_aio_aiodone;
1970 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1971 if (curlwp == uvm.pagedaemon_lwp)
1972 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1973 else
1974 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1975 } else {
1976 bp->b_iodone = NULL;
1977 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1978 }
1979 UVMHIST_LOG(pdhist,
1980 "about to start io: data = %#jx blkno = %#jx, bcount = %jd",
1981 (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1982
1983 /*
1984 * now we start the I/O, and if async, return.
1985 */
1986
1987 VOP_STRATEGY(swapdev_vp, bp);
1988 if (async) {
1989 /*
1990 * Reads are always synchronous; if this changes, we
1991 * need to add an asynchronous path for decryption.
1992 */
1993 KASSERT(write);
1994 return 0;
1995 }
1996
1997 /*
1998 * must be sync i/o. wait for it to finish
1999 */
2000
2001 error = biowait(bp);
2002 if (error)
2003 goto out;
2004
2005 /*
2006 * decrypt reads in place if needed
2007 */
2008
2009 if (!write) do {
2010 struct swapdev *sdp;
2011 bool encinit;
2012 int i;
2013
2014 /*
2015 * Get the sdp. Everything about it except the encinit
2016 * bit, saying whether the encryption key is
2017 * initialized or not, and the encrypted bit for each
2018 * page, is stable until all swap pages have been
2019 * released and the device is removed.
2020 */
2021 mutex_enter(&uvm_swap_data_lock);
2022 sdp = swapdrum_getsdp(startslot);
2023 encinit = sdp->swd_encinit;
2024 mutex_exit(&uvm_swap_data_lock);
2025
2026 if (!encinit)
2027 /*
2028 * If there's no encryption key, there's no way
2029 * any of these slots can be encrypted, so
2030 * nothing to do here.
2031 */
2032 break;
2033 for (i = 0; i < npages; i++) {
2034 int s = startslot + i;
2035 KDASSERT(swapdrum_sdp_is(s, sdp));
2036 KASSERT(s >= sdp->swd_drumoffset);
2037 s -= sdp->swd_drumoffset;
2038 KASSERT(s < sdp->swd_drumsize);
2039 if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
2040 __BIT(s%32)) == 0)
2041 continue;
2042 uvm_swap_decryptpage(sdp,
2043 (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
2044 }
2045 } while (0);
2046 out:
2047 /*
2048 * kill the pager mapping
2049 */
2050
2051 uvm_pagermapout(kva, npages);
2052
2053 /*
2054 * now dispose of the buf and we're done.
2055 */
2056
2057 if (write) {
2058 mutex_enter(swapdev_vp->v_interlock);
2059 vwakeup(bp);
2060 mutex_exit(swapdev_vp->v_interlock);
2061 }
2062 putiobuf(bp);
2063 UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0);
2064
2065 return (error);
2066 }
2067
2068 /*
2069 * uvm_swap_genkey(sdp)
2070 *
2071 * Generate a key for swap encryption.
2072 */
2073 static void
2074 uvm_swap_genkey(struct swapdev *sdp)
2075 {
2076 uint8_t key[32];
2077
2078 KASSERT(!sdp->swd_encinit);
2079
2080 cprng_strong(kern_cprng, key, sizeof key, 0);
2081 aes_setenckey256(&sdp->swd_enckey, key);
2082 aes_setdeckey256(&sdp->swd_deckey, key);
2083 explicit_memset(key, 0, sizeof key);
2084
2085 sdp->swd_encinit = true;
2086 }
2087
2088 /*
2089 * uvm_swap_encryptpage(sdp, kva, slot)
2090 *
2091 * Encrypt one page of data at kva for the specified slot number
2092 * in the swap device.
2093 */
2094 static void
2095 uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
2096 {
2097 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2098
2099 /* iv := AES_k(le32enc(slot) || 0^96) */
2100 le32enc(preiv, slot);
2101 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2102
2103 /* *kva := AES-CBC_k(iv, *kva) */
2104 aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
2105 AES_256_NROUNDS);
2106
2107 explicit_memset(&iv, 0, sizeof iv);
2108 }
2109
2110 /*
2111 * uvm_swap_decryptpage(sdp, kva, slot)
2112 *
2113 * Decrypt one page of data at kva for the specified slot number
2114 * in the swap device.
2115 */
2116 static void
2117 uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
2118 {
2119 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2120
2121 /* iv := AES_k(le32enc(slot) || 0^96) */
2122 le32enc(preiv, slot);
2123 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2124
2125 /* *kva := AES-CBC^{-1}_k(iv, *kva) */
2126 aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
2127 AES_256_NROUNDS);
2128
2129 explicit_memset(&iv, 0, sizeof iv);
2130 }
2131
2132 SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
2133 {
2134
2135 sysctl_createv(clog, 0, NULL, NULL,
2136 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
2137 SYSCTL_DESCR("Encrypt data when swapped out to disk"),
2138 NULL, 0, &uvm_swap_encrypt, 0,
2139 CTL_VM, CTL_CREATE, CTL_EOL);
2140 }
2141