1 /* $NetBSD: subr_vmem.c,v 1.117 2024/12/06 19:17:44 riastradh Exp $ */
2
3 /*-
4 * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi,
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 /*
30 * reference:
31 * - Magazines and Vmem: Extending the Slab Allocator
32 * to Many CPUs and Arbitrary Resources
33 * http://www.usenix.org/event/usenix01/bonwick.html
34 *
35 * locking & the boundary tag pool:
36 * - A pool(9) is used for vmem boundary tags
37 * - During a pool_get call the global vmem_btag_refill_lock is taken
38 * to serialize access to the allocation reserve, but no other
39 * vmem arena locks are held.
40 * - During pool_put calls no vmem mutexes are locked.
41 * - pool_drain doesn't hold the pool's mutex while releasing memory to
42 * its backing allocator, so it cannot interfere with any vmem mutexes.
43 * - The boundary tag pool is forced to put page headers into pool pages
44 * (PR_PHINPAGE) and not off page to avoid pool recursion.
45 * (due to sizeof(bt_t) it should be the case anyway)
46 */
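/*
 * A rough usage sketch of the vmem(9) API implemented below (the arena
 * name, base, size and quantum are made-up values for illustration):
 *
 *	vmem_t *arena;
 *	vmem_addr_t va;
 *
 *	arena = vmem_create("example", 64 * PAGE_SIZE, 256 * PAGE_SIZE,
 *	    PAGE_SIZE, NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE);
 *	if (vmem_alloc(arena, 3 * PAGE_SIZE,
 *	    VM_SLEEP | VM_INSTANTFIT, &va) == 0) {
 *		... use [va, va + 3 * PAGE_SIZE) ...
 *		vmem_free(arena, va, 3 * PAGE_SIZE);
 *	}
 *	vmem_destroy(arena);
 */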
47
48 #include <sys/cdefs.h>
49 __KERNEL_RCSID(0, "$NetBSD: subr_vmem.c,v 1.117 2024/12/06 19:17:44 riastradh Exp $");
50
51 #if defined(_KERNEL) && defined(_KERNEL_OPT)
52 #include "opt_ddb.h"
53 #endif /* defined(_KERNEL) && defined(_KERNEL_OPT) */
54
55 #include <sys/param.h>
56 #include <sys/types.h>
57
58 #include <sys/bitops.h>
59 #include <sys/hash.h>
60 #include <sys/queue.h>
61
62 #if defined(_KERNEL)
63
64 #include <sys/atomic.h>
65 #include <sys/callout.h>
66 #include <sys/kernel.h> /* hz */
67 #include <sys/kmem.h>
68 #include <sys/pool.h>
69 #include <sys/systm.h>
70 #include <sys/vmem.h>
71 #include <sys/vmem_impl.h>
72 #include <sys/workqueue.h>
73
74 #include <uvm/uvm.h>
75 #include <uvm/uvm_extern.h>
76 #include <uvm/uvm_km.h>
77 #include <uvm/uvm_page.h>
78 #include <uvm/uvm_pdaemon.h>
79
80 #else /* defined(_KERNEL) */
81
82 #include <assert.h>
83 #include <errno.h>
84 #include <stdio.h>
85 #include <stdlib.h>
86 #include <string.h>
87
88 #include "../sys/vmem.h"
89 #include "../sys/vmem_impl.h"
90
91 #endif /* defined(_KERNEL) */
92
93 #if defined(_KERNEL)
94
95 #include <sys/evcnt.h>
96
97 #define VMEM_EVCNT_DEFINE(name) \
98 struct evcnt vmem_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \
99 "vmem", #name); \
100 EVCNT_ATTACH_STATIC(vmem_evcnt_##name)
101 #define VMEM_EVCNT_INCR(ev) (vmem_evcnt_##ev.ev_count++)
102 #define VMEM_EVCNT_DECR(ev) (vmem_evcnt_##ev.ev_count--)
103
104 VMEM_EVCNT_DEFINE(static_bt_count);
105 VMEM_EVCNT_DEFINE(static_bt_inuse);
106
107 #define VMEM_CONDVAR_INIT(vm, wchan) cv_init(&vm->vm_cv, wchan)
108 #define VMEM_CONDVAR_DESTROY(vm) cv_destroy(&vm->vm_cv)
109 #define VMEM_CONDVAR_WAIT(vm) cv_wait(&vm->vm_cv, &vm->vm_lock)
110 #define VMEM_CONDVAR_BROADCAST(vm) cv_broadcast(&vm->vm_cv)
111
112 #else /* defined(_KERNEL) */
113
114 #define VMEM_EVCNT_INCR(ev) __nothing
115 #define VMEM_EVCNT_DECR(ev) __nothing
116
117 #define VMEM_CONDVAR_INIT(vm, wchan) __nothing
118 #define VMEM_CONDVAR_DESTROY(vm) __nothing
119 #define VMEM_CONDVAR_WAIT(vm) __nothing
120 #define VMEM_CONDVAR_BROADCAST(vm) __nothing
121
122 #define UNITTEST
123 #define KASSERT(a) assert(a)
124 #define KASSERTMSG(a, m, ...) assert(a)
125 #define mutex_init(a, b, c) __nothing
126 #define mutex_destroy(a) __nothing
127 #define mutex_enter(a) __nothing
128 #define mutex_tryenter(a) true
129 #define mutex_exit(a) __nothing
130 #define mutex_owned(a) true
131 #define ASSERT_SLEEPABLE() __nothing
132 #define panic(...) (printf(__VA_ARGS__), abort())
133
134 #endif /* defined(_KERNEL) */
135
136 #if defined(VMEM_SANITY)
137 static void vmem_check(vmem_t *);
138 #else /* defined(VMEM_SANITY) */
139 #define vmem_check(vm) __nothing
140 #endif /* defined(VMEM_SANITY) */
141
142 #define VMEM_HASHSIZE_MIN 1 /* XXX */
143 #define VMEM_HASHSIZE_MAX 65536 /* XXX */
144 #define VMEM_HASHSIZE_INIT 1
145
146 #define VM_FITMASK (VM_BESTFIT | VM_INSTANTFIT)
147
148 #if defined(_KERNEL)
149 static bool vmem_bootstrapped = false;
150 static kmutex_t vmem_list_lock;
151 static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list);
152 #endif /* defined(_KERNEL) */
153
154 /* ---- misc */
155
156 #define VMEM_LOCK(vm) mutex_enter(&(vm)->vm_lock)
157 #define VMEM_TRYLOCK(vm) mutex_tryenter(&(vm)->vm_lock)
158 #define VMEM_UNLOCK(vm) mutex_exit(&(vm)->vm_lock)
159 #define VMEM_LOCK_INIT(vm, ipl) mutex_init(&(vm)->vm_lock, MUTEX_DEFAULT, (ipl))
160 #define VMEM_LOCK_DESTROY(vm) mutex_destroy(&(vm)->vm_lock)
161 #define VMEM_ASSERT_LOCKED(vm) KASSERT(mutex_owned(&(vm)->vm_lock))
162
163 #define VMEM_ALIGNUP(addr, align) \
164 (-(-(addr) & -(align)))
165
166 #define VMEM_CROSS_P(addr1, addr2, boundary) \
167 ((((addr1) ^ (addr2)) & -(boundary)) != 0)
168
169 #define ORDER2SIZE(order) ((vmem_size_t)1 << (order))
170 #define SIZE2ORDER(size) ((int)ilog2(size))
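/*
 * Worked examples of the helpers above (values chosen arbitrarily):
 *
 *	VMEM_ALIGNUP(5, 4) == 8, VMEM_ALIGNUP(8, 4) == 8
 *		round addr up to a multiple of align (a power of 2)
 *	VMEM_CROSS_P(0x0ff0, 0x1010, 0x1000) != 0
 *		the two addresses lie in different 0x1000-aligned blocks
 *	SIZE2ORDER(6) == 2, ORDER2SIZE(2) == 4
 *		SIZE2ORDER yields the exponent of the largest power of two
 *		not greater than size
 */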
171
172 static void
173 vmem_kick_pdaemon(void)
174 {
175 #if defined(_KERNEL)
176 uvm_kick_pdaemon();
177 #endif
178 }
179
180 static void vmem_xfree_bt(vmem_t *, bt_t *);
181
182 #if !defined(_KERNEL)
183 #define xmalloc(sz, flags) malloc(sz)
184 #define xfree(p, sz) free(p)
185 #define bt_alloc(vm, flags) malloc(sizeof(bt_t))
186 #define bt_free(vm, bt) free(bt)
187 #define bt_freetrim(vm, l) __nothing
188 #else /* defined(_KERNEL) */
189
190 #define xmalloc(sz, flags) \
191 kmem_alloc(sz, ((flags) & VM_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
192 #define xfree(p, sz) kmem_free(p, sz);
193
194 /*
195 * BT_RESERVE calculation:
196 * we allocate memory for boundary tags with vmem; therefore we have
197 * to keep a reserve of bts used to allocate memory for bts.
198 * This reserve is 4 for each arena involved in allocating vmem's memory.
199 * BT_MAXFREE: don't cache excessive counts of bts in arenas
200 */
201 #define STATIC_BT_COUNT 200
202 #define BT_MINRESERVE 4
203 #define BT_MAXFREE 64
204
205 static struct vmem_btag static_bts[STATIC_BT_COUNT];
206 static int static_bt_count = STATIC_BT_COUNT;
207
208 static struct vmem kmem_va_meta_arena_store;
209 vmem_t *kmem_va_meta_arena;
210 static struct vmem kmem_meta_arena_store;
211 vmem_t *kmem_meta_arena = NULL;
212
213 static kmutex_t vmem_btag_refill_lock;
214 static kmutex_t vmem_btag_lock;
215 static LIST_HEAD(, vmem_btag) vmem_btag_freelist;
216 static size_t vmem_btag_freelist_count = 0;
217 static struct pool vmem_btag_pool;
218 static bool vmem_btag_pool_initialized __read_mostly;
219
220 /* ---- boundary tag */
221
222 static int bt_refill(vmem_t *vm);
223 static int bt_refill_locked(vmem_t *vm);
224
225 static void *
226 pool_page_alloc_vmem_meta(struct pool *pp, int flags)
227 {
228 const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
229 vmem_addr_t va;
230 int ret;
231
232 ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz,
233 (vflags & ~VM_FITMASK) | VM_INSTANTFIT | VM_POPULATING, &va);
234
235 return ret ? NULL : (void *)va;
236 }
237
238 static void
239 pool_page_free_vmem_meta(struct pool *pp, void *v)
240 {
241
242 vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz);
243 }
244
245 /* allocator for vmem-pool metadata */
246 struct pool_allocator pool_allocator_vmem_meta = {
247 .pa_alloc = pool_page_alloc_vmem_meta,
248 .pa_free = pool_page_free_vmem_meta,
249 .pa_pagesz = 0
250 };
251
252 static int
253 bt_refill_locked(vmem_t *vm)
254 {
255 bt_t *bt;
256
257 VMEM_ASSERT_LOCKED(vm);
258
259 if (vm->vm_nfreetags > BT_MINRESERVE) {
260 return 0;
261 }
262
263 mutex_enter(&vmem_btag_lock);
264 while (!LIST_EMPTY(&vmem_btag_freelist) &&
265 vm->vm_nfreetags <= BT_MINRESERVE &&
266 (vm->vm_flags & VM_PRIVTAGS) == 0) {
267 bt = LIST_FIRST(&vmem_btag_freelist);
268 LIST_REMOVE(bt, bt_freelist);
269 bt->bt_flags = 0;
270 LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
271 vm->vm_nfreetags++;
272 vmem_btag_freelist_count--;
273 VMEM_EVCNT_INCR(static_bt_inuse);
274 }
275 mutex_exit(&vmem_btag_lock);
276
277 while (vm->vm_nfreetags <= BT_MINRESERVE) {
278 VMEM_UNLOCK(vm);
279 KASSERT(vmem_btag_pool_initialized);
280 mutex_enter(&vmem_btag_refill_lock);
281 bt = pool_get(&vmem_btag_pool, PR_NOWAIT);
282 mutex_exit(&vmem_btag_refill_lock);
283 VMEM_LOCK(vm);
284 if (bt == NULL)
285 break;
286 bt->bt_flags = 0;
287 LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
288 vm->vm_nfreetags++;
289 }
290
291 if (vm->vm_nfreetags <= BT_MINRESERVE) {
292 return ENOMEM;
293 }
294
295 if (kmem_meta_arena != NULL) {
296 VMEM_UNLOCK(vm);
297 (void)bt_refill(kmem_arena);
298 (void)bt_refill(kmem_va_meta_arena);
299 (void)bt_refill(kmem_meta_arena);
300 VMEM_LOCK(vm);
301 }
302
303 return 0;
304 }
305
306 static int
307 bt_refill(vmem_t *vm)
308 {
309 int rv;
310
311 VMEM_LOCK(vm);
312 rv = bt_refill_locked(vm);
313 VMEM_UNLOCK(vm);
314 return rv;
315 }
316
317 static bt_t *
318 bt_alloc(vmem_t *vm, vm_flag_t flags)
319 {
320 bt_t *bt;
321
322 VMEM_ASSERT_LOCKED(vm);
323
324 while (vm->vm_nfreetags <= BT_MINRESERVE && (flags & VM_POPULATING) == 0) {
325 if (bt_refill_locked(vm)) {
326 if ((flags & VM_NOSLEEP) != 0) {
327 return NULL;
328 }
329
330 /*
331 * It would be nice to wait for something specific here
332 * but there are multiple ways that a retry could
333 * succeed and we can't wait for multiple things
334 * simultaneously. So we'll just sleep for an arbitrary
335 * short period of time and retry regardless.
336 * This should be a very rare case.
337 */
338
339 vmem_kick_pdaemon();
340 kpause("btalloc", false, 1, &vm->vm_lock);
341 }
342 }
343 bt = LIST_FIRST(&vm->vm_freetags);
344 LIST_REMOVE(bt, bt_freelist);
345 vm->vm_nfreetags--;
346
347 return bt;
348 }
349
350 static void
351 bt_free(vmem_t *vm, bt_t *bt)
352 {
353
354 VMEM_ASSERT_LOCKED(vm);
355
356 LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
357 vm->vm_nfreetags++;
358 }
359
360 static void
361 bt_freetrim(vmem_t *vm, int freelimit)
362 {
363 bt_t *bt, *next_bt;
364 LIST_HEAD(, vmem_btag) tofree;
365
366 VMEM_ASSERT_LOCKED(vm);
367
368 LIST_INIT(&tofree);
369
370 LIST_FOREACH_SAFE(bt, &vm->vm_freetags, bt_freelist, next_bt) {
371 if (vm->vm_nfreetags <= freelimit) {
372 break;
373 }
374 if (bt->bt_flags & BT_F_PRIVATE) {
375 continue;
376 }
377 LIST_REMOVE(bt, bt_freelist);
378 vm->vm_nfreetags--;
379 if (bt >= static_bts
380 && bt < &static_bts[STATIC_BT_COUNT]) {
381 mutex_enter(&vmem_btag_lock);
382 LIST_INSERT_HEAD(&vmem_btag_freelist, bt, bt_freelist);
383 vmem_btag_freelist_count++;
384 mutex_exit(&vmem_btag_lock);
385 VMEM_EVCNT_DECR(static_bt_inuse);
386 } else {
387 LIST_INSERT_HEAD(&tofree, bt, bt_freelist);
388 }
389 }
390
391 VMEM_UNLOCK(vm);
392 while (!LIST_EMPTY(&tofree)) {
393 bt = LIST_FIRST(&tofree);
394 LIST_REMOVE(bt, bt_freelist);
395 pool_put(&vmem_btag_pool, bt);
396 }
397 }
398
399 /*
400 * Add private boundary tags (statically-allocated by the caller)
401 * to a vmem arena's free tag list.
402 */
403 void
404 vmem_add_bts(vmem_t *vm, struct vmem_btag *bts, unsigned int nbts)
405 {
406 VMEM_LOCK(vm);
407 while (nbts != 0) {
408 bts->bt_flags = BT_F_PRIVATE;
409 LIST_INSERT_HEAD(&vm->vm_freetags, bts, bt_freelist);
410 vm->vm_nfreetags++;
411 bts++;
412 nbts--;
413 }
414 VMEM_UNLOCK(vm);
415 }
416 #endif /* defined(_KERNEL) */
417
418 /*
419 * freelist[0] ... [1, 1]
420 * freelist[1] ... [2, 3]
421 * freelist[2] ... [4, 7]
422 * freelist[3] ... [8, 15]
423 * :
424 * freelist[n] ... [(1 << n), (1 << (n + 1)) - 1]
425 * :
426 */
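/*
 * For example, with a 4 KB quantum a 24 KB free segment has qsize == 6,
 * so bt_freehead_tofree() files it on freelist[2] (4..7 quanta).  For an
 * allocation of the same size, bt_freehead_toalloc() returns freelist[3]
 * under VM_INSTANTFIT (every block there is guaranteed to be big enough)
 * and freelist[2] under VM_BESTFIT.
 */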
427
428 static struct vmem_freelist *
429 bt_freehead_tofree(vmem_t *vm, vmem_size_t size)
430 {
431 const vmem_size_t qsize = size >> vm->vm_quantum_shift;
432 const int idx = SIZE2ORDER(qsize);
433
434 KASSERT(size != 0);
435 KASSERT(qsize != 0);
436 KASSERT((size & vm->vm_quantum_mask) == 0);
437 KASSERT(idx >= 0);
438 KASSERT(idx < VMEM_MAXORDER);
439
440 return &vm->vm_freelist[idx];
441 }
442
443 /*
444 * bt_freehead_toalloc: return the freelist for the given size and allocation
445 * strategy.
446 *
447 * for VM_INSTANTFIT, return the list in which any blocks are large enough
448 * for the requested size. otherwise, return the list which can have blocks
449 * large enough for the requested size.
450 */
451
452 static struct vmem_freelist *
453 bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, vm_flag_t strat)
454 {
455 const vmem_size_t qsize = size >> vm->vm_quantum_shift;
456 int idx = SIZE2ORDER(qsize);
457
458 KASSERT(size != 0);
459 KASSERT(qsize != 0);
460 KASSERT((size & vm->vm_quantum_mask) == 0);
461
462 if (strat == VM_INSTANTFIT && ORDER2SIZE(idx) != qsize) {
463 idx++;
464 /* check too large request? */
465 }
466 KASSERT(idx >= 0);
467 KASSERT(idx < VMEM_MAXORDER);
468
469 return &vm->vm_freelist[idx];
470 }
471
472 /* ---- boundary tag hash */
473
474 static struct vmem_hashlist *
475 bt_hashhead(vmem_t *vm, vmem_addr_t addr)
476 {
477 struct vmem_hashlist *list;
478 unsigned int hash;
479
480 hash = hash32_buf(&addr, sizeof(addr), HASH32_BUF_INIT);
481 list = &vm->vm_hashlist[hash & vm->vm_hashmask];
482
483 return list;
484 }
485
486 static bt_t *
487 bt_lookupbusy(vmem_t *vm, vmem_addr_t addr)
488 {
489 struct vmem_hashlist *list;
490 bt_t *bt;
491
492 list = bt_hashhead(vm, addr);
493 LIST_FOREACH(bt, list, bt_hashlist) {
494 if (bt->bt_start == addr) {
495 break;
496 }
497 }
498
499 return bt;
500 }
501
502 static void
503 bt_rembusy(vmem_t *vm, bt_t *bt)
504 {
505
506 KASSERT(vm->vm_nbusytag > 0);
507 vm->vm_inuse -= bt->bt_size;
508 vm->vm_nbusytag--;
509 LIST_REMOVE(bt, bt_hashlist);
510 }
511
512 static void
513 bt_insbusy(vmem_t *vm, bt_t *bt)
514 {
515 struct vmem_hashlist *list;
516
517 KASSERT(bt->bt_type == BT_TYPE_BUSY);
518
519 list = bt_hashhead(vm, bt->bt_start);
520 LIST_INSERT_HEAD(list, bt, bt_hashlist);
521 if (++vm->vm_nbusytag > vm->vm_maxbusytag) {
522 vm->vm_maxbusytag = vm->vm_nbusytag;
523 }
524 vm->vm_inuse += bt->bt_size;
525 }
526
527 /* ---- boundary tag list */
528
529 static void
530 bt_remseg(vmem_t *vm, bt_t *bt)
531 {
532
533 TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist);
534 }
535
536 static void
537 bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev)
538 {
539
540 TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist);
541 }
542
543 static void
544 bt_insseg_tail(vmem_t *vm, bt_t *bt)
545 {
546
547 TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist);
548 }
549
550 static void
551 bt_remfree(vmem_t *vm, bt_t *bt)
552 {
553
554 KASSERT(bt->bt_type == BT_TYPE_FREE);
555
556 LIST_REMOVE(bt, bt_freelist);
557 }
558
559 static void
560 bt_insfree(vmem_t *vm, bt_t *bt)
561 {
562 struct vmem_freelist *list;
563
564 list = bt_freehead_tofree(vm, bt->bt_size);
565 LIST_INSERT_HEAD(list, bt, bt_freelist);
566 }
567
568 /* ---- vmem internal functions */
569
570 #if defined(QCACHE)
571 static inline vm_flag_t
572 prf_to_vmf(int prflags)
573 {
574 vm_flag_t vmflags;
575
576 KASSERT((prflags & ~(PR_LIMITFAIL | PR_WAITOK | PR_NOWAIT)) == 0);
577 if ((prflags & PR_WAITOK) != 0) {
578 vmflags = VM_SLEEP;
579 } else {
580 vmflags = VM_NOSLEEP;
581 }
582 return vmflags;
583 }
584
585 static inline int
586 vmf_to_prf(vm_flag_t vmflags)
587 {
588 int prflags;
589
590 if ((vmflags & VM_SLEEP) != 0) {
591 prflags = PR_WAITOK;
592 } else {
593 prflags = PR_NOWAIT;
594 }
595 return prflags;
596 }
597
598 static size_t
599 qc_poolpage_size(size_t qcache_max)
600 {
601 int i;
602
603 for (i = 0; ORDER2SIZE(i) <= qcache_max * 3; i++) {
604 /* nothing */
605 }
606 return ORDER2SIZE(i);
607 }
608
609 static void *
610 qc_poolpage_alloc(struct pool *pool, int prflags)
611 {
612 qcache_t *qc = QC_POOL_TO_QCACHE(pool);
613 vmem_t *vm = qc->qc_vmem;
614 vmem_addr_t addr;
615
616 if (vmem_alloc(vm, pool->pr_alloc->pa_pagesz,
617 prf_to_vmf(prflags) | VM_INSTANTFIT, &addr) != 0)
618 return NULL;
619 return (void *)addr;
620 }
621
622 static void
623 qc_poolpage_free(struct pool *pool, void *addr)
624 {
625 qcache_t *qc = QC_POOL_TO_QCACHE(pool);
626 vmem_t *vm = qc->qc_vmem;
627
628 vmem_free(vm, (vmem_addr_t)addr, pool->pr_alloc->pa_pagesz);
629 }
630
631 static void
632 qc_init(vmem_t *vm, size_t qcache_max, int ipl)
633 {
634 qcache_t *prevqc;
635 struct pool_allocator *pa;
636 int qcache_idx_max;
637 int i;
638
639 KASSERT((qcache_max & vm->vm_quantum_mask) == 0);
640 if (qcache_max > (VMEM_QCACHE_IDX_MAX << vm->vm_quantum_shift)) {
641 qcache_max = VMEM_QCACHE_IDX_MAX << vm->vm_quantum_shift;
642 }
643 vm->vm_qcache_max = qcache_max;
644 pa = &vm->vm_qcache_allocator;
645 memset(pa, 0, sizeof(*pa));
646 pa->pa_alloc = qc_poolpage_alloc;
647 pa->pa_free = qc_poolpage_free;
648 pa->pa_pagesz = qc_poolpage_size(qcache_max);
649
650 qcache_idx_max = qcache_max >> vm->vm_quantum_shift;
651 prevqc = NULL;
652 for (i = qcache_idx_max; i > 0; i--) {
653 qcache_t *qc = &vm->vm_qcache_store[i - 1];
654 size_t size = i << vm->vm_quantum_shift;
655 pool_cache_t pc;
656
657 qc->qc_vmem = vm;
658 snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu",
659 vm->vm_name, size);
660
661 pc = pool_cache_init(size,
662 ORDER2SIZE(vm->vm_quantum_shift), 0,
663 PR_NOALIGN | PR_NOTOUCH | PR_RECURSIVE /* XXX */,
664 qc->qc_name, pa, ipl, NULL, NULL, NULL);
665
666 KASSERT(pc);
667
668 qc->qc_cache = pc;
669 KASSERT(qc->qc_cache != NULL); /* XXX */
670 if (prevqc != NULL &&
671 qc->qc_cache->pc_pool.pr_itemsperpage ==
672 prevqc->qc_cache->pc_pool.pr_itemsperpage) {
673 pool_cache_destroy(qc->qc_cache);
674 vm->vm_qcache[i - 1] = prevqc;
675 continue;
676 }
677 qc->qc_cache->pc_pool.pr_qcache = qc;
678 vm->vm_qcache[i - 1] = qc;
679 prevqc = qc;
680 }
681 }
682
683 static void
684 qc_destroy(vmem_t *vm)
685 {
686 const qcache_t *prevqc;
687 int i;
688 int qcache_idx_max;
689
690 qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
691 prevqc = NULL;
692 for (i = 0; i < qcache_idx_max; i++) {
693 qcache_t *qc = vm->vm_qcache[i];
694
695 if (prevqc == qc) {
696 continue;
697 }
698 pool_cache_destroy(qc->qc_cache);
699 prevqc = qc;
700 }
701 }
702 #endif
703
704 #if defined(_KERNEL)
705 static void
706 vmem_bootstrap(void)
707 {
708
709 mutex_init(&vmem_list_lock, MUTEX_DEFAULT, IPL_NONE);
710 mutex_init(&vmem_btag_lock, MUTEX_DEFAULT, IPL_VM);
711 mutex_init(&vmem_btag_refill_lock, MUTEX_DEFAULT, IPL_VM);
712
713 while (static_bt_count-- > 0) {
714 bt_t *bt = &static_bts[static_bt_count];
715 LIST_INSERT_HEAD(&vmem_btag_freelist, bt, bt_freelist);
716 VMEM_EVCNT_INCR(static_bt_count);
717 vmem_btag_freelist_count++;
718 }
719 vmem_bootstrapped = TRUE;
720 }
721
722 void
723 vmem_subsystem_init(vmem_t *vm)
724 {
725
726 kmem_va_meta_arena = vmem_init(&kmem_va_meta_arena_store, "vmem-va",
727 0, 0, PAGE_SIZE, vmem_alloc, vmem_free, vm,
728 0, VM_NOSLEEP | VM_BOOTSTRAP | VM_LARGEIMPORT,
729 IPL_VM);
730
731 kmem_meta_arena = vmem_init(&kmem_meta_arena_store, "vmem-meta",
732 0, 0, PAGE_SIZE,
733 uvm_km_kmem_alloc, uvm_km_kmem_free, kmem_va_meta_arena,
734 0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);
735
736 pool_init(&vmem_btag_pool, sizeof(bt_t), coherency_unit, 0,
737 PR_PHINPAGE, "vmembt", &pool_allocator_vmem_meta, IPL_VM);
738 vmem_btag_pool_initialized = true;
739 }
740 #endif /* defined(_KERNEL) */
741
742 static int
743 vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags,
744 int spanbttype)
745 {
746 bt_t *btspan;
747 bt_t *btfree;
748
749 VMEM_ASSERT_LOCKED(vm);
750 KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
751 KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
752 KASSERT(spanbttype == BT_TYPE_SPAN ||
753 spanbttype == BT_TYPE_SPAN_STATIC);
754
755 btspan = bt_alloc(vm, flags);
756 if (btspan == NULL) {
757 return ENOMEM;
758 }
759 btfree = bt_alloc(vm, flags);
760 if (btfree == NULL) {
761 bt_free(vm, btspan);
762 return ENOMEM;
763 }
764
765 btspan->bt_type = spanbttype;
766 btspan->bt_start = addr;
767 btspan->bt_size = size;
768
769 btfree->bt_type = BT_TYPE_FREE;
770 btfree->bt_start = addr;
771 btfree->bt_size = size;
772
773 bt_insseg_tail(vm, btspan);
774 bt_insseg(vm, btfree, btspan);
775 bt_insfree(vm, btfree);
776 vm->vm_size += size;
777
778 return 0;
779 }
780
781 static void
782 vmem_destroy1(vmem_t *vm)
783 {
784
785 #if defined(QCACHE)
786 qc_destroy(vm);
787 #endif /* defined(QCACHE) */
788 VMEM_LOCK(vm);
789
790 for (int i = 0; i < vm->vm_hashsize; i++) {
791 bt_t *bt;
792
793 while ((bt = LIST_FIRST(&vm->vm_hashlist[i])) != NULL) {
794 KASSERT(bt->bt_type == BT_TYPE_SPAN_STATIC);
795 LIST_REMOVE(bt, bt_hashlist);
796 bt_free(vm, bt);
797 }
798 }
799
800 /* bt_freetrim() drops the lock. */
801 bt_freetrim(vm, 0);
802 if (vm->vm_hashlist != &vm->vm_hash0) {
803 xfree(vm->vm_hashlist,
804 sizeof(struct vmem_hashlist) * vm->vm_hashsize);
805 }
806
807 VMEM_CONDVAR_DESTROY(vm);
808 VMEM_LOCK_DESTROY(vm);
809 xfree(vm, sizeof(*vm));
810 }
811
812 static int
813 vmem_import(vmem_t *vm, vmem_size_t size, vm_flag_t flags)
814 {
815 vmem_addr_t addr;
816 int rc;
817
818 VMEM_ASSERT_LOCKED(vm);
819
820 if (vm->vm_importfn == NULL) {
821 return EINVAL;
822 }
823
824 if (vm->vm_flags & VM_LARGEIMPORT) {
825 size *= 16;
826 }
827
828 VMEM_UNLOCK(vm);
829 if (vm->vm_flags & VM_XIMPORT) {
830 rc = __FPTRCAST(vmem_ximport_t *, vm->vm_importfn)(vm->vm_arg,
831 size, &size, flags, &addr);
832 } else {
833 rc = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr);
834 }
835 VMEM_LOCK(vm);
836
837 if (rc) {
838 return ENOMEM;
839 }
840
841 if (vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN) != 0) {
842 VMEM_UNLOCK(vm);
843 (*vm->vm_releasefn)(vm->vm_arg, addr, size);
844 VMEM_LOCK(vm);
845 return ENOMEM;
846 }
847
848 return 0;
849 }
850
851 #if defined(_KERNEL)
852 static int
853 vmem_rehash(vmem_t *vm, size_t newhashsize, vm_flag_t flags)
854 {
855 bt_t *bt;
856 int i;
857 struct vmem_hashlist *newhashlist;
858 struct vmem_hashlist *oldhashlist;
859 size_t oldhashsize;
860
861 KASSERT(newhashsize > 0);
862
863 /* Round the hash size up to the next power of 2. */
864 newhashsize = 1 << (ilog2(newhashsize) + 1);
865
866 newhashlist =
867 xmalloc(sizeof(struct vmem_hashlist) * newhashsize, flags);
868 if (newhashlist == NULL) {
869 return ENOMEM;
870 }
871 for (i = 0; i < newhashsize; i++) {
872 LIST_INIT(&newhashlist[i]);
873 }
874
875 VMEM_LOCK(vm);
876 /* Decay back to a small hash slowly. */
877 if (vm->vm_maxbusytag >= 2) {
878 vm->vm_maxbusytag = vm->vm_maxbusytag / 2 - 1;
879 if (vm->vm_nbusytag > vm->vm_maxbusytag) {
880 vm->vm_maxbusytag = vm->vm_nbusytag;
881 }
882 } else {
883 vm->vm_maxbusytag = vm->vm_nbusytag;
884 }
885 oldhashlist = vm->vm_hashlist;
886 oldhashsize = vm->vm_hashsize;
887 vm->vm_hashlist = newhashlist;
888 vm->vm_hashsize = newhashsize;
889 vm->vm_hashmask = newhashsize - 1;
890 if (oldhashlist == NULL) {
891 VMEM_UNLOCK(vm);
892 return 0;
893 }
894 for (i = 0; i < oldhashsize; i++) {
895 while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) {
896 bt_rembusy(vm, bt); /* XXX */
897 bt_insbusy(vm, bt);
898 }
899 }
900 VMEM_UNLOCK(vm);
901
902 if (oldhashlist != &vm->vm_hash0) {
903 xfree(oldhashlist,
904 sizeof(struct vmem_hashlist) * oldhashsize);
905 }
906
907 return 0;
908 }
909 #endif /* _KERNEL */
910
911 /*
912 * vmem_fit: check if a bt can satisfy the given restrictions.
913 *
914 * it's a caller's responsibility to ensure the region is big enough
915 * before calling us.
916 */
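/*
 * For example, given a free tag covering [0x1234, 0x2000) and a request
 * with size 0x100, align 0x100, phase 0, nocross 0 and no address limits,
 * start becomes VMEM_ALIGNUP(0x1234, 0x100) == 0x1300; the range
 * [0x1300, 0x1400) still lies inside the tag, so vmem_fit() succeeds
 * with *addrp == 0x1300.
 */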
917
918 static int
919 vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align,
920 vmem_size_t phase, vmem_size_t nocross,
921 vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp)
922 {
923 vmem_addr_t start;
924 vmem_addr_t end;
925
926 KASSERT(size > 0);
927 KASSERT(bt->bt_size >= size); /* caller's responsibility */
928
929 /*
930 * XXX assumption: vmem_addr_t and vmem_size_t are
931 * unsigned integer of the same size.
932 */
933
934 start = bt->bt_start;
935 if (start < minaddr) {
936 start = minaddr;
937 }
938 end = BT_END(bt);
939 if (end > maxaddr) {
940 end = maxaddr;
941 }
942 if (start > end) {
943 return ENOMEM;
944 }
945
946 start = VMEM_ALIGNUP(start - phase, align) + phase;
947 if (start < bt->bt_start) {
948 start += align;
949 }
950 if (VMEM_CROSS_P(start, start + size - 1, nocross)) {
951 KASSERT(align < nocross);
952 start = VMEM_ALIGNUP(start - phase, nocross) + phase;
953 }
954 if (start <= end && end - start >= size - 1) {
955 KASSERT((start & (align - 1)) == phase);
956 KASSERT(!VMEM_CROSS_P(start, start + size - 1, nocross));
957 KASSERT(minaddr <= start);
958 KASSERT(maxaddr == 0 || start + size - 1 <= maxaddr);
959 KASSERT(bt->bt_start <= start);
960 KASSERT(BT_END(bt) - start >= size - 1);
961 *addrp = start;
962 return 0;
963 }
964 return ENOMEM;
965 }
966
967 /* ---- vmem API */
968
969 /*
970 * vmem_init: creates a vmem arena.
971 */
972
973 vmem_t *
974 vmem_init(vmem_t *vm, const char *name,
975 vmem_addr_t base, vmem_size_t size, vmem_size_t quantum,
976 vmem_import_t *importfn, vmem_release_t *releasefn,
977 vmem_t *arg, vmem_size_t qcache_max, vm_flag_t flags, int ipl)
978 {
979 int i;
980
981 KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
982 KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
983 KASSERT(quantum > 0);
984 KASSERT(powerof2(quantum));
985
986 /*
987 * If private tags are going to be used, they must
988 * be added to the arena before the first span is
989 * added.
990 */
991 KASSERT((flags & VM_PRIVTAGS) == 0 || size == 0);
992
993 #if defined(_KERNEL)
994 /* XXX: SMP, we get called early... */
995 if (!vmem_bootstrapped) {
996 vmem_bootstrap();
997 }
998 #endif /* defined(_KERNEL) */
999
1000 if (vm == NULL) {
1001 vm = xmalloc(sizeof(*vm), flags);
1002 }
1003 if (vm == NULL) {
1004 return NULL;
1005 }
1006
1007 VMEM_CONDVAR_INIT(vm, "vmem");
1008 VMEM_LOCK_INIT(vm, ipl);
1009 vm->vm_flags = flags;
1010 vm->vm_nfreetags = 0;
1011 LIST_INIT(&vm->vm_freetags);
1012 strlcpy(vm->vm_name, name, sizeof(vm->vm_name));
1013 vm->vm_quantum_mask = quantum - 1;
1014 vm->vm_quantum_shift = SIZE2ORDER(quantum);
1015 KASSERT(ORDER2SIZE(vm->vm_quantum_shift) == quantum);
1016 vm->vm_importfn = importfn;
1017 vm->vm_releasefn = releasefn;
1018 vm->vm_arg = arg;
1019 vm->vm_nbusytag = 0;
1020 vm->vm_maxbusytag = 0;
1021 vm->vm_size = 0;
1022 vm->vm_inuse = 0;
1023 #if defined(QCACHE)
1024 qc_init(vm, qcache_max, ipl);
1025 #endif /* defined(QCACHE) */
1026
1027 TAILQ_INIT(&vm->vm_seglist);
1028 for (i = 0; i < VMEM_MAXORDER; i++) {
1029 LIST_INIT(&vm->vm_freelist[i]);
1030 }
1031 memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0));
1032 vm->vm_hashsize = 1;
1033 vm->vm_hashmask = vm->vm_hashsize - 1;
1034 vm->vm_hashlist = &vm->vm_hash0;
1035
1036 if (size != 0) {
1037 if (vmem_add(vm, base, size, flags) != 0) {
1038 vmem_destroy1(vm);
1039 return NULL;
1040 }
1041 }
1042
1043 #if defined(_KERNEL)
1044 if (flags & VM_BOOTSTRAP) {
1045 bt_refill(vm);
1046 }
1047
1048 mutex_enter(&vmem_list_lock);
1049 LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist);
1050 mutex_exit(&vmem_list_lock);
1051 #endif /* defined(_KERNEL) */
1052
1053 return vm;
1054 }
1055
1056
1057
1058 /*
1059 * vmem_create: create an arena.
1060 *
1061 * => must not be called from interrupt context.
1062 */
1063
1064 vmem_t *
1065 vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
1066 vmem_size_t quantum, vmem_import_t *importfn, vmem_release_t *releasefn,
1067 vmem_t *source, vmem_size_t qcache_max, vm_flag_t flags, int ipl)
1068 {
1069
1070 KASSERT((flags & (VM_XIMPORT)) == 0);
1071
1072 return vmem_init(NULL, name, base, size, quantum,
1073 importfn, releasefn, source, qcache_max, flags, ipl);
1074 }
1075
1076 /*
1077 * vmem_xcreate: create an arena that uses an alternative import function.
1078 *
1079 * => must not be called from interrupt context.
1080 */
1081
1082 vmem_t *
1083 vmem_xcreate(const char *name, vmem_addr_t base, vmem_size_t size,
1084 vmem_size_t quantum, vmem_ximport_t *importfn, vmem_release_t *releasefn,
1085 vmem_t *source, vmem_size_t qcache_max, vm_flag_t flags, int ipl)
1086 {
1087
1088 KASSERT((flags & (VM_XIMPORT)) == 0);
1089
1090 return vmem_init(NULL, name, base, size, quantum,
1091 __FPTRCAST(vmem_import_t *, importfn), releasefn, source,
1092 qcache_max, flags | VM_XIMPORT, ipl);
1093 }
1094
1095 void
1096 vmem_destroy(vmem_t *vm)
1097 {
1098
1099 #if defined(_KERNEL)
1100 mutex_enter(&vmem_list_lock);
1101 LIST_REMOVE(vm, vm_alllist);
1102 mutex_exit(&vmem_list_lock);
1103 #endif /* defined(_KERNEL) */
1104
1105 vmem_destroy1(vm);
1106 }
1107
1108 vmem_size_t
1109 vmem_roundup_size(vmem_t *vm, vmem_size_t size)
1110 {
1111
1112 return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask;
1113 }
1114
1115 /*
1116 * vmem_alloc: allocate resource from the arena.
1117 */
1118
1119 int
1120 vmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags, vmem_addr_t *addrp)
1121 {
1122 const vm_flag_t strat __diagused = flags & VM_FITMASK;
1123 int error;
1124
1125 KASSERT((flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
1126 KASSERT((~flags & (VM_SLEEP|VM_NOSLEEP)) != 0);
1127
1128 KASSERT(size > 0);
1129 KASSERT(strat == VM_BESTFIT || strat == VM_INSTANTFIT);
1130 if ((flags & VM_SLEEP) != 0) {
1131 ASSERT_SLEEPABLE();
1132 }
1133
1134 #if defined(QCACHE)
1135 if (size <= vm->vm_qcache_max) {
1136 void *p;
1137 int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift;
1138 qcache_t *qc = vm->vm_qcache[qidx - 1];
1139
1140 p = pool_cache_get(qc->qc_cache, vmf_to_prf(flags));
1141 if (addrp != NULL)
1142 *addrp = (vmem_addr_t)p;
1143 error = (p == NULL) ? ENOMEM : 0;
1144 goto out;
1145 }
1146 #endif /* defined(QCACHE) */
1147
1148 error = vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
1149 flags, addrp);
1150 #if defined(QCACHE)
1151 out:
1152 #endif /* defined(QCACHE) */
1153 KASSERTMSG(error || addrp == NULL ||
1154 (*addrp & vm->vm_quantum_mask) == 0,
1155 "vmem %s mask=0x%jx addr=0x%jx",
1156 vm->vm_name, (uintmax_t)vm->vm_quantum_mask, (uintmax_t)*addrp);
1157 KASSERT(error == 0 || (flags & VM_SLEEP) == 0);
1158 return error;
1159 }
1160
1161 int
1162 vmem_xalloc_addr(vmem_t *vm, const vmem_addr_t addr, const vmem_size_t size,
1163 vm_flag_t flags)
1164 {
1165 vmem_addr_t result;
1166 int error;
1167
1168 KASSERT((addr & vm->vm_quantum_mask) == 0);
1169 KASSERT(size != 0);
1170
1171 flags = (flags & ~VM_INSTANTFIT) | VM_BESTFIT;
1172
1173 error = vmem_xalloc(vm, size, 0, 0, 0, addr, addr + size - 1,
1174 flags, &result);
1175
1176 KASSERT(error || result == addr);
1177 KASSERT(error == 0 || (flags & VM_SLEEP) == 0);
1178 return error;
1179 }
1180
1181 int
1182 vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align,
1183 const vmem_size_t phase, const vmem_size_t nocross,
1184 const vmem_addr_t minaddr, const vmem_addr_t maxaddr, const vm_flag_t flags,
1185 vmem_addr_t *addrp)
1186 {
1187 struct vmem_freelist *list;
1188 struct vmem_freelist *first;
1189 struct vmem_freelist *end;
1190 bt_t *bt;
1191 bt_t *btnew;
1192 bt_t *btnew2;
1193 const vmem_size_t size = vmem_roundup_size(vm, size0);
1194 vm_flag_t strat = flags & VM_FITMASK;
1195 vmem_addr_t start;
1196 int rc;
1197
1198 KASSERT(size0 > 0);
1199 KASSERT(size > 0);
1200 KASSERT(strat == VM_BESTFIT || strat == VM_INSTANTFIT);
1201 if ((flags & VM_SLEEP) != 0) {
1202 ASSERT_SLEEPABLE();
1203 }
1204 KASSERT((align & vm->vm_quantum_mask) == 0);
1205 KASSERT((align & (align - 1)) == 0);
1206 KASSERT((phase & vm->vm_quantum_mask) == 0);
1207 KASSERT((nocross & vm->vm_quantum_mask) == 0);
1208 KASSERT((nocross & (nocross - 1)) == 0);
1209 KASSERT(align == 0 || phase < align);
1210 KASSERT(phase == 0 || phase < align);
1211 KASSERT(nocross == 0 || nocross >= size);
1212 KASSERT(minaddr <= maxaddr);
1213 KASSERT(!VMEM_CROSS_P(phase, phase + size - 1, nocross));
1214
1215 if (align == 0) {
1216 align = vm->vm_quantum_mask + 1;
1217 }
1218
1219 /*
1220 * allocate boundary tags before acquiring the vmem lock.
1221 */
1222 VMEM_LOCK(vm);
1223 btnew = bt_alloc(vm, flags);
1224 if (btnew == NULL) {
1225 VMEM_UNLOCK(vm);
1226 return ENOMEM;
1227 }
1228 btnew2 = bt_alloc(vm, flags); /* XXX not necessary if no restrictions */
1229 if (btnew2 == NULL) {
1230 bt_free(vm, btnew);
1231 VMEM_UNLOCK(vm);
1232 return ENOMEM;
1233 }
1234
1235 /*
1236 * choose a free block from which we allocate.
1237 */
1238 retry_strat:
1239 first = bt_freehead_toalloc(vm, size, strat);
1240 end = &vm->vm_freelist[VMEM_MAXORDER];
1241 retry:
1242 bt = NULL;
1243 vmem_check(vm);
1244 if (strat == VM_INSTANTFIT) {
1245 /*
1246 * just choose the first block which satisfies our restrictions.
1247 *
1248 * note that we don't need to check the size of the blocks
1249 * because any block found on these lists is at least as large as
1250 * the given size.
1251 */
1252 for (list = first; list < end; list++) {
1253 bt = LIST_FIRST(list);
1254 if (bt != NULL) {
1255 rc = vmem_fit(bt, size, align, phase,
1256 nocross, minaddr, maxaddr, &start);
1257 if (rc == 0) {
1258 goto gotit;
1259 }
1260 /*
1261 * don't bother to follow the bt_freelist link
1262 * here. the list can be very long and we are
1263 * told to run fast. blocks from the later free
1264 * lists are larger and have better chances to
1265 * satisfy our restrictions.
1266 */
1267 }
1268 }
1269 } else { /* VM_BESTFIT */
1270 /*
1271 * we assume that, for space efficiency, it's better to
1272 * allocate from a smaller block. thus we will start searching
1273 * from the lower-order list than VM_INSTANTFIT.
1274 * however, don't bother to find the smallest block in a free
1275 * list because the list can be very long. we can revisit it
1276 * if/when it turns out to be a problem.
1277 *
1278 * note that the 'first' list can contain blocks smaller than
1279 * the requested size. thus we need to check bt_size.
1280 */
1281 for (list = first; list < end; list++) {
1282 LIST_FOREACH(bt, list, bt_freelist) {
1283 if (bt->bt_size >= size) {
1284 rc = vmem_fit(bt, size, align, phase,
1285 nocross, minaddr, maxaddr, &start);
1286 if (rc == 0) {
1287 goto gotit;
1288 }
1289 }
1290 }
1291 }
1292 }
1293 #if 1
1294 if (strat == VM_INSTANTFIT) {
1295 strat = VM_BESTFIT;
1296 goto retry_strat;
1297 }
1298 #endif
1299 if (align != vm->vm_quantum_mask + 1 || phase != 0 || nocross != 0) {
1300
1301 /*
1302 * XXX should try to import a region large enough to
1303 * satisfy restrictions?
1304 */
1305
1306 goto fail;
1307 }
1308 /* XXX eeek, minaddr & maxaddr not respected */
1309 if (vmem_import(vm, size, flags) == 0) {
1310 goto retry;
1311 }
1312 /* XXX */
1313
1314 if ((flags & VM_SLEEP) != 0) {
1315 vmem_kick_pdaemon();
1316 VMEM_CONDVAR_WAIT(vm);
1317 goto retry;
1318 }
1319 fail:
1320 bt_free(vm, btnew);
1321 bt_free(vm, btnew2);
1322 VMEM_UNLOCK(vm);
1323 return ENOMEM;
1324
1325 gotit:
1326 KASSERT(bt->bt_type == BT_TYPE_FREE);
1327 KASSERT(bt->bt_size >= size);
1328 bt_remfree(vm, bt);
1329 vmem_check(vm);
1330 if (bt->bt_start != start) {
1331 btnew2->bt_type = BT_TYPE_FREE;
1332 btnew2->bt_start = bt->bt_start;
1333 btnew2->bt_size = start - bt->bt_start;
1334 bt->bt_start = start;
1335 bt->bt_size -= btnew2->bt_size;
1336 bt_insfree(vm, btnew2);
1337 bt_insseg(vm, btnew2, TAILQ_PREV(bt, vmem_seglist, bt_seglist));
1338 btnew2 = NULL;
1339 vmem_check(vm);
1340 }
1341 KASSERT(bt->bt_start == start);
1342 if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) {
1343 /* split */
1344 btnew->bt_type = BT_TYPE_BUSY;
1345 btnew->bt_start = bt->bt_start;
1346 btnew->bt_size = size;
1347 bt->bt_start = bt->bt_start + size;
1348 bt->bt_size -= size;
1349 bt_insfree(vm, bt);
1350 bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist));
1351 bt_insbusy(vm, btnew);
1352 vmem_check(vm);
1353 } else {
1354 bt->bt_type = BT_TYPE_BUSY;
1355 bt_insbusy(vm, bt);
1356 vmem_check(vm);
1357 bt_free(vm, btnew);
1358 btnew = bt;
1359 }
1360 if (btnew2 != NULL) {
1361 bt_free(vm, btnew2);
1362 }
1363 KASSERT(btnew->bt_size >= size);
1364 btnew->bt_type = BT_TYPE_BUSY;
1365 if (addrp != NULL)
1366 *addrp = btnew->bt_start;
1367 VMEM_UNLOCK(vm);
1368 KASSERTMSG(addrp == NULL ||
1369 (*addrp & vm->vm_quantum_mask) == 0,
1370 "vmem %s mask=0x%jx addr=0x%jx",
1371 vm->vm_name, (uintmax_t)vm->vm_quantum_mask, (uintmax_t)*addrp);
1372 return 0;
1373 }
1374
1375 /*
1376 * vmem_free: free the resource to the arena.
1377 */
1378
1379 void
1380 vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
1381 {
1382
1383 KASSERT(size > 0);
1384 KASSERTMSG((addr & vm->vm_quantum_mask) == 0,
1385 "vmem %s mask=0x%jx addr=0x%jx",
1386 vm->vm_name, (uintmax_t)vm->vm_quantum_mask, (uintmax_t)addr);
1387
1388 #if defined(QCACHE)
1389 if (size <= vm->vm_qcache_max) {
1390 int qidx = (size + vm->vm_quantum_mask) >> vm->vm_quantum_shift;
1391 qcache_t *qc = vm->vm_qcache[qidx - 1];
1392
1393 pool_cache_put(qc->qc_cache, (void *)addr);
1394 return;
1395 }
1396 #endif /* defined(QCACHE) */
1397
1398 vmem_xfree(vm, addr, size);
1399 }
1400
1401 void
1402 vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
1403 {
1404 bt_t *bt;
1405
1406 KASSERT(size > 0);
1407 KASSERTMSG((addr & vm->vm_quantum_mask) == 0,
1408 "vmem %s mask=0x%jx addr=0x%jx",
1409 vm->vm_name, (uintmax_t)vm->vm_quantum_mask, (uintmax_t)addr);
1410
1411 VMEM_LOCK(vm);
1412
1413 bt = bt_lookupbusy(vm, addr);
1414 KASSERTMSG(bt != NULL, "vmem %s addr 0x%jx size 0x%jx",
1415 vm->vm_name, (uintmax_t)addr, (uintmax_t)size);
1416 KASSERT(bt->bt_start == addr);
1417 KASSERT(bt->bt_size == vmem_roundup_size(vm, size) ||
1418 bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask);
1419
1420 /* vmem_xfree_bt() drops the lock. */
1421 vmem_xfree_bt(vm, bt);
1422 }
1423
1424 void
1425 vmem_xfreeall(vmem_t *vm)
1426 {
1427 bt_t *bt;
1428
1429 #if defined(QCACHE)
1430 /* This can't be used if the arena has a quantum cache. */
1431 KASSERT(vm->vm_qcache_max == 0);
1432 #endif /* defined(QCACHE) */
1433
1434 for (;;) {
1435 VMEM_LOCK(vm);
1436 TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
1437 if (bt->bt_type == BT_TYPE_BUSY)
1438 break;
1439 }
1440 if (bt != NULL) {
1441 /* vmem_xfree_bt() drops the lock. */
1442 vmem_xfree_bt(vm, bt);
1443 } else {
1444 VMEM_UNLOCK(vm);
1445 return;
1446 }
1447 }
1448 }
1449
1450 static void
1451 vmem_xfree_bt(vmem_t *vm, bt_t *bt)
1452 {
1453 bt_t *t;
1454
1455 VMEM_ASSERT_LOCKED(vm);
1456
1457 KASSERT(bt->bt_type == BT_TYPE_BUSY);
1458 bt_rembusy(vm, bt);
1459 bt->bt_type = BT_TYPE_FREE;
1460
1461 /* coalesce */
1462 t = TAILQ_NEXT(bt, bt_seglist);
1463 if (t != NULL && t->bt_type == BT_TYPE_FREE) {
1464 KASSERT(BT_END(bt) < t->bt_start); /* YYY */
1465 bt_remfree(vm, t);
1466 bt_remseg(vm, t);
1467 bt->bt_size += t->bt_size;
1468 bt_free(vm, t);
1469 }
1470 t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
1471 if (t != NULL && t->bt_type == BT_TYPE_FREE) {
1472 KASSERT(BT_END(t) < bt->bt_start); /* YYY */
1473 bt_remfree(vm, t);
1474 bt_remseg(vm, t);
1475 bt->bt_size += t->bt_size;
1476 bt->bt_start = t->bt_start;
1477 bt_free(vm, t);
1478 }
1479
1480 t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
1481 KASSERT(t != NULL);
1482 KASSERT(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY);
1483 if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN &&
1484 t->bt_size == bt->bt_size) {
1485 vmem_addr_t spanaddr;
1486 vmem_size_t spansize;
1487
1488 KASSERT(t->bt_start == bt->bt_start);
1489 spanaddr = bt->bt_start;
1490 spansize = bt->bt_size;
1491 bt_remseg(vm, bt);
1492 bt_free(vm, bt);
1493 bt_remseg(vm, t);
1494 bt_free(vm, t);
1495 vm->vm_size -= spansize;
1496 VMEM_CONDVAR_BROADCAST(vm);
1497 /* bt_freetrim() drops the lock. */
1498 bt_freetrim(vm, BT_MAXFREE);
1499 (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize);
1500 } else {
1501 bt_insfree(vm, bt);
1502 VMEM_CONDVAR_BROADCAST(vm);
1503 /* bt_freetrim() drops the lock. */
1504 bt_freetrim(vm, BT_MAXFREE);
1505 }
1506 }
1507
1508 /*
1509 * vmem_add:
1510 *
1511 * => caller must ensure appropriate spl,
1512 * if the arena can be accessed from interrupt context.
1513 */
1514
1515 int
1516 vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, vm_flag_t flags)
1517 {
1518 int rv;
1519
1520 VMEM_LOCK(vm);
1521 rv = vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN_STATIC);
1522 VMEM_UNLOCK(vm);
1523
1524 return rv;
1525 }
1526
1527 /*
1528 * vmem_size: information about an arena's size
1529 *
1530 * => return the free and/or allocated size of the arena
1531 */
1532 vmem_size_t
1533 vmem_size(vmem_t *vm, int typemask)
1534 {
1535
1536 switch (typemask) {
1537 case VMEM_ALLOC:
1538 return vm->vm_inuse;
1539 case VMEM_FREE:
1540 return vm->vm_size - vm->vm_inuse;
1541 case VMEM_FREE|VMEM_ALLOC:
1542 return vm->vm_size;
1543 default:
1544 panic("vmem_size");
1545 }
1546 }
1547
1548 /* ---- rehash */
1549
1550 #if defined(_KERNEL)
1551 static struct callout vmem_rehash_ch;
1552 static int vmem_rehash_interval;
1553 static struct workqueue *vmem_rehash_wq;
1554 static struct work vmem_rehash_wk;
1555
1556 static void
1557 vmem_rehash_all(struct work *wk, void *dummy)
1558 {
1559 vmem_t *vm;
1560
1561 KASSERT(wk == &vmem_rehash_wk);
1562 mutex_enter(&vmem_list_lock);
1563 LIST_FOREACH(vm, &vmem_list, vm_alllist) {
1564 size_t desired;
1565 size_t current;
1566
1567 desired = atomic_load_relaxed(&vm->vm_maxbusytag);
1568 current = atomic_load_relaxed(&vm->vm_hashsize);
1569
1570 if (desired > VMEM_HASHSIZE_MAX) {
1571 desired = VMEM_HASHSIZE_MAX;
1572 } else if (desired < VMEM_HASHSIZE_MIN) {
1573 desired = VMEM_HASHSIZE_MIN;
1574 }
1575 if (desired > current * 2 || desired * 2 < current) {
1576 vmem_rehash(vm, desired, VM_NOSLEEP);
1577 }
1578 }
1579 mutex_exit(&vmem_list_lock);
1580
1581 callout_schedule(&vmem_rehash_ch, vmem_rehash_interval);
1582 }
1583
1584 static void
1585 vmem_rehash_all_kick(void *dummy)
1586 {
1587
1588 workqueue_enqueue(vmem_rehash_wq, &vmem_rehash_wk, NULL);
1589 }
1590
1591 void
1592 vmem_rehash_start(void)
1593 {
1594 int error;
1595
1596 error = workqueue_create(&vmem_rehash_wq, "vmem_rehash",
1597 vmem_rehash_all, NULL, PRI_VM, IPL_SOFTCLOCK, WQ_MPSAFE);
1598 if (error) {
1599 panic("%s: workqueue_create %d\n", __func__, error);
1600 }
1601 callout_init(&vmem_rehash_ch, CALLOUT_MPSAFE);
1602 callout_setfunc(&vmem_rehash_ch, vmem_rehash_all_kick, NULL);
1603
1604 vmem_rehash_interval = hz * 10;
1605 callout_schedule(&vmem_rehash_ch, vmem_rehash_interval);
1606 }
1607 #endif /* defined(_KERNEL) */
1608
1609 /* ---- debug */
1610
1611 #if defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY)
1612
1613 static void bt_dump(const bt_t *, void (*)(const char *, ...)
1614 __printflike(1, 2));
1615
1616 static const char *
1617 bt_type_string(int type)
1618 {
1619 static const char * const table[] = {
1620 [BT_TYPE_BUSY] = "busy",
1621 [BT_TYPE_FREE] = "free",
1622 [BT_TYPE_SPAN] = "span",
1623 [BT_TYPE_SPAN_STATIC] = "static span",
1624 };
1625
1626 if (type >= __arraycount(table)) {
1627 return "BOGUS";
1628 }
1629 return table[type];
1630 }
1631
1632 static void
1633 bt_dump(const bt_t *bt, void (*pr)(const char *, ...))
1634 {
1635
1636 (*pr)("\t%p: %" PRIu64 ", %" PRIu64 ", %d(%s)\n",
1637 bt, (uint64_t)bt->bt_start, (uint64_t)bt->bt_size,
1638 bt->bt_type, bt_type_string(bt->bt_type));
1639 }
1640
1641 static void
1642 vmem_dump(const vmem_t *vm , void (*pr)(const char *, ...) __printflike(1, 2))
1643 {
1644 const bt_t *bt;
1645 int i;
1646
1647 (*pr)("vmem %p '%s'\n", vm, vm->vm_name);
1648 TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
1649 bt_dump(bt, pr);
1650 }
1651
1652 for (i = 0; i < VMEM_MAXORDER; i++) {
1653 const struct vmem_freelist *fl = &vm->vm_freelist[i];
1654
1655 if (LIST_EMPTY(fl)) {
1656 continue;
1657 }
1658
1659 (*pr)("freelist[%d]\n", i);
1660 LIST_FOREACH(bt, fl, bt_freelist) {
1661 bt_dump(bt, pr);
1662 }
1663 }
1664 }
1665
1666 #endif /* defined(DDB) || defined(UNITTEST) || defined(VMEM_SANITY) */
1667
1668 #if defined(DDB)
1669 static bt_t *
1670 vmem_whatis_lookup(vmem_t *vm, uintptr_t addr)
1671 {
1672 bt_t *bt;
1673
1674 TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
1675 if (BT_ISSPAN_P(bt)) {
1676 continue;
1677 }
1678 if (bt->bt_start <= addr && addr <= BT_END(bt)) {
1679 return bt;
1680 }
1681 }
1682
1683 return NULL;
1684 }
1685
1686 void
1687 vmem_whatis(uintptr_t addr, void (*pr)(const char *, ...))
1688 {
1689 vmem_t *vm;
1690
1691 LIST_FOREACH(vm, &vmem_list, vm_alllist) {
1692 bt_t *bt;
1693
1694 bt = vmem_whatis_lookup(vm, addr);
1695 if (bt == NULL) {
1696 continue;
1697 }
1698 (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n",
1699 (void *)addr, (void *)bt->bt_start,
1700 (size_t)(addr - bt->bt_start), vm->vm_name,
1701 (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free");
1702 }
1703 }
1704
1705 void
1706 vmem_printall(const char *modif, void (*pr)(const char *, ...))
1707 {
1708 const vmem_t *vm;
1709
1710 LIST_FOREACH(vm, &vmem_list, vm_alllist) {
1711 vmem_dump(vm, pr);
1712 }
1713 }
1714
1715 void
1716 vmem_print(uintptr_t addr, const char *modif, void (*pr)(const char *, ...))
1717 {
1718 const vmem_t *vm = (const void *)addr;
1719
1720 vmem_dump(vm, pr);
1721 }
1722 #endif /* defined(DDB) */
1723
1724 #if defined(_KERNEL)
1725 #define vmem_printf printf
1726 #else
1727 #include <stdio.h>
1728 #include <stdarg.h>
1729
1730 static void
1731 vmem_printf(const char *fmt, ...)
1732 {
1733 va_list ap;
1734 va_start(ap, fmt);
1735 vprintf(fmt, ap);
1736 va_end(ap);
1737 }
1738 #endif
1739
1740 #if defined(VMEM_SANITY)
1741
1742 static bool
1743 vmem_check_sanity(vmem_t *vm)
1744 {
1745 const bt_t *bt, *bt2;
1746
1747 KASSERT(vm != NULL);
1748
1749 TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
1750 if (bt->bt_start > BT_END(bt)) {
1751 printf("corrupted tag\n");
1752 bt_dump(bt, vmem_printf);
1753 return false;
1754 }
1755 }
1756 TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
1757 TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) {
1758 if (bt == bt2) {
1759 continue;
1760 }
1761 if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) {
1762 continue;
1763 }
1764 if (bt->bt_start <= BT_END(bt2) &&
1765 bt2->bt_start <= BT_END(bt)) {
1766 printf("overlapping tags\n");
1767 bt_dump(bt, vmem_printf);
1768 bt_dump(bt2, vmem_printf);
1769 return false;
1770 }
1771 }
1772 }
1773
1774 return true;
1775 }
1776
1777 static void
1778 vmem_check(vmem_t *vm)
1779 {
1780
1781 if (!vmem_check_sanity(vm)) {
1782 panic("insanity vmem %p", vm);
1783 }
1784 }
1785
1786 #endif /* defined(VMEM_SANITY) */
1787
1788 #if defined(UNITTEST)
1789 int
1790 main(void)
1791 {
1792 int rc;
1793 vmem_t *vm;
1794 vmem_addr_t p;
1795 struct reg {
1796 vmem_addr_t p;
1797 vmem_size_t sz;
1798 bool x;
1799 } *reg = NULL;
1800 int nreg = 0;
1801 int nalloc = 0;
1802 int nfree = 0;
1803 vmem_size_t total = 0;
1804 #if 1
1805 vm_flag_t strat = VM_INSTANTFIT;
1806 #else
1807 vm_flag_t strat = VM_BESTFIT;
1808 #endif
1809
1810 vm = vmem_create("test", 0, 0, 1, NULL, NULL, NULL, 0, VM_SLEEP,
1811 #ifdef _KERNEL
1812 IPL_NONE
1813 #else
1814 0
1815 #endif
1816 );
1817 if (vm == NULL) {
1818 printf("vmem_create\n");
1819 exit(EXIT_FAILURE);
1820 }
1821 vmem_dump(vm, vmem_printf);
1822
1823 rc = vmem_add(vm, 0, 50, VM_SLEEP);
1824 assert(rc == 0);
1825 rc = vmem_add(vm, 100, 200, VM_SLEEP);
1826 assert(rc == 0);
1827 rc = vmem_add(vm, 2000, 1, VM_SLEEP);
1828 assert(rc == 0);
1829 rc = vmem_add(vm, 40000, 65536, VM_SLEEP);
1830 assert(rc == 0);
1831 rc = vmem_add(vm, 10000, 10000, VM_SLEEP);
1832 assert(rc == 0);
1833 rc = vmem_add(vm, 500, 1000, VM_SLEEP);
1834 assert(rc == 0);
1835 rc = vmem_add(vm, 0xffffff00, 0x100, VM_SLEEP);
1836 assert(rc == 0);
1837 rc = vmem_xalloc(vm, 0x101, 0, 0, 0,
1838 0xffffff00, 0xffffffff, strat|VM_SLEEP, &p);
1839 assert(rc != 0);
1840 rc = vmem_xalloc(vm, 50, 0, 0, 0, 0, 49, strat|VM_SLEEP, &p);
1841 assert(rc == 0 && p == 0);
1842 vmem_xfree(vm, p, 50);
1843 rc = vmem_xalloc(vm, 25, 0, 0, 0, 0, 24, strat|VM_SLEEP, &p);
1844 assert(rc == 0 && p == 0);
1845 rc = vmem_xalloc(vm, 0x100, 0, 0, 0,
1846 0xffffff01, 0xffffffff, strat|VM_SLEEP, &p);
1847 assert(rc != 0);
1848 rc = vmem_xalloc(vm, 0x100, 0, 0, 0,
1849 0xffffff00, 0xfffffffe, strat|VM_SLEEP, &p);
1850 assert(rc != 0);
1851 rc = vmem_xalloc(vm, 0x100, 0, 0, 0,
1852 0xffffff00, 0xffffffff, strat|VM_SLEEP, &p);
1853 assert(rc == 0);
1854 vmem_dump(vm, vmem_printf);
1855 for (;;) {
1856 struct reg *r;
1857 int t = rand() % 100;
1858
1859 if (t > 45) {
1860 /* alloc */
1861 vmem_size_t sz = rand() % 500 + 1;
1862 bool x;
1863 vmem_size_t align, phase, nocross;
1864 vmem_addr_t minaddr, maxaddr;
1865
1866 if (t > 70) {
1867 x = true;
1868 /* XXX */
1869 align = 1 << (rand() % 15);
1870 phase = rand() % 65536;
1871 nocross = 1 << (rand() % 15);
1872 if (align <= phase) {
1873 phase = 0;
1874 }
1875 if (VMEM_CROSS_P(phase, phase + sz - 1,
1876 nocross)) {
1877 nocross = 0;
1878 }
1879 do {
1880 minaddr = rand() % 50000;
1881 maxaddr = rand() % 70000;
1882 } while (minaddr > maxaddr);
1883 printf("=== xalloc %" PRIu64
1884 " align=%" PRIu64 ", phase=%" PRIu64
1885 ", nocross=%" PRIu64 ", min=%" PRIu64
1886 ", max=%" PRIu64 "\n",
1887 (uint64_t)sz,
1888 (uint64_t)align,
1889 (uint64_t)phase,
1890 (uint64_t)nocross,
1891 (uint64_t)minaddr,
1892 (uint64_t)maxaddr);
1893 rc = vmem_xalloc(vm, sz, align, phase, nocross,
1894 minaddr, maxaddr, strat|VM_SLEEP, &p);
1895 } else {
1896 x = false;
1897 printf("=== alloc %" PRIu64 "\n", (uint64_t)sz);
1898 rc = vmem_alloc(vm, sz, strat|VM_SLEEP, &p);
1899 }
1900 printf("-> %" PRIu64 "\n", (uint64_t)p);
1901 vmem_dump(vm, vmem_printf);
1902 if (rc != 0) {
1903 if (x) {
1904 continue;
1905 }
1906 break;
1907 }
1908 nreg++;
1909 reg = realloc(reg, sizeof(*reg) * nreg);
1910 r = &reg[nreg - 1];
1911 r->p = p;
1912 r->sz = sz;
1913 r->x = x;
1914 total += sz;
1915 nalloc++;
1916 } else if (nreg != 0) {
1917 /* free */
1918 r = &reg[rand() % nreg];
1919 printf("=== free %" PRIu64 ", %" PRIu64 "\n",
1920 (uint64_t)r->p, (uint64_t)r->sz);
1921 if (r->x) {
1922 vmem_xfree(vm, r->p, r->sz);
1923 } else {
1924 vmem_free(vm, r->p, r->sz);
1925 }
1926 total -= r->sz;
1927 vmem_dump(vm, vmem_printf);
1928 *r = reg[nreg - 1];
1929 nreg--;
1930 nfree++;
1931 }
1932 printf("total=%" PRIu64 "\n", (uint64_t)total);
1933 }
1934 fprintf(stderr, "total=%" PRIu64 ", nalloc=%d, nfree=%d\n",
1935 (uint64_t)total, nalloc, nfree);
1936 exit(EXIT_SUCCESS);
1937 }
1938 #endif /* defined(UNITTEST) */
1939