/*	$NetBSD: uvm_pglist.c,v 1.73 2019/12/13 20:10:22 ad Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * uvm_pglist.c: pglist functions
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.73 2019/12/13 20:10:22 ad Exp $");

#include <sys/param.h>
#include <sys/systm.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>

#ifdef VM_PAGE_ALLOC_MEMORY_STATS
#define	STAT_INCR(v)	(v)++
#define	STAT_DECR(v)	do { \
		if ((v) == 0) \
			printf("%s:%d -- Already 0!\n", __FILE__, __LINE__); \
		else \
			(v)--; \
	} while (/*CONSTCOND*/ 0)
u_long	uvm_pglistalloc_npages;
#else
#define	STAT_INCR(v)
#define	STAT_DECR(v)
#endif

/*
 * uvm_pglistalloc: allocate a list of pages
 *
 * => allocated pages are placed onto an rlist.  rlist is
 *    initialized by uvm_pglistalloc.
 * => returns 0 on success or errno on failure
 * => implementation allocates a single segment if any constraints are
 *    imposed by call arguments.
 * => doesn't take into account clean non-busy pages on inactive list
 *    that could be used(?)
 * => params:
 *	size		the size of the allocation, rounded to page size.
 *	low		the low address of the allowed allocation range.
 *	high		the high address of the allowed allocation range.
 *	alignment	memory must be aligned to this power-of-two boundary.
 *	boundary	no segment in the allocation may cross this
 *			power-of-two boundary (relative to zero).
 */
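
/*
 * Usage sketch (illustrative only, not compiled as part of this file;
 * the caller's context, error handling, and eventual mapping of the
 * pages are assumed): allocate a physically contiguous, 64KB-aligned
 * region below 16MB, walk the resulting pglist, then release it.
 *
 *	struct pglist mlist;
 *	struct vm_page *pg;
 *	paddr_t pa;
 *	int error;
 *
 *	error = uvm_pglistalloc(64 * 1024, 0, 16 * 1024 * 1024,
 *	    64 * 1024, 0, &mlist, 1, 1);
 *	if (error)
 *		return error;
 *	TAILQ_FOREACH(pg, &mlist, pageq.queue) {
 *		pa = VM_PAGE_TO_PHYS(pg);
 *		(hand pa to a device, enter it in a pmap, etc.)
 *	}
 *	uvm_pglistfree(&mlist);
 *
 * Requesting a single segment (nsegs == 1) for a multi-page size, or
 * any alignment/boundary constraint, routes the request through the
 * contiguous allocator below.
 */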

static void
uvm_pglist_add(struct vm_page *pg, struct pglist *rlist)
{
	int free_list __unused, color __unused, pgflidx;

	KASSERT(mutex_owned(&uvm_fpageqlock));

#if PGFL_NQUEUES != 2
#error uvm_pglistalloc needs to be updated
#endif

	free_list = uvm_page_lookup_freelist(pg);
	color = VM_PGCOLOR_BUCKET(pg);
	pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
#ifdef UVMDEBUG
	struct vm_page *tp;
	LIST_FOREACH(tp,
	    &uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx],
	    pageq.list) {
		if (tp == pg)
			break;
	}
	if (tp == NULL)
		panic("uvm_pglistalloc: page not on freelist");
#endif
	LIST_REMOVE(pg, pageq.list);	/* global */
	LIST_REMOVE(pg, listq.list);	/* cpu */
	uvmexp.free--;
	if (pg->flags & PG_ZERO)
		uvmexp.zeropages--;
	VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--;
	pg->flags = PG_CLEAN;
	pg->uobject = NULL;
	pg->uanon = NULL;
	TAILQ_INSERT_TAIL(rlist, pg, pageq.queue);
	STAT_INCR(uvm_pglistalloc_npages);
}

static int
uvm_pglistalloc_c_ps(uvm_physseg_t psi, int num, paddr_t low, paddr_t high,
    paddr_t alignment, paddr_t boundary, struct pglist *rlist)
{
	signed int candidate, limit, candidateidx, end, idx, skip;
	int pagemask;
	bool second_pass;
#ifdef DEBUG
	paddr_t idxpa, lastidxpa;
	paddr_t cidx = 0;	/* XXX: GCC */
#endif
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: contig %d pgs from psi %zd\n", num, (ssize_t)psi);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));

	low = atop(low);
	high = atop(high);
	alignment = atop(alignment);

	/*
	 * Make sure that physseg falls within the range to be allocated from.
	 */
	if (high <= uvm_physseg_get_avail_start(psi) ||
	    low >= uvm_physseg_get_avail_end(psi))
		return 0;

	/*
	 * We start our search just after where the last allocation
	 * succeeded.
	 */
	candidate = roundup2(uimax(low, uvm_physseg_get_avail_start(psi) +
	    uvm_physseg_get_start_hint(psi)), alignment);
	limit = uimin(high, uvm_physseg_get_avail_end(psi));
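	/*
	 * Note: pagemask keeps the page-number bits that identify which
	 * boundary-sized window a page falls in; the allocation crosses a
	 * boundary iff its first and last pages differ in those bits.
	 * For example (illustrative), with 4KB pages and a 64KB boundary,
	 * pagemask == ~0xf.  When boundary == 0, pagemask computes to 0
	 * but is never consulted.
	 */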
	pagemask = ~((boundary >> PAGE_SHIFT) - 1);
	skip = 0;
	second_pass = false;

	for (;;) {
		bool ok = true;
		signed int cnt;

		if (candidate + num > limit) {
			if (uvm_physseg_get_start_hint(psi) == 0 || second_pass) {
				/*
				 * We've run past the allowable range.
				 */
				return 0; /* FAIL = 0 pages */
			}
			/*
			 * We've wrapped around the end of this segment
			 * so restart at the beginning, but now our limit
			 * is where we started.
			 */
			second_pass = true;
			candidate = roundup2(uimax(low,
			    uvm_physseg_get_avail_start(psi)), alignment);
			limit = uimin(limit, uvm_physseg_get_avail_start(psi) +
			    uvm_physseg_get_start_hint(psi));
			skip = 0;
			continue;
		}
		if (boundary != 0 &&
		    ((candidate ^ (candidate + num - 1)) & pagemask) != 0) {
			/*
			 * Region crosses boundary.  Jump to the boundary
			 * just crossed and ensure alignment.
			 */
			candidate = (candidate + num - 1) & pagemask;
			candidate = roundup2(candidate, alignment);
			skip = 0;
			continue;
		}
#ifdef DEBUG
		/*
		 * Make sure this is a managed physical page.
		 */

		if (uvm_physseg_find(candidate, &cidx) != psi)
			panic("pgalloc contig: botch1");
		if (cidx != candidate - uvm_physseg_get_start(psi))
			panic("pgalloc contig: botch2");
		if (uvm_physseg_find(candidate + num - 1, &cidx) != psi)
			panic("pgalloc contig: botch3");
		if (cidx != candidate - uvm_physseg_get_start(psi) + num - 1)
			panic("pgalloc contig: botch4");
#endif
		candidateidx = candidate - uvm_physseg_get_start(psi);
		end = candidateidx + num;

		/*
		 * Found a suitable starting page.  See if the range is free.
		 */
#ifdef PGALLOC_VERBOSE
		printf("%s: psi=%zd candidate=%#x end=%#x skip=%#x, align=%#"PRIxPADDR,
		    __func__, (ssize_t)psi, candidateidx, end, skip, alignment);
#endif
		/*
		 * We start at the end and work backwards since if we find a
		 * non-free page, it makes no sense to continue.
		 *
		 * But on the plus side we have "vetted" some number of free
		 * pages.  If this iteration fails, we may be able to skip
		 * testing most of those pages again in the next pass.
		 */
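		/*
		 * Worked example (illustrative): with num == 8 and
		 * alignment == 1, suppose the scan fails at offset 3 of
		 * the window.  Then cnt == 4 and the window advances by 4
		 * pages, but the pages at old offsets 4..7 were already
		 * seen to be free; skip == 4 lets the next pass test only
		 * the 4 pages newly exposed at the top of the window.
		 */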
		for (idx = end - 1; idx >= candidateidx + skip; idx--) {
			if (VM_PAGE_IS_FREE(uvm_physseg_get_pg(psi, idx)) == 0) {
				ok = false;
				break;
			}

#ifdef DEBUG
			if (idx > candidateidx) {
				idxpa = VM_PAGE_TO_PHYS(uvm_physseg_get_pg(psi, idx));
				lastidxpa = VM_PAGE_TO_PHYS(uvm_physseg_get_pg(psi, idx - 1));
				if ((lastidxpa + PAGE_SIZE) != idxpa) {
					/*
					 * Region not contiguous.
					 */
					panic("pgalloc contig: botch5");
				}
				if (boundary != 0 &&
				    ((lastidxpa ^ idxpa) & ~(boundary - 1))
				    != 0) {
					/*
					 * Region crosses boundary.
					 */
					panic("pgalloc contig: botch6");
				}
			}
#endif
		}

		if (ok) {
			while (skip-- > 0) {
				KDASSERT(VM_PAGE_IS_FREE(
				    uvm_physseg_get_pg(psi, candidateidx + skip)));
			}
#ifdef PGALLOC_VERBOSE
			printf(": ok\n");
#endif
			break;
		}

#ifdef PGALLOC_VERBOSE
		printf(": non-free at %#x\n", idx - candidateidx);
#endif
		/*
		 * count the number of pages we can advance
		 * since we know they aren't all free.
		 */
		cnt = idx + 1 - candidateidx;
		/*
		 * now round up that to the needed alignment.
		 */
		cnt = roundup2(cnt, alignment);
		/*
		 * The number of pages we can skip checking
		 * (might be 0 if cnt > num; the explicit test avoids
		 * the signed-to-unsigned trap of uimax here).
		 */
		skip = num > cnt ? num - cnt : 0;
		candidate += cnt;
	}

	/*
	 * we have a chunk of memory that conforms to the requested constraints.
	 */
	for (idx = candidateidx; idx < end; idx++)
		uvm_pglist_add(uvm_physseg_get_pg(psi, idx), rlist);

	/*
	 * the next time we need to search this segment, start after this
	 * chunk of pages we just allocated.
	 */
	uvm_physseg_set_start_hint(psi, candidate + num -
	    uvm_physseg_get_avail_start(psi));
	KASSERTMSG(uvm_physseg_get_start_hint(psi) <=
	    uvm_physseg_get_avail_end(psi) - uvm_physseg_get_avail_start(psi),
	    "%x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	    candidate + num,
	    uvm_physseg_get_start_hint(psi), uvm_physseg_get_start_hint(psi),
	    uvm_physseg_get_avail_end(psi), uvm_physseg_get_avail_start(psi),
	    uvm_physseg_get_avail_end(psi) - uvm_physseg_get_avail_start(psi));

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num);
#endif
	return num; /* number of pages allocated */
}

static int
uvm_pglistalloc_contig(int num, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist)
{
	int fl;
	int error;
	uvm_physseg_t psi;

	/* Default to "lose". */
	error = ENOMEM;

	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = uvm_physseg_get_last();
		    uvm_physseg_valid_p(psi);
		    psi = uvm_physseg_get_prev(psi))
#else
		for (psi = uvm_physseg_get_first();
		    uvm_physseg_valid_p(psi);
		    psi = uvm_physseg_get_next(psi))
#endif
		{
			if (uvm_physseg_get_free_list(psi) != fl)
				continue;

			num -= uvm_pglistalloc_c_ps(psi, num, low, high,
			    alignment, boundary, rlist);
			if (num == 0) {
#ifdef PGALLOC_VERBOSE
				printf("pgalloc: %"PRIxMAX"-%"PRIxMAX"\n",
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
				    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
				error = 0;
				goto out;
			}
		}
	}

out:
	/*
	 * check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);
	return (error);
}

static int
uvm_pglistalloc_s_ps(uvm_physseg_t psi, int num, paddr_t low, paddr_t high,
    struct pglist *rlist)
{
	int todo, limit, candidate;
	struct vm_page *pg;
	bool second_pass;
#ifdef PGALLOC_VERBOSE
	printf("pgalloc: simple %d pgs from psi %zd\n", num, (ssize_t)psi);
#endif

	KASSERT(mutex_owned(&uvm_fpageqlock));
	KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_start(psi));
	KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_end(psi));
	KASSERT(uvm_physseg_get_avail_start(psi) <= uvm_physseg_get_end(psi));
	KASSERT(uvm_physseg_get_avail_end(psi) <= uvm_physseg_get_end(psi));

	low = atop(low);
	high = atop(high);

	/*
	 * Make sure that physseg falls within the range to be allocated from,
	 * before a candidate page is computed from the start hint.
	 */
	if (high <= uvm_physseg_get_avail_start(psi) ||
	    low >= uvm_physseg_get_avail_end(psi))
		return 0;

	todo = num;
	candidate = uimax(low, uvm_physseg_get_avail_start(psi) +
	    uvm_physseg_get_start_hint(psi));
	limit = uimin(high, uvm_physseg_get_avail_end(psi));
	pg = uvm_physseg_get_pg(psi, candidate - uvm_physseg_get_start(psi));
	second_pass = false;

again:
	for (;; candidate++, pg++) {
		if (candidate >= limit) {
			if (uvm_physseg_get_start_hint(psi) == 0 || second_pass) {
				candidate = limit - 1;
				break;
			}
			second_pass = true;
			candidate = uimax(low, uvm_physseg_get_avail_start(psi));
			limit = uimin(limit, uvm_physseg_get_avail_start(psi) +
			    uvm_physseg_get_start_hint(psi));
			pg = uvm_physseg_get_pg(psi,
			    candidate - uvm_physseg_get_start(psi));
			goto again;
		}
#if defined(DEBUG)
		{
			paddr_t cidx = 0;
			const uvm_physseg_t bank = uvm_physseg_find(candidate, &cidx);
			KDASSERTMSG(bank == psi,
			    "uvm_physseg_find(%#x) (%"PRIxPHYSSEG ") != psi %"PRIxPHYSSEG,
			    candidate, bank, psi);
			KDASSERTMSG(cidx == candidate - uvm_physseg_get_start(psi),
			    "uvm_physseg_find(%#x): %#"PRIxPADDR" != off %"PRIxPADDR,
			    candidate, cidx, candidate - uvm_physseg_get_start(psi));
		}
#endif
		if (VM_PAGE_IS_FREE(pg) == 0)
			continue;

		uvm_pglist_add(pg, rlist);
		if (--todo == 0) {
			break;
		}
	}

	/*
	 * The next time we need to search this segment,
	 * start just after the pages we just allocated.
	 */
	uvm_physseg_set_start_hint(psi, candidate + 1 -
	    uvm_physseg_get_avail_start(psi));
	KASSERTMSG(uvm_physseg_get_start_hint(psi) <=
	    uvm_physseg_get_avail_end(psi) - uvm_physseg_get_avail_start(psi),
	    "%#x %u (%#x) <= %#"PRIxPADDR" - %#"PRIxPADDR" (%#"PRIxPADDR")",
	    candidate + 1,
	    uvm_physseg_get_start_hint(psi),
	    uvm_physseg_get_start_hint(psi),
	    uvm_physseg_get_avail_end(psi),
	    uvm_physseg_get_avail_start(psi),
	    uvm_physseg_get_avail_end(psi) - uvm_physseg_get_avail_start(psi));

#ifdef PGALLOC_VERBOSE
	printf("got %d pgs\n", num - todo);
#endif
	return (num - todo); /* number of pages allocated */
}

static int
uvm_pglistalloc_simple(int num, paddr_t low, paddr_t high,
    struct pglist *rlist, int waitok)
{
	int fl, error;
	uvm_physseg_t psi;
	int count = 0;

	/* Default to "lose". */
	error = ENOMEM;

again:
	/*
	 * Block all memory allocation and lock the free list.
	 */
	mutex_spin_enter(&uvm_fpageqlock);
	count++;

	/* Are there even any free pages? */
	if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
		goto out;

	for (fl = 0; fl < VM_NFREELIST; fl++) {
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
		for (psi = uvm_physseg_get_last();
		    uvm_physseg_valid_p(psi);
		    psi = uvm_physseg_get_prev(psi))
#else
		for (psi = uvm_physseg_get_first();
		    uvm_physseg_valid_p(psi);
		    psi = uvm_physseg_get_next(psi))
#endif
		{
			if (uvm_physseg_get_free_list(psi) != fl)
				continue;

			num -= uvm_pglistalloc_s_ps(psi, num, low, high, rlist);
			if (num == 0) {
				error = 0;
				goto out;
			}
		}
	}

out:
	/*
	 * check to see if we need to generate some free pages by waking
	 * the pagedaemon.
	 */

	uvm_kick_pdaemon();
	mutex_spin_exit(&uvm_fpageqlock);

	if (error) {
		if (waitok) {
			/* XXX perhaps some time limitation? */
#ifdef DEBUG
			if (count == 1)
				printf("pglistalloc waiting\n");
#endif
			uvm_wait("pglalloc");
			goto again;
		} else
			uvm_pglistfree(rlist);
	}
#ifdef PGALLOC_VERBOSE
	if (!error)
		printf("pgalloc: %"PRIxMAX"..%"PRIxMAX"\n",
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_FIRST(rlist)),
		    (uintmax_t) VM_PAGE_TO_PHYS(TAILQ_LAST(rlist, pglist)));
#endif
	return (error);
}

int
uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
    paddr_t boundary, struct pglist *rlist, int nsegs, int waitok)
{
	int num, res;

	KASSERT((alignment & (alignment - 1)) == 0);
	KASSERT((boundary & (boundary - 1)) == 0);

	/*
	 * Our allocations are always page granularity, so our alignment
	 * must be, too.
	 */
	if (alignment < PAGE_SIZE)
		alignment = PAGE_SIZE;
	if (boundary != 0 && boundary < size)
		return (EINVAL);
	num = atop(round_page(size));
	low = roundup2(low, alignment);

	TAILQ_INIT(rlist);

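	/*
	 * A contiguous (single-segment) search is only required when the
	 * caller cannot accept one segment per page: fewer segments than
	 * pages, alignment stricter than a page, or a boundary constraint.
	 * Everything else goes through the cheaper simple allocator.
	 */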
	if ((nsegs < num) || (alignment != PAGE_SIZE) || (boundary != 0))
		res = uvm_pglistalloc_contig(num, low, high, alignment,
		    boundary, rlist);
	else
		res = uvm_pglistalloc_simple(num, low, high, rlist, waitok);

	return (res);
}

/*
 * uvm_pglistfree: free a list of pages
 *
 * => pages should already be unmapped
 */
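
/*
 * Usage sketch (illustrative; the virtual mapping and its names are the
 * assumed caller's, per the note above that pages must already be
 * unmapped before they are freed):
 *
 *	pmap_kremove(va, size);
 *	pmap_update(pmap_kernel());
 *	uvm_pglistfree(&mlist);
 */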

void
uvm_pglistfree(struct pglist *list)
{
	struct uvm_cpu *ucpu;
	struct vm_page *pg;
	int index, color, queue;
	bool iszero;

	/*
	 * Lock the free list and free each page.
	 */

	mutex_spin_enter(&uvm_fpageqlock);
	ucpu = curcpu()->ci_data.cpu_uvm;
	while ((pg = TAILQ_FIRST(list)) != NULL) {
		KASSERT(!uvmpdpol_pageisqueued_p(pg));
		TAILQ_REMOVE(list, pg, pageq.queue);
		iszero = (pg->flags & PG_ZERO);
		pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
#ifdef DEBUG
		pg->uobject = (void *)0xdeadbeef;
		pg->uanon = (void *)0xdeadbeef;
		if (iszero)
			uvm_pagezerocheck(pg);
#endif /* DEBUG */
		index = uvm_page_lookup_freelist(pg);
		color = VM_PGCOLOR_BUCKET(pg);
		queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN;
		pg->offset = (uintptr_t)ucpu;
		LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color].
		    pgfl_queues[queue], pg, pageq.list);
		LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color].
		    pgfl_queues[queue], pg, listq.list);
		uvmexp.free++;
		if (iszero)
			uvmexp.zeropages++;
		ucpu->pages[queue]++;
		STAT_DECR(uvm_pglistalloc_npages);
	}
	if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN])
		ucpu->page_idle_zero = vm_page_zero_enable;
	mutex_spin_exit(&uvm_fpageqlock);
}