Home | History | Annotate | Line # | Download | only in xen
      1 /* $NetBSD: balloon.c,v 1.24 2024/09/24 20:54:53 andvar Exp $ */
      2 
      3 /*-
      4  * Copyright (c) 2010 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Cherry G. Mathew <cherry (at) zyx.in> and
      9  * Jean-Yves Migeon <jym (at) NetBSD.org>
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30  * POSSIBILITY OF SUCH DAMAGE.
     31  */
     32 
     33 /*
 * The Xen balloon driver enables growing and shrinking a PV domain's
 * memory on the fly, by allocating and freeing memory pages directly.
 * This management needs the domain's cooperation to work properly,
 * especially during the balloon_inflate() operation, where a domain gives
 * memory back to the hypervisor.
     39  *
 * Shrinking memory on a live system is a difficult task, and may render
 * it unstable or lead to a crash. The driver takes a conservative approach
 * there by doing memory operations in small steps of a few MiB each time. It
 * will also refuse to decrease the reservation below a certain threshold
 * (XEN_RESERVATION_MIN), so as to avoid complete kernel memory exhaustion.
     45  *
     46  * The user can intervene at two different levels to manage the ballooning
     47  * of a domain:
     48  * - directly within the domain using a sysctl(9) interface.
     49  * - through the Xentools, by modifying the memory/target entry associated
     50  *   to a domain. This is usually done in dom0.
     51  *
     52  * Modification of the reservation is signaled by writing inside the
     53  * memory/target node in Xenstore. Writing new values will fire the xenbus
     54  * watcher, and wakeup the balloon thread to inflate or deflate balloon.
     55  *
 * Both the sysctl(9) nodes and the memory/target entry assume that the values
 * passed to them are in KiB. Internally, the driver converts this value into
 * pages (assuming a page is PAGE_SIZE bytes), and issues the correct
 * hypercalls to decrease/increase the domain's reservation accordingly.
     60  *
     61  * XXX Pages used by balloon are tracked through entries stored in a SLIST.
     62  * This allows driver to conveniently add/remove wired pages from memory
     63  * without the need to support these "memory gaps" inside uvm(9). Still, the
     64  * driver does not currently "plug" new pages into uvm(9) when more memory
     65  * is available than originally managed by balloon. For example, deflating
     66  * balloon with a total number of pages above physmem is not supported for
     67  * now. See balloon_deflate() for more details.
     68  *
     69  */
     70 
     71 #define BALLOONDEBUG 0
     72 
     73 #if defined(_KERNEL_OPT)
     74 #include "opt_uvm_hotplug.h"
     75 #endif
     76 
     77 #include <sys/cdefs.h>
     78 __KERNEL_RCSID(0, "$NetBSD: balloon.c,v 1.24 2024/09/24 20:54:53 andvar Exp $");
     79 
     80 #include <sys/inttypes.h>
     81 #include <sys/device.h>
     82 #include <sys/param.h>
     83 
     84 #include <sys/atomic.h>
     85 #include <sys/condvar.h>
     86 #include <sys/kernel.h>
     87 #include <sys/kmem.h>
     88 #include <sys/kthread.h>
     89 #include <sys/mutex.h>
     90 #include <sys/pool.h>
     91 #include <sys/queue.h>
     92 #include <sys/sysctl.h>
     93 
     94 #include <xen/xen.h>
     95 #include <xen/xenbus.h>
     96 #include <xen/balloon.h>
     97 
     98 #include <uvm/uvm.h>
     99 #include <uvm/uvm.h>
    100 #include <uvm/uvm_physseg.h>
    101 #include <xen/xenpmap.h>
    102 
    103 #include "locators.h"
    104 
    105 /*
    106  * Number of MFNs stored in the array passed back and forth between domain
    107  * and balloon/hypervisor, during balloon_inflate() / balloon_deflate(). These
    108  * should fit in a page, for performance reasons.
    109  */
    110 #define BALLOON_DELTA (PAGE_SIZE / sizeof(xen_pfn_t))
    111 
    112 /*
    113  * Safeguard value. Refuse to go below this threshold, so that domain
    114  * can keep some free pages for its own use. Value is arbitrary, and may
    115  * evolve with time.
    116  */
    117 #define BALLOON_BALLAST 256 /* In pages - 1MiB */
    118 #define XEN_RESERVATION_MIN (uvmexp.freemin + BALLOON_BALLAST) /* In pages */
    119 
/* KiB <-> pages conversions */
#define PAGE_SIZE_KB (PAGE_SIZE >> 10) /* page size in KiB */
/*
 * Convert a page count to KiB. The argument is parenthesized so that
 * compound expressions (e.g. "a + b") are converted as a whole, and it is
 * widened to 64 bits before the multiplication.
 */
#define BALLOON_PAGES_TO_KB(_pg) ((uint64_t)(_pg) * PAGE_SIZE_KB)
/* Convert a size in KiB to pages, rounding up to a whole page. */
#define BALLOON_KB_TO_PAGES(_kb) (roundup((_kb), PAGE_SIZE_KB) / PAGE_SIZE_KB)
    124 
/*
 * A balloon page entry. Needed to track pages put/reclaimed from balloon.
 * Entries are allocated from the softc's pool cache and linked into its
 * balloon_page_entries SLIST while the page sits in the balloon.
 */
struct balloon_page_entry {
	struct vm_page *pg;	/* page allocated by bpge_ctor() */
	SLIST_ENTRY(balloon_page_entry) entry;	/* SLIST linkage */
};
    132 
/*
 * Per-device state of the balloon driver. A single instance is published
 * through the file-scope balloon_sc pointer at attach time.
 */
struct balloon_xenbus_softc {
	device_t sc_dev;		/* our device, used for logging */
	struct sysctllog *sc_log;	/* log of the sysctl nodes we created */

	kmutex_t balloon_mtx;   /* Protects condvar, target and res_min (below) */
	kcondvar_t balloon_cv;  /* Condvar variable for target (below) */
	size_t balloon_target;  /* Target domain reservation size in pages. */
	/* Minimum amount of memory reserved by domain, in KiB */
	uint64_t balloon_res_min;

	xen_pfn_t *sc_mfn_list; /* List of MFNs passed from/to balloon */
	pool_cache_t bpge_pool; /* pool cache for balloon page entries */
	/* linked list for tracking pages used by balloon */
	SLIST_HEAD(, balloon_page_entry) balloon_page_entries;
	/* number of entries currently on balloon_page_entries */
	size_t balloon_num_page_entries;
};
    149 
/* Hypervisor reservation queries; both return a number of pages. */
static size_t xenmem_get_currentreservation(void);
static size_t xenmem_get_maxreservation(void);

/* Pool cache constructor/destructor for struct balloon_page_entry. */
static int  bpge_ctor(void *, void *, int);
static void bpge_dtor(void *, void *);

/* Worker thread and the two operations it drives. */
static void   balloon_thread(void *);
static size_t balloon_deflate(struct balloon_xenbus_softc*, size_t);
static size_t balloon_inflate(struct balloon_xenbus_softc*, size_t);

/* User/controller entry points: sysctl(9) nodes and the xenbus watch. */
static void sysctl_kern_xen_balloon_setup(struct balloon_xenbus_softc *);
static void balloon_xenbus_watcher(struct xenbus_watch *, const char **,
				   unsigned int);

/* autoconf(9) glue. */
static int  balloon_xenbus_match(device_t, cfdata_t, void *);
static void balloon_xenbus_attach(device_t, device_t, void *);

/* No detach or activate routines are provided. */
CFATTACH_DECL_NEW(balloon, sizeof(struct balloon_xenbus_softc),
    balloon_xenbus_match, balloon_xenbus_attach, NULL, NULL);

/* Watch on the memory/target node; fires balloon_xenbus_watcher(). */
static struct xenbus_watch balloon_xenbus_watch = {
	.node = __UNCONST("memory/target"),
	.xbw_callback = balloon_xenbus_watcher,
};

/*
 * The driver instance, set by balloon_xenbus_attach(). The sysctl handlers
 * and the xenbus watcher reach the softc through this pointer.
 */
static struct balloon_xenbus_softc *balloon_sc;
    176 
    177 static int
    178 balloon_xenbus_match(device_t parent, cfdata_t match, void *aux)
    179 {
    180 	struct xenbusdev_attach_args *xa = aux;
    181 
    182 	if (strcmp(xa->xa_type, "balloon") != 0)
    183 		return 0;
    184 
    185 	if (match->cf_loc[XENBUSCF_ID] != XENBUSCF_ID_DEFAULT &&
    186 	    match->cf_loc[XENBUSCF_ID] != xa->xa_id)
    187 		return 0;
    188 
    189 	return 1;
    190 }
    191 
    192 static void
    193 balloon_xenbus_attach(device_t parent, device_t self, void *aux)
    194 {
    195 	xen_pfn_t *mfn_list;
    196 	size_t currentpages;
    197 	struct balloon_xenbus_softc *sc = balloon_sc = device_private(self);
    198 
    199 	aprint_normal(": Xen Balloon driver\n");
    200 	sc->sc_dev = self;
    201 
    202 	/* Initialize target mutex and condvar */
    203 	mutex_init(&sc->balloon_mtx, MUTEX_DEFAULT, IPL_NONE);
    204 	cv_init(&sc->balloon_cv, "xen_balloon");
    205 
    206 	SLIST_INIT(&sc->balloon_page_entries);
    207 	sc->balloon_num_page_entries = 0;
    208 
    209 	/* Get current number of pages */
    210 	currentpages = xenmem_get_currentreservation();
    211 
    212 	KASSERT(currentpages > 0);
    213 
    214 	/* Update initial target value - no need to lock for initialization */
    215 	sc->balloon_target = currentpages;
    216 
    217 	/* Set the values used by sysctl */
    218 	sc->balloon_res_min =
    219 	    BALLOON_PAGES_TO_KB(XEN_RESERVATION_MIN);
    220 
    221 	aprint_normal_dev(self, "current reservation: %"PRIu64" KiB\n",
    222 	    BALLOON_PAGES_TO_KB(currentpages));
    223 #if BALLOONDEBUG
    224 	aprint_normal_dev(self, "min reservation: %"PRIu64" KiB\n",
    225 	    sc->balloon_res_min);
    226 	aprint_normal_dev(self, "max reservation: %"PRIu64" KiB\n",
    227 	    BALLOON_PAGES_TO_KB(xenmem_get_maxreservation()));
    228 #endif
    229 
    230 	sc->bpge_pool = pool_cache_init(sizeof(struct balloon_page_entry),
    231 	    0, 0, 0, "xen_bpge", NULL, IPL_NONE, bpge_ctor, bpge_dtor, NULL);
    232 
    233 	sysctl_kern_xen_balloon_setup(sc);
    234 
    235 	/* List of MFNs passed from/to balloon for inflating/deflating */
    236 	mfn_list = kmem_alloc(BALLOON_DELTA * sizeof(*mfn_list), KM_SLEEP);
    237 	sc->sc_mfn_list = mfn_list;
    238 
    239 	/* Setup xenbus node watch callback */
    240 	if (register_xenbus_watch(&balloon_xenbus_watch)) {
    241 		aprint_error_dev(self, "unable to watch memory/target\n");
    242 		goto error;
    243 	}
    244 
    245 	/* Setup kernel thread to asynchronously (in/de)-flate the balloon */
    246 	if (kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, balloon_thread,
    247 	    sc, NULL, "xen_balloon")) {
    248 		aprint_error_dev(self, "unable to create balloon thread\n");
    249 		unregister_xenbus_watch(&balloon_xenbus_watch);
    250 		goto error;
    251 	}
    252 
    253 	if (!pmf_device_register(self, NULL, NULL))
    254 		aprint_error_dev(self, "couldn't establish power handler\n");
    255 
    256 	return;
    257 
    258 error:
    259 	sysctl_teardown(&sc->sc_log);
    260 	cv_destroy(&sc->balloon_cv);
    261 	mutex_destroy(&sc->balloon_mtx);
    262 	return;
    263 
    264 }
    265 
    266 /*
    267  * Returns maximum memory reservation available to current domain. In Xen
    268  * with DOMID_SELF, this hypercall never fails: return value should be
    269  * interpreted as unsigned.
    270  *
    271  */
    272 static size_t
    273 xenmem_get_maxreservation(void)
    274 {
    275 	unsigned int ret;
    276 
    277 	ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation,
    278 	    & (domid_t) { DOMID_SELF });
    279 
    280 	if (ret == 0) {
    281 		/* well, a maximum reservation of 0 is really bogus */
    282 		panic("%s failed, maximum reservation returned 0", __func__);
    283 	}
    284 
    285 	return ret;
    286 }
    287 
    288 /* Returns current reservation, in pages */
    289 static size_t
    290 xenmem_get_currentreservation(void)
    291 {
    292 	int ret;
    293 
    294 	ret = HYPERVISOR_memory_op(XENMEM_current_reservation,
    295 				   & (domid_t) { DOMID_SELF });
    296 
    297 	if (ret < 0) {
    298 		panic("%s failed: %d", __func__, ret);
    299 	}
    300 
    301 	return ret;
    302 }
    303 
    304 /*
    305  * Get value (in KiB) of memory/target in XenStore for current domain
    306  * A return value of 0 can be considered as bogus or absent.
    307  */
    308 static unsigned long long
    309 balloon_xenbus_read_target(void)
    310 {
    311 	unsigned long long new_target;
    312 	int err = xenbus_read_ull(NULL, "memory", "target", &new_target, 0);
    313 
    314 	switch(err) {
    315 	case 0:
    316 		return new_target;
    317 	case ENOENT:
    318 		break;
    319 	default:
    320 		device_printf(balloon_sc->sc_dev,
    321 		    "error %d, couldn't read xenbus target node\n", err);
    322 		break;
    323 	}
    324 
    325 	return 0;
    326 }
    327 
    328 /* Set memory/target value (in KiB) in XenStore for current domain */
    329 static void
    330 balloon_xenbus_write_target(unsigned long long new_target)
    331 {
    332 	int err = xenbus_printf(NULL, "memory", "target", "%llu", new_target);
    333 
    334 	if (err != 0) {
    335 		device_printf(balloon_sc->sc_dev,
    336 		    "error %d, couldn't write xenbus target node\n", err);
    337 	}
    338 
    339 	return;
    340 }
    341 
    342 static int
    343 bpge_ctor(void *arg, void *obj, int flags)
    344 {
    345 	struct balloon_page_entry *bpge = obj;
    346 
    347 	bpge->pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
    348 	if (bpge->pg == NULL)
    349 		return ENOMEM;
    350 
    351 	return 0;
    352 
    353 }
    354 
    355 static void
    356 bpge_dtor(void *arg, void *obj)
    357 {
    358 	struct balloon_page_entry *bpge = obj;
    359 
    360 	uvm_pagefree(bpge->pg);
    361 }
    362 
    363 /*
    364  * Inflate balloon. Pages are moved out of domain's memory towards balloon.
    365  */
    366 static size_t
    367 balloon_inflate(struct balloon_xenbus_softc *sc, size_t tpages)
    368 {
    369 	int rpages, ret;
    370 	paddr_t pa;
    371 	struct balloon_page_entry *bpg_entry;
    372 	xen_pfn_t *mfn_list = sc->sc_mfn_list;
    373 
    374 	struct xen_memory_reservation reservation = {
    375 		.mem_flags = 0,
    376 		.extent_order = 0,
    377 		.domid        = DOMID_SELF
    378 	};
    379 
    380 	KASSERT(tpages > 0);
    381 	KASSERT(tpages <= BALLOON_DELTA);
    382 
    383 	memset(mfn_list, 0, BALLOON_DELTA * sizeof(*mfn_list));
    384 
    385 	/* allocate pages that will be given to Hypervisor */
    386 	for (rpages = 0; rpages < tpages; rpages++) {
    387 
    388 		bpg_entry = pool_cache_get(sc->bpge_pool, PR_WAITOK);
    389 		if (bpg_entry == NULL) {
    390 			/* failed reserving a page for balloon */
    391 			break;
    392 		}
    393 
    394 		pa = VM_PAGE_TO_PHYS(bpg_entry->pg);
    395 
    396 		mfn_list[rpages] = xpmap_ptom(pa) >> PAGE_SHIFT;
    397 
    398 		/* Invalidate pg */
    399 		xpmap_ptom_unmap(pa);
    400 
    401 		SLIST_INSERT_HEAD(&balloon_sc->balloon_page_entries,
    402 				  bpg_entry, entry);
    403 		balloon_sc->balloon_num_page_entries++;
    404 	}
    405 
    406 	/* Hand over pages to Hypervisor */
    407 	set_xen_guest_handle(reservation.extent_start, mfn_list);
    408 	reservation.nr_extents = rpages;
    409 
    410 	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
    411 				   &reservation);
    412 	if (ret != rpages) {
    413 		/*
    414 		 * we are in bad shape: the operation failed for certain
    415 		 * MFNs. As the API does not allow us to know which frame
    416 		 * numbers were erroneous, we cannot really recover safely.
    417 		 */
    418 		panic("%s: decrease reservation failed: was %d, "
    419 		    "returned %d", device_xname(sc->sc_dev), rpages, ret);
    420 	}
    421 
    422 #if BALLOONDEBUG
    423 	device_printf(sc->sc_dev, "inflate %zu => inflated by %d\n",
    424 	    tpages, rpages);
    425 #endif
    426 	return rpages;
    427 }
    428 
    429 /*
    430  * Deflate balloon. Pages are given back to domain's memory.
    431  */
    432 static size_t
    433 balloon_deflate(struct balloon_xenbus_softc *sc, size_t tpages)
    434 {
    435 	int rpages, s, ret;
    436 	paddr_t pa;
    437 	struct balloon_page_entry *bpg_entry;
    438 	xen_pfn_t *mfn_list = sc->sc_mfn_list;
    439 
    440 	struct xen_memory_reservation reservation = {
    441 		.mem_flags = 0,
    442 		.extent_order = 0,
    443 		.domid        = DOMID_SELF
    444 	};
    445 
    446 	KASSERT(tpages > 0);
    447 	KASSERT(tpages <= BALLOON_DELTA);
    448 
    449 	memset(mfn_list, 0, BALLOON_DELTA * sizeof(*mfn_list));
    450 
    451 #ifndef UVM_HOTPLUG
    452 	/*
    453 	 * If the list is empty, we are deflating balloon beyond empty. This
    454 	 * is currently unsupported as this would require to dynamically add
    455 	 * new memory pages inside uvm(9) and instruct pmap(9) on how to
    456 	 * handle them. For now, we clip reservation up to the point we
    457 	 * can manage them, eg. the remaining bpg entries in the SLIST.
    458 	 * XXX find a way to hotplug memory through uvm(9)/pmap(9).
    459 	 */
    460 	if (tpages > sc->balloon_num_page_entries) {
    461 		device_printf(sc->sc_dev,
    462 		    "memory 'hot-plug' unsupported - clipping "
    463 		    "reservation %zu => %zu pages.\n",
    464 		    tpages, sc->balloon_num_page_entries);
    465 		tpages = sc->balloon_num_page_entries;
    466 	}
    467 #endif
    468 
    469 	/* reclaim pages from balloon */
    470 	set_xen_guest_handle(reservation.extent_start, mfn_list);
    471 	reservation.nr_extents = tpages;
    472 
    473 	ret = HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation);
    474 
    475 	if (ret < 0) {
    476 		panic("%s: increase reservation failed, ret %d",
    477 		    device_xname(sc->sc_dev), ret);
    478 	}
    479 
    480 	if (ret != tpages) {
    481 		device_printf(sc->sc_dev,
    482 		    "increase reservation incomplete: was %zu, "
    483 		    "returned %d\n", tpages, ret);
    484 	}
    485 
    486 	/* plug pages back into memory through bpge entries */
    487 	for (rpages = 0; rpages < ret; rpages++) {
    488 #ifdef UVM_HOTPLUG
    489 		extern paddr_t pmap_pa_end;
    490 		if (sc->balloon_num_page_entries == 0) { /*XXX: consolidate */
    491 			/* "hot-plug": Stick it at the end of memory */
    492 			pa = pmap_pa_end;
    493 
    494 			/* P2M update */
    495 #if defined(_LP64) || defined(PAE)
    496 			atomic_add_64(&pmap_pa_end, PAGE_SIZE);
    497 #else
    498 			atomic_add_32(&pmap_pa_end, PAGE_SIZE);
    499 #endif
    500 			s = splvm();
    501 			xpmap_ptom_map(pa, ptoa(mfn_list[rpages]));
    502 			xpq_queue_machphys_update(ptoa(mfn_list[rpages]), pa);
    503 			xpq_flush_queue();
    504 			splx(s);
    505 
    506 			if (uvm_physseg_plug(atop(pa), 1, NULL) == false) {
    507 				/* Undo P2M */
    508 				s = splvm();
    509 				xpmap_ptom_unmap(pa);
    510 				xpq_queue_machphys_update(ptoa(mfn_list[rpages]), 0);
    511 				xpq_flush_queue();
    512 				splx(s);
    513 #if defined(_LP64) || defined(PAE)
    514 				atomic_add_64(&pmap_pa_end, -PAGE_SIZE);
    515 #else
    516 				atomic_add_32(&pmap_pa_end, -PAGE_SIZE);
    517 #endif
    518 				break;
    519 			}
    520 			continue;
    521 		}
    522 #else
    523 		if (sc->balloon_num_page_entries == 0) {
    524 			/*
    525 			 * XXX This is the case where extra "hot-plug"
    526 			 * mem w.r.t boot comes in
    527 			 */
    528 			device_printf(sc->sc_dev,
    529 			    "List empty. Cannot be collapsed further!\n");
    530 			break;
    531 		}
    532 #endif
    533 		bpg_entry = SLIST_FIRST(&balloon_sc->balloon_page_entries);
    534 		SLIST_REMOVE_HEAD(&balloon_sc->balloon_page_entries, entry);
    535 		balloon_sc->balloon_num_page_entries--;
    536 
    537 		/* Update P->M */
    538 		pa = VM_PAGE_TO_PHYS(bpg_entry->pg);
    539 
    540 		s = splvm();
    541 		xpmap_ptom_map(pa, ptoa(mfn_list[rpages]));
    542 		xpq_queue_machphys_update(ptoa(mfn_list[rpages]), pa);
    543 		xpq_flush_queue();
    544 		splx(s);
    545 
    546 		pool_cache_put(sc->bpge_pool, bpg_entry);
    547 	}
    548 
    549 #if BALLOONDEBUG
    550 	device_printf(sc->sc_dev, "deflate %zu => deflated by %d\n",
    551 	    tpages, rpages);
    552 #endif
    553 	return rpages;
    554 }
    555 
    556 /*
    557  * The balloon thread is responsible for handling inflate/deflate balloon
    558  * requests for the current domain given the new "target" value.
    559  */
    560 static void
    561 balloon_thread(void *cookie)
    562 {
    563 	int ret;
    564 	size_t current, diff, target;
    565 	struct balloon_xenbus_softc *sc = cookie;
    566 
    567 	for/*ever*/ (;;) {
    568 		current = xenmem_get_currentreservation();
    569 
    570 		/*
    571 		 * We assume that balloon_xenbus_watcher() and
    572 		 * sysctl(9) handlers checked the sanity of the
    573 		 * new target value.
    574 		 */
    575 		mutex_enter(&sc->balloon_mtx);
    576 		target = sc->balloon_target;
    577 		if (current != target) {
    578 			/*
    579 			 * There is work to do. Inflate/deflate in
    580 			 * increments of BALLOON_DELTA pages at maximum. The
    581 			 * risk of integer wrapping is mitigated by
    582 			 * BALLOON_DELTA, which is the upper bound.
    583 			 */
    584 			mutex_exit(&sc->balloon_mtx);
    585 			diff = MIN(target - current, BALLOON_DELTA);
    586 			if (current < target)
    587 				ret = balloon_deflate(sc, diff);
    588 			else
    589 				ret = balloon_inflate(sc, diff);
    590 
    591 			if (ret != diff) {
    592 				/*
    593 				 * Something went wrong during operation.
    594 				 * Log error then feedback current value in
    595 				 * target so that thread gets back to waiting
    596 				 * for the next iteration
    597 				 */
    598 				device_printf(sc->sc_dev,
    599 				    "WARNING: balloon could not reach target "
    600 				    "%zu (current %zu)\n",
    601 				    target, current);
    602 				current = xenmem_get_currentreservation();
    603 				mutex_enter(&sc->balloon_mtx);
    604 				sc->balloon_target = current;
    605 				mutex_exit(&sc->balloon_mtx);
    606 			}
    607 		} else {
    608 			/* no need for change -- wait for a signal */
    609 			cv_wait(&sc->balloon_cv, &sc->balloon_mtx);
    610 			mutex_exit(&sc->balloon_mtx);
    611 		}
    612 	}
    613 }
    614 
    615 /*
    616  * Handler called when memory/target value changes inside Xenstore.
    617  * All sanity checks must also happen in this handler, as it is the common
    618  * entry point where controller domain schedules balloon operations.
    619  */
    620 static void
    621 balloon_xenbus_watcher(struct xenbus_watch *watch, const char **vec,
    622 		       unsigned int len)
    623 {
    624 	size_t new_target;
    625 	uint64_t target_kb, target_max, target_min;
    626 
    627 	target_kb = balloon_xenbus_read_target();
    628 	if (target_kb == 0) {
    629 		/* bogus -- just return */
    630 		return;
    631 	}
    632 
    633 	mutex_enter(&balloon_sc->balloon_mtx);
    634 	target_min = balloon_sc->balloon_res_min;
    635 	mutex_exit(&balloon_sc->balloon_mtx);
    636 	if (target_kb < target_min) {
    637 		device_printf(balloon_sc->sc_dev,
    638 		    "new target %"PRIu64" is below min %"PRIu64"\n",
    639 		    target_kb, target_min);
    640 		return;
    641 	}
    642 
    643 	target_max = BALLOON_PAGES_TO_KB(xenmem_get_maxreservation());
    644 	if (target_kb > target_max) {
    645 		/*
    646 		 * Should not happen. Hypervisor should block balloon
    647 		 * requests above mem-max.
    648 		 */
    649 		device_printf(balloon_sc->sc_dev,
    650 		    "new target %"PRIu64" is above max %"PRIu64"\n",
    651 		    target_kb, target_max);
    652 		return;
    653 	}
    654 
    655 	new_target = BALLOON_KB_TO_PAGES(target_kb);
    656 
    657 	device_printf(balloon_sc->sc_dev,
    658 	    "current reservation: %zu pages => target: %zu pages\n",
    659 	    xenmem_get_currentreservation(), new_target);
    660 
    661 	/* Only update target if its value changes */
    662 	mutex_enter(&balloon_sc->balloon_mtx);
    663 	if (balloon_sc->balloon_target != new_target) {
    664 		balloon_sc->balloon_target = new_target;
    665 		cv_signal(&balloon_sc->balloon_cv);
    666 	}
    667 	mutex_exit(&balloon_sc->balloon_mtx);
    668 
    669 	return;
    670 }
    671 
    672 /*
    673  * sysctl(9) stuff
    674  */
    675 
    676 /* routine to control the minimum memory reserved for the domain */
    677 static int
    678 sysctl_kern_xen_balloon_min(SYSCTLFN_ARGS)
    679 {
    680 	struct sysctlnode node;
    681 	u_quad_t newval;
    682 	int error;
    683 
    684 	node = *rnode;
    685 	node.sysctl_data = &newval;
    686 
    687 	mutex_enter(&balloon_sc->balloon_mtx);
    688 	newval = balloon_sc->balloon_res_min;
    689 	mutex_exit(&balloon_sc->balloon_mtx);
    690 
    691 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    692 	if (error || newp == NULL)
    693 		return error;
    694 
    695 	/* Safeguard value: refuse to go below. */
    696 	if (newval < XEN_RESERVATION_MIN) {
    697 		device_printf(balloon_sc->sc_dev,
    698 		    "cannot set min below minimum safe value (%d)\n",
    699 		    XEN_RESERVATION_MIN);
    700 		return EPERM;
    701 	}
    702 
    703 	mutex_enter(&balloon_sc->balloon_mtx);
    704 	if (balloon_sc->balloon_res_min != newval)
    705 		balloon_sc->balloon_res_min = newval;
    706 	mutex_exit(&balloon_sc->balloon_mtx);
    707 
    708 	return 0;
    709 }
    710 
    711 /* Returns the maximum memory reservation of the domain */
    712 static int
    713 sysctl_kern_xen_balloon_max(SYSCTLFN_ARGS)
    714 {
    715 	struct sysctlnode node;
    716 	u_quad_t node_val;
    717 
    718 	node = *rnode;
    719 
    720 	node_val = BALLOON_PAGES_TO_KB(xenmem_get_maxreservation());
    721 	node.sysctl_data = &node_val;
    722 	return sysctl_lookup(SYSCTLFN_CALL(&node));
    723 }
    724 
    725 /* Returns the current memory reservation of the domain */
    726 static int
    727 sysctl_kern_xen_balloon_current(SYSCTLFN_ARGS)
    728 {
    729 	struct sysctlnode node;
    730 	u_quad_t node_val;
    731 
    732 	node = *rnode;
    733 
    734 	node_val = BALLOON_PAGES_TO_KB(xenmem_get_currentreservation());
    735 	node.sysctl_data = &node_val;
    736 	return sysctl_lookup(SYSCTLFN_CALL(&node));
    737 }
    738 
    739 /*
    740  * Returns the target memory reservation of the domain
    741  * When reading, this sysctl will return the value of the balloon_target
    742  * variable, converted into KiB
    743  * When used for writing, it will update the new memory/target value
    744  * in XenStore, but will not update the balloon_target variable directly.
    745  * This will be done by the Xenbus watch handler, balloon_xenbus_watcher().
    746  */
    747 static int
    748 sysctl_kern_xen_balloon_target(SYSCTLFN_ARGS)
    749 {
    750 	struct sysctlnode node;
    751 	u_quad_t newval, res_min, res_max;
    752 	int error;
    753 
    754 	node = *rnode;
    755 	node.sysctl_data = &newval;
    756 
    757 	mutex_enter(&balloon_sc->balloon_mtx);
    758 	newval = BALLOON_PAGES_TO_KB(balloon_sc->balloon_target);
    759 	res_min = balloon_sc->balloon_res_min;
    760 	mutex_exit(&balloon_sc->balloon_mtx);
    761 
    762 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    763 	if (newp == NULL || error != 0) {
    764 		return error;
    765 	}
    766 
    767 	/*
    768 	 * Sanity check new size
    769 	 * We should not balloon below the minimum reservation
    770 	 * set by the domain, nor above the maximum reservation set
    771 	 * by domain controller.
    772 	 * Note: domain is not supposed to receive balloon requests when
    773 	 * they are above maximum reservation, but better be safe than
    774 	 * sorry.
    775 	 */
    776 	res_max = BALLOON_PAGES_TO_KB(xenmem_get_maxreservation());
    777 	if (newval < res_min || newval > res_max) {
    778 #if BALLOONDEBUG
    779 		device_printf(balloon_sc->sc_dev,
    780 		    "new value out of bounds: %"PRIu64"\n", newval);
    781 		device_printf(balloon_sc->sc_dev,
    782 		    "min %"PRIu64", max %"PRIu64"\n", res_min, res_max);
    783 #endif
    784 		return EPERM;
    785 	}
    786 
    787 	/*
    788 	 * Write new value inside Xenstore. This will fire the memory/target
    789 	 * watch handler, balloon_xenbus_watcher().
    790 	 */
    791 	balloon_xenbus_write_target(newval);
    792 
    793 	return 0;
    794 }
    795 
    796 /* sysctl(9) nodes creation */
    797 static void
    798 sysctl_kern_xen_balloon_setup(struct balloon_xenbus_softc *sc)
    799 {
    800 	const struct sysctlnode *node = NULL;
    801 	struct sysctllog **clog = &sc->sc_log;
    802 
    803 	sysctl_createv(clog, 0, NULL, &node,
    804 	    CTLFLAG_PERMANENT,
    805 	    CTLTYPE_NODE, "machdep", NULL,
    806 	    NULL, 0, NULL, 0,
    807 	    CTL_MACHDEP, CTL_EOL);
    808 
    809 	sysctl_createv(clog, 0, &node, &node,
    810 	    CTLFLAG_PERMANENT,
    811 	    CTLTYPE_NODE, "xen",
    812 	    SYSCTL_DESCR("Xen top level node"),
    813 	    NULL, 0, NULL, 0,
    814 	    CTL_CREATE, CTL_EOL);
    815 
    816 	sysctl_createv(clog, 0, &node, &node,
    817 	    CTLFLAG_PERMANENT,
    818 	    CTLTYPE_NODE, "balloon",
    819 	    SYSCTL_DESCR("Balloon details"),
    820 	    NULL, 0, NULL, 0,
    821 	    CTL_CREATE, CTL_EOL);
    822 
    823 	sysctl_createv(clog, 0, &node, NULL,
    824 	    CTLFLAG_PERMANENT | CTLFLAG_READONLY,
    825 	    CTLTYPE_QUAD, "current",
    826 	    SYSCTL_DESCR("Domain's current memory reservation from "
    827 		"hypervisor, in KiB."),
    828 	    sysctl_kern_xen_balloon_current, 0, NULL, 0,
    829 	    CTL_CREATE, CTL_EOL);
    830 
    831 	sysctl_createv(clog, 0, &node, NULL,
    832 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    833 	    CTLTYPE_QUAD, "target",
    834 	    SYSCTL_DESCR("Target memory reservation for domain, in KiB."),
    835 	    sysctl_kern_xen_balloon_target, 0, NULL, 0,
    836 	    CTL_CREATE, CTL_EOL);
    837 
    838 	sysctl_createv(clog, 0, &node, NULL,
    839 	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
    840 	    CTLTYPE_QUAD, "min",
    841 	    SYSCTL_DESCR("Minimum amount of memory the domain "
    842 		"reserves, in KiB."),
    843 	    sysctl_kern_xen_balloon_min, 0, NULL, 0,
    844 	    CTL_CREATE, CTL_EOL);
    845 
    846 	sysctl_createv(clog, 0, &node, NULL,
    847 	    CTLFLAG_PERMANENT | CTLFLAG_READONLY,
    848 	    CTLTYPE_QUAD, "max",
    849 	    SYSCTL_DESCR("Maximum amount of memory the domain "
    850 		"can use, in KiB."),
    851 	    sysctl_kern_xen_balloon_max, 0, NULL, 0,
    852 	    CTL_CREATE, CTL_EOL);
    853 }
    854