1 /******************************************************************************
4 * Xen balloon driver - enables returning/claiming memory to/from Xen.
6 * Copyright (c) 2003, B Dragovic
7 * Copyright (c) 2003-2004, M Williamson, K Fraser
8 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License version 2
12 * as published by the Free Software Foundation; or, when distributed
13 * separately from the Linux kernel or incorporated into other
14 * software packages, subject to the following license:
16 * Permission is hereby granted, free of charge, to any person obtaining a copy
17 * of this source file (the "Software"), to deal in the Software without
18 * restriction, including without limitation the rights to use, copy, modify,
19 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
20 * and to permit persons to whom the Software is furnished to do so, subject to
21 * the following conditions:
23 * The above copyright notice and this permission notice shall be included in
24 * all copies or substantial portions of the Software.
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
31 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35 #include <linux/kernel.h>
36 #include <linux/module.h>
37 #include <linux/sched.h>
38 #include <linux/errno.h>
39 #include <linux/list.h>
41 #include <linux/swap.h>
42 #include <linux/bootmem.h>
43 #include <linux/highmem.h>
44 #include <linux/slab.h>
45 #include <linux/mutex.h>
46 #include <xen/xen_proc.h>
47 #include <asm/hypervisor.h>
48 #include <xen/balloon.h>
49 #include <xen/interface/memory.h>
50 #include <asm/maddr.h>
52 #include <asm/pgalloc.h>
53 #include <asm/pgtable.h>
54 #include <asm/uaccess.h>
56 #include <xen/xenbus.h>
59 #ifdef HAVE_XEN_PLATFORM_COMPAT_H
60 #include <xen/platform-compat.h>
/* File-scope balloon state.
 * NOTE(review): this listing has gaps (embedded line numbers jump); the
 * #ifdef guards that select between the two inc/dec_totalhigh_pages and
 * totalram_bias definitions are missing from view — presumably
 * CONFIG_HIGHMEM and a Xen/HVM condition; confirm against the full file. */
64 static struct proc_dir_entry *balloon_pde;
/* Serializes the balloon worker and other whole-operation callers. */
67 static DEFINE_MUTEX(balloon_mutex);
70  * Protects atomic reservation decrease/increase against concurrent increases.
71  * Also protects non-atomic updates of current_pages and driver_pages, and
74 DEFINE_SPINLOCK(balloon_lock);
/* Global counters (current_pages, target_pages, balloon_low/high, ...). */
76 struct balloon_stats balloon_stats;
78 /* We increase/decrease in batches which fit in a page */
79 static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
/* Highmem accounting helpers: real updates when highmem exists ... */
82 #define inc_totalhigh_pages() (totalhigh_pages++)
83 #define dec_totalhigh_pages() (totalhigh_pages--)
/* ... and no-ops otherwise (guard lines missing from this view). */
85 #define inc_totalhigh_pages() ((void)0)
86 #define dec_totalhigh_pages() ((void)0)
91  * In HVM guests accounting here uses the Xen visible values, but the kernel
92  * determined totalram_pages value shouldn't get altered. Since totalram_pages
93  * includes neither the kernel static image nor any memory allocated prior to
94  * or from the bootmem allocator, we have to synchronize the two values.
96 static unsigned long __read_mostly totalram_bias;
/* Alternate build: no bias needed (guard lines missing from this view). */
98 #define totalram_bias 0
101 /* List of ballooned pages, threaded through the mem_map array. */
102 static LIST_HEAD(ballooned_pages);
104 /* Main work function, always executed in process context. */
105 static void balloon_process(struct work_struct *unused);
106 static DECLARE_WORK(balloon_worker, balloon_process);
108 /* When ballooning out (allocating memory to return to Xen) we don't really
109    want the kernel to try too hard since that can trigger the oom killer. */
110 #define GFP_BALLOON (GFP_HIGHUSER|__GFP_NOWARN|__GFP_NORETRY|__GFP_NOMEMALLOC|\
111 			__GFP_NOTRACK|__GFP_COLD)
/* Ballooned pages are threaded through page->lru (page is off the LRU). */
113 #define PAGE_TO_LIST(p)	(&(p)->lru)
114 #define LIST_TO_PAGE(l)	list_entry((l), struct page, lru)
/* NOTE(review): the do { ... } while (0) wrapper lines of UNLIST_PAGE are
 * missing from this listing; the list_del plus pointer-poisoning body is
 * what remains visible. */
115 #define UNLIST_PAGE(p)				\
117 		list_del(PAGE_TO_LIST(p));	\
118 		PAGE_TO_LIST(p)->next = NULL;	\
119 		PAGE_TO_LIST(p)->prev = NULL;	\
/* Driver-prefixed info/warning printk helpers. */
122 #define IPRINTK(fmt, args...) pr_info("xen_mem: " fmt, ##args)
123 #define WPRINTK(fmt, args...) pr_warning("xen_mem: " fmt, ##args)
125 /* balloon_append: add the given page to the balloon. */
125 /* balloon_append: add the given page to the balloon. */
/* Marks @page Reserved, invalidates its P2M entry and queues it on
 * ballooned_pages (highmem at tail so lowmem is re-populated first).
 * NOTE(review): listing has gaps — the branch structure and the use of the
 * @account flag (presumably gating the totalhigh/present_pages bookkeeping)
 * are only partially visible; confirm against the full file. */
126 static void balloon_append(struct page *page, int account)
130 	/* Lowmem is re-populated first, so highmem pages go at list tail. */
131 	if (PageHighMem(page)) {
132 		list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
135 			dec_totalhigh_pages();
137 		list_add(PAGE_TO_LIST(page), &ballooned_pages);
141 	pfn = page_to_pfn(page);
143 		SetPageReserved(page);
144 		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
145 		page_zone(page)->present_pages--;
147 		BUG_ON(!PageReserved(page));
148 		WARN_ON_ONCE(phys_to_machine_mapping_valid(pfn));
152 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
152 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
/* Pops the head of ballooned_pages and re-adds it to zone accounting.
 * *@was_empty is OR-ed with "the page's zone had no populated pages",
 * telling the caller a zonelist rebuild is needed.
 * NOTE(review): unlink of the page and the NULL return on the empty list
 * are in lines missing from this view. */
153 static struct page *balloon_retrieve(int *was_empty)
158 	if (list_empty(&ballooned_pages))
161 	page = LIST_TO_PAGE(ballooned_pages.next);
163 	BUG_ON(!PageReserved(page));
165 	if (PageHighMem(page)) {
167 		inc_totalhigh_pages();
171 	zone = page_zone(page);
172 	*was_empty |= !populated_zone(zone);
173 	zone->present_pages++;
/* Peek at the head of the balloon list without removing it
 * (NULL when the balloon is empty — return line missing from view). */
178 static struct page *balloon_first_page(void)
180 	if (list_empty(&ballooned_pages))
182 	return LIST_TO_PAGE(ballooned_pages.next);
/* Successor of @page on the balloon list, or NULL at the end
 * (the NULL return line is missing from this view). */
185 static struct page *balloon_next_page(struct page *page)
187 	struct list_head *next = PAGE_TO_LIST(page)->next;
188 	if (next == &ballooned_pages)
190 	return LIST_TO_PAGE(next);
/* Drop the last reference and hand the page back to the page allocator.
 * Uses free_hot_cold_page() directly where available (cold == 1). */
193 static inline void balloon_free_page(struct page *page)
196 	if (put_page_testzero(page))
197 		free_hot_cold_page(page, 1);
199 	/* free_hot_cold_page() is not being exported. */
/* Timer callback: re-kick the balloon worker after a retry delay. */
204 static void balloon_alarm(unsigned long unused)
206 	schedule_work(&balloon_worker);
208 static DEFINE_TIMER(balloon_timer, balloon_alarm, 0, 0);
/* Target clamped to what is reachable right now: we cannot exceed
 * current_pages plus everything currently held in the balloon. */
210 static unsigned long current_target(void)
212 	unsigned long target = bs.target_pages;
213 	if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
214 		target = bs.current_pages + bs.balloon_low + bs.balloon_high;
/* Lower bound on the balloon target so the guest keeps enough memory to
 * stay functional; piecewise-linear in the machine size (see table that
 * was in the elided comment lines). Never forces growth beyond the
 * current target. */
218 unsigned long balloon_minimum_target(void)
221 #define max_pfn num_physpages
223 	unsigned long min_pages, curr_pages = current_target();
225 #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
226 	/* Simple continuous piecewise linear function:
227 	 *  max MiB -> min MiB gradient
238 	if (max_pfn < MB2PAGES(128))
239 		min_pages = MB2PAGES(8) + (max_pfn >> 1);
240 	else if (max_pfn < MB2PAGES(512))
241 		min_pages = MB2PAGES(40) + (max_pfn >> 2);
242 	else if (max_pfn < MB2PAGES(2048))
243 		min_pages = MB2PAGES(104) + (max_pfn >> 3);
245 		min_pages = MB2PAGES(296) + (max_pfn >> 5);
248 	/* Don't enforce growth */
249 	return min(min_pages, curr_pages);
/* Reclaim up to @nr_pages from Xen (balloon "inflate back into guest").
 * Collects candidate pfns from the balloon list, asks Xen to populate
 * them (XENMEM_populate_physmap), wires the returned frames into the
 * P2M (and low-mem page tables) and releases the pages to the allocator.
 * Returns rc < 0 on hypercall error, 0 on full success, 1 when Xen gave
 * fewer pages than requested.
 * NOTE(review): listing has gaps — variable declarations, balloon_lock
 * acquisition and several braces are in lines missing from this view.
 * Fix: removed stray double semicolon after page_to_pfn(page). */
255 static int increase_reservation(unsigned long nr_pages)
257 	unsigned long pfn, i, flags;
260 	int need_zonelists_rebuild = 0;
261 	struct xen_memory_reservation reservation = {
/* Batch size is bounded by the static frame_list page. */
267 	if (nr_pages > ARRAY_SIZE(frame_list))
268 		nr_pages = ARRAY_SIZE(frame_list);
272 	page = balloon_first_page();
273 	for (i = 0; i < nr_pages; i++) {
274 		BUG_ON(page == NULL);
275 		frame_list[i] = page_to_pfn(page);
276 		page = balloon_next_page(page);
279 	set_xen_guest_handle(reservation.extent_start, frame_list);
280 	reservation.nr_extents = nr_pages;
/* rc is the number of extents Xen actually populated (may be short). */
281 	rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
285 	for (i = 0; i < rc; i++) {
286 		page = balloon_retrieve(&need_zonelists_rebuild);
287 		BUG_ON(page == NULL);
289 		pfn = page_to_pfn(page);
290 		BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
291 		       phys_to_machine_mapping_valid(pfn));
293 		set_phys_to_machine(pfn, frame_list[i]);
296 		/* Link back into the page tables if not highmem. */
297 		if (pfn < max_low_pfn) {
299 			ret = HYPERVISOR_update_va_mapping(
300 				(unsigned long)__va(pfn << PAGE_SHIFT),
301 				pfn_pte_ma(frame_list[i], PAGE_KERNEL),
307 		/* Relinquish the page back to the allocator. */
308 		ClearPageReserved(page);
309 		init_page_count(page);
310 		balloon_free_page(page);
313 	bs.current_pages += rc;
314 	totalram_pages = bs.current_pages - totalram_bias;
317 	balloon_unlock(flags);
320 		setup_per_zone_wmarks();
/* A previously-empty zone came back: rebuild the zonelists. */
323 	if (need_zonelists_rebuild)
324 		build_all_zonelists(NULL);
326 	vm_total_pages = nr_free_pagecache_pages();
329 	return rc < 0 ? rc : rc != nr_pages;
/* Give up to @nr_pages back to Xen (balloon "inflate"). Allocates pages
 * with GFP_BALLOON, scrubs them, unmaps low-mem kernel mappings, moves
 * them onto the balloon list and then issues one
 * XENMEM_decrease_reservation hypercall for the whole batch.
 * NOTE(review): listing has gaps — declarations, the lock acquisition,
 * the highmem kmap handling and the allocation-failure path are in lines
 * missing from this view. */
332 static int decrease_reservation(unsigned long nr_pages)
334 	unsigned long pfn, i, flags;
339 	struct xen_memory_reservation reservation = {
345 	if (nr_pages > ARRAY_SIZE(frame_list))
346 		nr_pages = ARRAY_SIZE(frame_list);
348 	for (i = 0; i < nr_pages; i++) {
349 		if ((page = alloc_page(GFP_BALLOON)) == NULL) {
355 		pfn = page_to_pfn(page);
356 		frame_list[i] = pfn_to_mfn(pfn);
358 		if (!PageHighMem(page)) {
/* Scrub before the frame leaves this domain, then drop the mapping. */
359 			v = phys_to_virt(pfn << PAGE_SHIFT);
360 			xen_scrub_pages(v, 1);
362 			ret = HYPERVISOR_update_va_mapping(
363 				(unsigned long)v, __pte_ma(0), 0);
367 #ifdef CONFIG_XEN_SCRUB_PAGES
370 			xen_scrub_pages(v, 1);
377 	/* Ensure that ballooned highmem pages don't have kmaps. */
384 	/* No more mappings: invalidate P2M and add to balloon. */
385 	for (i = 0; i < nr_pages; i++) {
386 		pfn = mfn_to_pfn(frame_list[i]);
387 		balloon_append(pfn_to_page(pfn), 1);
390 	set_xen_guest_handle(reservation.extent_start, frame_list);
391 	reservation.nr_extents = nr_pages;
392 	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
393 	BUG_ON(ret != nr_pages);
395 	bs.current_pages -= nr_pages;
396 	totalram_pages = bs.current_pages - totalram_bias;
398 	balloon_unlock(flags);
404  * We avoid multiple worker processes conflicting via the balloon mutex.
405  * We may of course race updates of the target counts (which are protected
406  * by the balloon lock), or with changes to the Xen hard limit, but we will
407  * recover from these in time.
/* Worker: repeatedly step current_pages toward the target, sleeping/
 * retrying via balloon_timer when a step fails or falls short. */
409 static void balloon_process(struct work_struct *unused)
414 	mutex_lock(&balloon_mutex);
/* credit > 0: reclaim pages from Xen; credit < 0: give pages back. */
417 		credit = current_target() - bs.current_pages;
419 			need_sleep = (increase_reservation(credit) != 0);
421 			need_sleep = (decrease_reservation(-credit) != 0);
423 #ifndef CONFIG_PREEMPT
427 	} while ((credit != 0) && !need_sleep);
429 	/* Schedule more work if there is some still to be done. */
430 	if (current_target() != bs.current_pages)
431 		mod_timer(&balloon_timer, jiffies + HZ);
433 	mutex_unlock(&balloon_mutex);
436 /* Resets the Xen limit, sets new target, and kicks off processing. */
/* @target is in pages; clamped below by balloon_minimum_target(). */
437 void balloon_set_new_target(unsigned long target)
439 	/* No need for lock. Not read-modify-write updates. */
440 	bs.target_pages = max(target, balloon_minimum_target());
441 	schedule_work(&balloon_worker);
/* Xenstore watch on memory/target — toolstack writes the new target here. */
444 static struct xenbus_watch target_watch =
446 	.node = "memory/target"
449 /* React to a change in the target key */
450 static void watch_target(struct xenbus_watch *watch,
451 			 const char **vec, unsigned int len)
453 	unsigned long long new_target;
456 	err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
458 		/* This is ok (for domain0 at least) - so just return */
462 	/* The given memory/target value is in KiB, so it needs converting to
463 	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
465 	balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
/* Xenstore-ready notifier callback: install the memory/target watch. */
468 static int balloon_init_watcher(struct notifier_block *notifier,
474 	err = register_xenbus_watch(&target_watch);
476 		pr_err("Failed to set balloon watcher\n");
481 #ifdef CONFIG_PROC_FS
/* /proc/xen/balloon write handler: parse a memparse()-style size (bytes,
 * with optional k/m/g suffix) and set it as the new balloon target.
 * Requires CAP_SYS_ADMIN.
 * NOTE(review): when count == sizeof(memstring) the forced terminator
 * below overwrites the last input byte (silent truncation) — appears
 * intentional but confirm against the full file. */
482 static int balloon_write(struct file *file, const char __user *buffer,
483 			 unsigned long count, void *data)
485 	char memstring[64], *endchar;
486 	unsigned long long target_bytes;
488 	if (!capable(CAP_SYS_ADMIN))
492 		return -EBADMSG; /* runt */
493 	if (count > sizeof(memstring))
494 		return -EFBIG;   /* too long */
496 	if (copy_from_user(memstring, buffer, count))
498 	memstring[sizeof(memstring)-1] = '\0';
500 	target_bytes = memparse(memstring, &endchar);
501 	balloon_set_new_target(target_bytes >> PAGE_SHIFT);
/* /proc/xen/balloon read handler: dump the balloon counters in KiB.
 * (sprintf target/len handling is in lines missing from this view.) */
506 static int balloon_read(char *page, char **start, off_t off,
507 			int count, int *eof, void *data)
513 		"Current allocation: %8lu kB\n"
514 		"Requested target:   %8lu kB\n"
515 		"Minimum target:     %8lu kB\n"
516 		"Maximum target:     %8lu kB\n"
517 		"Low-mem balloon:    %8lu kB\n"
518 		"High-mem balloon:   %8lu kB\n"
519 		"Driver pages:       %8lu kB\n",
520 		PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages),
521 		PAGES2KB(balloon_minimum_target()), PAGES2KB(num_physpages),
522 		PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high),
523 		PAGES2KB(bs.driver_pages));
531 static struct notifier_block xenstore_notifier;
/* Driver init: establish current_pages/totalram_bias (working around a
 * pre-3.4.0 Xen memory_op command-masking quirk — see comment below),
 * create the /proc and sysfs interfaces, seed the balloon with memory
 * above nr_pages, and register the xenstore target watch.
 * NOTE(review): listing has gaps — local declarations, several #else/
 * #endif lines and error paths are missing from this view. */
533 static int __init balloon_init(void)
535 #if !defined(CONFIG_XEN)
/* Unprivileged/PVHVM build: probe populate-on-demand target support. */
536 # ifndef XENMEM_get_pod_target
537 # define XENMEM_get_pod_target 17
538 	typedef struct xen_pod_target {
539 		uint64_t target_pages;
541 		uint64_t pod_cache_pages;
542 		uint64_t pod_entries;
546 	xen_pod_target_t pod_target = { .domid = DOMID_SELF };
548 #elif defined(CONFIG_X86)
553 	if (!is_running_on_xen())
556 	IPRINTK("Initialising balloon driver.\n");
559 	bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
560 	totalram_pages   = bs.current_pages;
562 	rc = HYPERVISOR_memory_op(XENMEM_get_pod_target, &pod_target);
564 	 * Xen prior to 3.4.0 masks the memory_op command to 4 bits, thus
565 	 * converting XENMEM_get_pod_target to XENMEM_decrease_reservation.
566 	 * Fortunately this results in a request with all input fields zero,
567 	 * but (due to the way bit 4 and upwards get interpreted) a starting
568 	 * extent of 1. When start_extent > nr_extents (>= in newer Xen), we
569 	 * simply get start_extent returned.
571 	totalram_bias = HYPERVISOR_memory_op(rc != -ENOSYS && rc != 1
572 		? XENMEM_maximum_reservation : XENMEM_current_reservation,
574 	if ((long)totalram_bias != -ENOSYS) {
575 		BUG_ON(totalram_bias < totalram_pages);
576 		bs.current_pages = totalram_bias;
577 		totalram_bias -= totalram_pages;
580 		bs.current_pages = totalram_pages;
583 	bs.target_pages  = bs.current_pages;
586 	bs.driver_pages  = 0UL;
588 #ifdef CONFIG_PROC_FS
589 	if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
590 		WPRINTK("Unable to create /proc/xen/balloon.\n");
594 	balloon_pde->read_proc  = balloon_read;
595 	balloon_pde->write_proc = balloon_write;
597 	balloon_sysfs_init();
599 #if defined(CONFIG_X86) && defined(CONFIG_XEN)
600 	/* Initialise the balloon with excess memory space. */
601 	for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
602 		page = pfn_to_page(pfn);
603 		if (!PageReserved(page)) {
604 			SetPageReserved(page);
605 			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
606 			balloon_append(page, 0);
611 	target_watch.callback = watch_target;
612 	xenstore_notifier.notifier_call = balloon_init_watcher;
614 	register_xenstore_notifier(&xenstore_notifier);
619 subsys_initcall(balloon_init);
/* Module teardown: remove sysfs entries. Ballooned pages are not
 * returned (see the XXX below). */
621 static void __exit balloon_exit(void)
623 	balloon_sysfs_exit();
624 	/* XXX - release balloon here */
627 module_exit(balloon_exit);
/* Adjust the count of pages other Xen drivers have borrowed from the
 * balloon by @delta (positive or negative), under balloon_lock
 * (the lock-acquire line is missing from this view). */
629 void balloon_update_driver_allowance(long delta)
634 	bs.driver_pages += delta;
635 	balloon_unlock(flags);
637 EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
639 #if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
/* apply_to_page_range() callback: detach the machine frame behind one
 * kernel PTE — clear the PTE, invalidate the P2M entry, mark the page
 * Reserved and return the single frame to Xen. */
642 static int dealloc_pte_fn(
643 	pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
645 	unsigned long pfn, mfn = pte_mfn(*pte);
647 	struct xen_memory_reservation reservation = {
652 	set_xen_guest_handle(reservation.extent_start, &mfn);
653 	set_pte_at(&init_mm, addr, pte, __pte_ma(0));
654 	pfn = __pa(addr) >> PAGE_SHIFT;
655 	set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
656 	SetPageReserved(pfn_to_page(pfn));
657 	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
/* Produce @nr_pages lowmem pages whose machine frames have been handed
 * back to Xen (empty pseudophysical space), for backend drivers to map
 * foreign frames into. Prefers pages already in the balloon; otherwise
 * allocates fresh pages, scrubs them and strips their frames.
 * NOTE(review): listing has gaps — declarations, error/unwind paths and
 * several braces are in lines missing from this view; the failure
 * labels appear near the end. */
663 struct page **alloc_empty_pages_and_pagevec(int nr_pages)
667 	struct page *page, **pagevec;
670 	pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
674 	for (i = 0; i < nr_pages; i++) {
676 		page = balloon_first_page();
677 		if (page && !PageHighMem(page)) {
680 			balloon_unlock(flags);
684 		balloon_unlock(flags);
686 		page = pagevec[i] = alloc_page(GFP_KERNEL|__GFP_NOTRACK|__GFP_COLD);
690 		v = page_address(page);
691 		xen_scrub_pages(v, 1);
/* Auto-translated guests: just decrease the reservation for this gmfn. */
695 		if (xen_feature(XENFEAT_auto_translated_physmap)) {
696 			unsigned long gmfn = page_to_pfn(page);
697 			struct xen_memory_reservation reservation = {
702 			set_xen_guest_handle(reservation.extent_start, &gmfn);
703 			ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
706 			ret = 0; /* success */
/* PV guests: strip the frame via the PTE walker. */
709 			ret = apply_to_page_range(&init_mm, (unsigned long)v,
710 						  PAGE_SIZE, dealloc_pte_fn,
713 			/* Cannot handle non-auto translate mode. */
719 			balloon_free_page(page);
720 			balloon_unlock(flags);
724 		totalram_pages = --bs.current_pages - totalram_bias;
725 		if (PageHighMem(page))
726 			dec_totalhigh_pages();
727 		page_zone(page)->present_pages--;
729 		balloon_unlock(flags);
733 	schedule_work(&balloon_worker);
/* Error unwind: push any pages taken so far back into the balloon. */
742 		balloon_append(pagevec[i], 0);
743 	balloon_unlock(flags);
748 EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
750 #endif /* CONFIG_XEN_BACKEND */
/* Common helper: return @nr_pages empty pages to the balloon list and
 * drop them from current_pages/totalram accounting. @account is passed
 * through to balloon_append() (the trailing parameter/brace lines are
 * missing from this view). */
753 static void _free_empty_pages(struct page **pagevec, int nr_pages,
760 	for (i = 0; i < nr_pages; i++) {
761 		BUG_ON(page_count(pagevec[i]) != 1);
762 		balloon_append(pagevec[i], account);
765 	bs.current_pages -= nr_pages;
766 	totalram_pages = bs.current_pages - totalram_bias;
769 	balloon_unlock(flags);
770 	schedule_work(&balloon_worker);
/* Public wrapper: free empty pages with full zone accounting. */
773 void free_empty_pages(struct page **pagevec, int nr_pages)
775 	_free_empty_pages(pagevec, nr_pages, true);
779 #if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
/* Backend counterpart of alloc_empty_pages_and_pagevec(): return the
 * pages without re-accounting (account == false) and free the vector
 * (kfree is in a line missing from this view). */
780 void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
783 	_free_empty_pages(pagevec, nr_pages, false);
787 EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
/* Give a single driver-held page back to the balloon and kick the
 * worker (lock-acquire line missing from this view). */
790 void balloon_release_driver_page(struct page *page)
795 	balloon_append(page, 1);
796 	totalram_pages = --bs.current_pages - totalram_bias;
798 	balloon_unlock(flags);
800 	schedule_work(&balloon_worker);
802 EXPORT_SYMBOL_GPL(balloon_release_driver_page);
804 MODULE_LICENSE("Dual BSD/GPL");