/******************************************************************************
 * Update page tables via the hypervisor.
 *
 * Copyright (c) 2002-2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/hypervisor.h>
#include <xen/balloon.h>
#include <xen/features.h>
#include <xen/interface/memory.h>
#include <xen/interface/vcpu.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <linux/highmem.h>
#include <linux/bootmem.h> /* for max_pfn */

EXPORT_SYMBOL(hypercall_page);

shared_info_t *__read_mostly HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
#ifndef CONFIG_XEN_VCPU_INFO_PLACEMENT
EXPORT_SYMBOL(HYPERVISOR_shared_info);
#else
DEFINE_PER_CPU(struct vcpu_info, vcpu_info) __aligned(sizeof(struct vcpu_info));
EXPORT_PER_CPU_SYMBOL(vcpu_info);
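
/*
 * Register this CPU's vcpu_info area with the hypervisor so that it is
 * updated in place rather than through the shared_info page. This can be
 * done only once per (v)CPU.
 */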
void __ref setup_vcpu_info(unsigned int cpu)
{
	struct vcpu_info *v = &per_cpu(vcpu_info, cpu);
	struct vcpu_register_vcpu_info info;
	static bool first = true;

	if (first) {
		first = false;
		info.mfn = early_arbitrary_virt_to_mfn(v);
	} else
		info.mfn = arbitrary_virt_to_mfn(v);
	info.offset = offset_in_page(v);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info))
		BUG();
}
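
/*
 * The boot CPU initially runs on the vcpu_info copy inside the startup
 * per-CPU area (__per_cpu_load). Since the registration above is one-shot,
 * switch to the final per-CPU location by swapping the MFNs backing the two
 * pages and fixing up the P2M/M2P entries accordingly.
 */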
void __init adjust_boot_vcpu_info(void)
{
	unsigned long lpfn, rpfn, lmfn, rmfn;
	pte_t *lpte, *rpte;
	unsigned int level;
	mmu_update_t mmu[2];

	/*
	 * setup_vcpu_info() cannot be used more than once for a given (v)CPU,
	 * hence we must swap the underlying MFNs of the two pages holding old
	 * and new vcpu_info of the boot CPU.
	 *
	 * Do *not* use __get_cpu_var() or this_cpu_{write,...}() here, as the
	 * per-CPU segment didn't get reloaded yet. Using this_cpu_read(), as
	 * in arch_use_lazy_mmu_mode(), though undesirable, is safe except for
	 * the accesses to variables that were updated in setup_percpu_areas().
	 */
	lpte = lookup_address((unsigned long)&vcpu_info
			      + (__per_cpu_load - __per_cpu_start),
			      &level);
	rpte = lookup_address((unsigned long)&per_cpu(vcpu_info, 0), &level);
	BUG_ON(!lpte || !(pte_flags(*lpte) & _PAGE_PRESENT));
	BUG_ON(!rpte || !(pte_flags(*rpte) & _PAGE_PRESENT));
	lmfn = __pte_mfn(*lpte);
	rmfn = __pte_mfn(*rpte);

	lpfn = mfn_to_local_pfn(lmfn);
	rpfn = mfn_to_local_pfn(rmfn);

	pr_info("Swapping MFNs for PFN %lx and %lx (MFN %lx and %lx)\n",
		lpfn, rpfn, lmfn, rmfn);

	xen_l1_entry_update(lpte, pfn_pte_ma(rmfn, pte_pgprot(*lpte)));
	xen_l1_entry_update(rpte, pfn_pte_ma(lmfn, pte_pgprot(*rpte)));
	if (HYPERVISOR_update_va_mapping((unsigned long)__va(lpfn<<PAGE_SHIFT),
					 pfn_pte_ma(rmfn, PAGE_KERNEL_RO), 0))
		BUG();
	if (HYPERVISOR_update_va_mapping((unsigned long)__va(rpfn<<PAGE_SHIFT),
					 pfn_pte_ma(lmfn, PAGE_KERNEL),
					 UVMF_TLB_FLUSH))
		BUG();

	set_phys_to_machine(lpfn, rmfn);
	set_phys_to_machine(rpfn, lmfn);

	/* Keep the M2P table consistent with the two P2M updates above. */
	mmu[0].ptr = ((uint64_t)lmfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
	mmu[0].val = rpfn;
	mmu[1].ptr = ((uint64_t)rmfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
	mmu[1].val = lpfn;
	if (HYPERVISOR_mmu_update(mmu, 2, NULL, DOMID_SELF))
		BUG();

	/*
	 * Copy over all contents of the page just replaced, except for the
	 * vcpu_info itself, as it may have got updated after having been
	 * copied from __per_cpu_load[].
	 */
	memcpy(__va(rpfn << PAGE_SHIFT),
	       __va(lpfn << PAGE_SHIFT),
	       (unsigned long)&vcpu_info & (PAGE_SIZE - 1));
	level = (unsigned long)(&vcpu_info + 1) & (PAGE_SIZE - 1);
	if (level)
		memcpy(__va(rpfn << PAGE_SHIFT) + level,
		       __va(lpfn << PAGE_SHIFT) + level,
		       PAGE_SIZE - level);
}
#endif /* CONFIG_XEN_VCPU_INFO_PLACEMENT */
#define NR_MC		BITS_PER_LONG
#define NR_MMU		BITS_PER_LONG
#define NR_MMUEXT	(BITS_PER_LONG / 4)

DEFINE_PER_CPU(bool, xen_lazy_mmu);
struct lazy_mmu {
	unsigned int nr_mc, nr_mmu, nr_mmuext;
	multicall_entry_t mc[NR_MC];
	mmu_update_t mmu[NR_MMU];
	struct mmuext_op mmuext[NR_MMUEXT];
};
static DEFINE_PER_CPU(struct lazy_mmu, lazy_mmu);

static inline bool use_lazy_mmu_mode(void)
{
#ifdef CONFIG_PREEMPT
	if (!preempt_count())
		return false;
#endif
	return arch_use_lazy_mmu_mode();
}

static void multicall_failed(const multicall_entry_t *mc, int rc)
{
	pr_emerg("hypercall#%lu(%lx, %lx, %lx, %lx) failed: %d"
		 " (caller %lx)\n",
		 mc->op, mc->args[0], mc->args[1], mc->args[2], mc->args[3],
		 rc, mc->args[5]);
	BUG();
}

static int _xen_multicall_flush(bool ret_last)
{
	struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
	multicall_entry_t *mc = lazy->mc;
	unsigned int count = lazy->nr_mc;

	if (!count)
		return 0;

	lazy->nr_mc = 0;
	lazy->nr_mmu = 0;
	lazy->nr_mmuext = 0;

	if (count == 1) {
		int rc = _hypercall(int, mc->op, mc->args[0], mc->args[1],
				    mc->args[2], mc->args[3], mc->args[4]);

		if (unlikely(rc)) {
			if (ret_last)
				return rc;
			multicall_failed(mc, rc);
		}
	} else {
		if (HYPERVISOR_multicall(mc, count))
			BUG();
		while (count-- > ret_last)
			if (unlikely(mc++->result))
				multicall_failed(mc - 1, mc[-1].result);
		if (ret_last)
			return mc->result;
	}

	return 0;
}

void xen_multicall_flush(void)
{
	if (use_lazy_mmu_mode())
		_xen_multicall_flush(false);
}
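
/*
 * Queue an update_va_mapping request in this CPU's multicall batch, or issue
 * it directly when lazy MMU mode is not in effect.
 */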
int xen_multi_update_va_mapping(unsigned long va, pte_t pte,
				unsigned long uvmf)
{
	struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
	multicall_entry_t *mc;

	if (unlikely(!use_lazy_mmu_mode()))
#ifdef CONFIG_X86_PAE
		return _hypercall4(int, update_va_mapping, va,
				   pte.pte_low, pte.pte_high, uvmf);
#else
		return _hypercall3(int, update_va_mapping, va,
				   pte.pte, uvmf);
#endif

	if (unlikely(lazy->nr_mc == NR_MC))
		_xen_multicall_flush(false);

	mc = lazy->mc + lazy->nr_mc++;
	mc->op = __HYPERVISOR_update_va_mapping;
	mc->args[0] = va;
#ifndef CONFIG_X86_PAE
	mc->args[1] = pte.pte;
#else
	mc->args[1] = pte.pte_low;
	mc->args[2] = pte.pte_high;
#endif
	mc->args[MULTI_UVMFLAGS_INDEX] = uvmf;
	mc->args[5] = (long)__builtin_return_address(0);

	return 0;
}

static inline bool mmu_may_merge(const multicall_entry_t *mc,
				 unsigned int op, domid_t domid)
{
	return mc->op == op && !mc->args[2] && mc->args[3] == domid;
}
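
/*
 * Queue a batch of mmu_update requests, folding them into the preceding
 * multicall entry when that is a compatible mmu_update for the same domain.
 * Requests that need a success count or would overflow the buffer are
 * committed immediately.
 */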
int xen_multi_mmu_update(mmu_update_t *src, unsigned int count,
			 unsigned int *success_count, domid_t domid)
{
	struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
	multicall_entry_t *mc = lazy->mc + lazy->nr_mc;
	mmu_update_t *dst;
	bool commit, merge;

	if (unlikely(!use_lazy_mmu_mode()))
		return _hypercall4(int, mmu_update, src, count,
				   success_count, domid);

	commit = (lazy->nr_mmu + count) > NR_MMU || success_count;
	merge = lazy->nr_mc && !commit
		&& mmu_may_merge(mc - 1, __HYPERVISOR_mmu_update, domid);
	if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
		_xen_multicall_flush(false);
		mc = lazy->mc;
		commit = count > NR_MMU || success_count;
	}

	if (!lazy->nr_mc && unlikely(commit))
		return _hypercall4(int, mmu_update, src, count,
				   success_count, domid);

	dst = lazy->mmu + lazy->nr_mmu;
	lazy->nr_mmu += count;
	if (merge) {
		mc[-1].args[1] += count;
		memcpy(dst, src, count * sizeof(*src));
	} else {
		++lazy->nr_mc;
		mc->op = __HYPERVISOR_mmu_update;
		if (!commit) {
			mc->args[0] = (unsigned long)dst;
			memcpy(dst, src, count * sizeof(*src));
		} else
			mc->args[0] = (unsigned long)src;
		mc->args[1] = count;
		mc->args[2] = (unsigned long)success_count;
		mc->args[3] = domid;
		mc->args[5] = (long)__builtin_return_address(0);
	}

	while (!commit && count--)
		switch (src++->ptr & (sizeof(pteval_t) - 1)) {
		case MMU_NORMAL_PT_UPDATE:
		case MMU_PT_UPDATE_PRESERVE_AD:
			break;
		default:
			commit = true;
			break;
		}

	return commit ? _xen_multicall_flush(true) : 0;
}
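
/*
 * Queue a batch of mmuext_op requests, analogous to xen_multi_mmu_update().
 * The disabled block below would additionally fold plain TLB flush / INVLPG
 * operations into the flags of a preceding update_va_mapping entry.
 */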
int xen_multi_mmuext_op(struct mmuext_op *src, unsigned int count,
			unsigned int *success_count, domid_t domid)
{
	struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
	multicall_entry_t *mc;
	struct mmuext_op *dst;
	bool commit, merge;

	if (unlikely(!use_lazy_mmu_mode()))
		return _hypercall4(int, mmuext_op, src, count,
				   success_count, domid);

	/*
	 * While it could be useful in theory, I've never seen the body of
	 * this conditional to be reached, hence it seems more reasonable
	 * to disable it for the time being.
	 */
	if (0 && likely(count)
	    && likely(!success_count)
	    && likely(domid == DOMID_SELF)
	    && likely(lazy->nr_mc)
	    && lazy->mc[lazy->nr_mc - 1].op == __HYPERVISOR_update_va_mapping) {
		unsigned long oldf, newf = UVMF_NONE;

		switch (src->cmd) {
		case MMUEXT_TLB_FLUSH_ALL:
			newf = UVMF_TLB_FLUSH | UVMF_ALL;
			break;
		case MMUEXT_INVLPG_ALL:
			newf = UVMF_INVLPG | UVMF_ALL;
			break;
		case MMUEXT_TLB_FLUSH_MULTI:
			newf = UVMF_TLB_FLUSH | UVMF_MULTI
			       | (unsigned long)src->arg2.vcpumask.p;
			break;
		case MMUEXT_INVLPG_MULTI:
			newf = UVMF_INVLPG | UVMF_MULTI
			       | (unsigned long)src->arg2.vcpumask.p;
			break;
		case MMUEXT_TLB_FLUSH_LOCAL:
			newf = UVMF_TLB_FLUSH | UVMF_LOCAL;
			break;
		case MMUEXT_INVLPG_LOCAL:
			newf = UVMF_INVLPG | UVMF_LOCAL;
			break;
		}
		mc = lazy->mc + lazy->nr_mc - 1;
		oldf = mc->args[MULTI_UVMFLAGS_INDEX];
		if (newf == UVMF_NONE || oldf == UVMF_NONE
		    || newf == (UVMF_TLB_FLUSH | UVMF_ALL))
			;
		else if (oldf == (UVMF_TLB_FLUSH | UVMF_ALL))
			newf = UVMF_TLB_FLUSH | UVMF_ALL;
		else if ((newf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
			 && (oldf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
			 && ((src->arg1.linear_addr ^ mc->args[0])
			     & PAGE_MASK))
			newf = UVMF_NONE;
		else if (((oldf | newf) & UVMF_ALL)
			 && !((oldf ^ newf) & UVMF_FLUSHTYPE_MASK))
			newf = (newf & UVMF_FLUSHTYPE_MASK) | UVMF_ALL;
		else if ((oldf ^ newf) & ~UVMF_FLUSHTYPE_MASK)
			newf = UVMF_NONE;
		else if ((oldf & UVMF_FLUSHTYPE_MASK) == UVMF_TLB_FLUSH)
			newf = (newf & ~UVMF_FLUSHTYPE_MASK) | UVMF_TLB_FLUSH;
		else if ((newf & UVMF_FLUSHTYPE_MASK) != UVMF_TLB_FLUSH
			 && ((newf ^ oldf) & UVMF_FLUSHTYPE_MASK))
			newf = UVMF_NONE;
		if (newf != UVMF_NONE) {
			mc->args[MULTI_UVMFLAGS_INDEX] = newf;
			if (!--count)
				return 0;
			++src;
		}
	}

	mc = lazy->mc + lazy->nr_mc;
	commit = (lazy->nr_mmuext + count) > NR_MMUEXT || success_count;
	merge = lazy->nr_mc && !commit
		&& mmu_may_merge(mc - 1, __HYPERVISOR_mmuext_op, domid);
	if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
		_xen_multicall_flush(false);
		mc = lazy->mc;
		commit = count > NR_MMUEXT || success_count;
	}

	if (!lazy->nr_mc && unlikely(commit))
		return _hypercall4(int, mmuext_op, src, count,
				   success_count, domid);

	dst = lazy->mmuext + lazy->nr_mmuext;
	lazy->nr_mmuext += count;
	if (merge) {
		mc[-1].args[1] += count;
		memcpy(dst, src, count * sizeof(*src));
	} else {
		++lazy->nr_mc;
		mc->op = __HYPERVISOR_mmuext_op;
		if (!commit) {
			mc->args[0] = (unsigned long)dst;
			memcpy(dst, src, count * sizeof(*src));
		} else
			mc->args[0] = (unsigned long)src;
		mc->args[1] = count;
		mc->args[2] = (unsigned long)success_count;
		mc->args[3] = domid;
		mc->args[5] = (long)__builtin_return_address(0);
	}

	while (!commit && count--)
		switch (src++->cmd) {
		case MMUEXT_PIN_L1_TABLE:
		case MMUEXT_PIN_L2_TABLE:
		case MMUEXT_PIN_L3_TABLE:
		case MMUEXT_PIN_L4_TABLE:
		case MMUEXT_UNPIN_TABLE:
		case MMUEXT_TLB_FLUSH_LOCAL:
		case MMUEXT_INVLPG_LOCAL:
		case MMUEXT_TLB_FLUSH_MULTI:
		case MMUEXT_INVLPG_MULTI:
		case MMUEXT_TLB_FLUSH_ALL:
		case MMUEXT_INVLPG_ALL:
			break;
		default:
			commit = true;
			break;
		}

	return commit ? _xen_multicall_flush(true) : 0;
}
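
/*
 * Direct (non-batched) page table entry updates. When an L2..L4 entry newly
 * references an unpinned, lowmem page table page, do_lN_entry_update() first
 * remaps that page read-only in the direct mapping within the same multicall.
 */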
void xen_l1_entry_update(pte_t *ptr, pte_t val)
{
	mmu_update_t u;

	u.ptr = ptep_to_machine(ptr);
	u.val = __pte_val(val);
	BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_l1_entry_update);

static void do_lN_entry_update(mmu_update_t *mmu, unsigned int mmu_count,
			       struct page *page)
{
	if (likely(page)) {
		multicall_entry_t mcl[2];
		unsigned long pfn = page_to_pfn(page);

		MULTI_update_va_mapping(mcl,
					(unsigned long)__va(pfn << PAGE_SHIFT),
					pfn_pte(pfn, PAGE_KERNEL_RO), 0);
		MULTI_mmu_update(mcl + 1, mmu, mmu_count, NULL, DOMID_SELF);
		if (unlikely(HYPERVISOR_multicall_check(mcl, 2, NULL)))
			BUG();
	} else if (unlikely(HYPERVISOR_mmu_update(mmu, mmu_count,
						  NULL, DOMID_SELF) < 0))
		BUG();
}

void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
{
	mmu_update_t u;
	struct page *page = NULL;

	if (likely(pmd_present(val)) && likely(!pmd_large(val))
	    && likely(mem_map)
	    && likely(PagePinned(virt_to_page(ptr)))) {
		page = pmd_page(val);
		if (unlikely(PagePinned(page)))
			page = NULL;
		else if (PageHighMem(page)) {
#ifndef CONFIG_HIGHPTE
			BUG();
#endif
			page = NULL;
		}
	}
	u.ptr = virt_to_machine(ptr);
	u.val = __pmd_val(val);
	do_lN_entry_update(&u, 1, page);
}

#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
void xen_l3_entry_update(pud_t *ptr, pud_t val)
{
	mmu_update_t u;
	struct page *page = NULL;

	if (likely(pud_present(val))
	    && likely(mem_map)
	    && likely(!pud_large(val))
	    && likely(PagePinned(virt_to_page(ptr)))) {
		page = pud_page(val);
		if (unlikely(PagePinned(page)))
			page = NULL;
	}
	u.ptr = virt_to_machine(ptr);
	u.val = __pud_val(val);
	do_lN_entry_update(&u, 1, page);
}
#endif

#ifdef CONFIG_X86_64
void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
{
	mmu_update_t u[2];
	struct page *page = NULL;

	if (likely(pgd_present(val)) && likely(mem_map)
	    && likely(PagePinned(virt_to_page(ptr)))) {
		page = pgd_page(val);
		if (unlikely(PagePinned(page)))
			page = NULL;
	}
	u[0].ptr = virt_to_machine(ptr);
	u[0].val = __pgd_val(val);
	if (((unsigned long)ptr & ~PAGE_MASK)
	    <= pgd_index(TASK_SIZE_MAX) * sizeof(*ptr)) {
		ptr = __user_pgd(ptr);
		u[1].ptr = virt_to_machine(ptr);
		u[1].val = __pgd_val(val);
		do_lN_entry_update(u, 2, page);
	} else
		do_lN_entry_update(u, 1, page);
}
#endif /* CONFIG_X86_64 */
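
/* Simple, unbatched wrappers around individual MMUEXT operations. */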
void xen_pt_switch(pgd_t *pgd)
{
	struct mmuext_op op;

	op.cmd = MMUEXT_NEW_BASEPTR;
	op.arg1.mfn = virt_to_mfn(pgd);
	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void xen_new_user_pt(pgd_t *pgd)
{
	struct mmuext_op op;

	pgd = __user_pgd(pgd);
	op.cmd = MMUEXT_NEW_USER_BASEPTR;
	op.arg1.mfn = pgd ? virt_to_mfn(pgd) : 0;
	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void xen_tlb_flush(void)
{
	struct mmuext_op op;

	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL(xen_tlb_flush);

void xen_invlpg(unsigned long ptr)
{
	struct mmuext_op op;

	op.cmd = MMUEXT_INVLPG_LOCAL;
	op.arg1.linear_addr = ptr & PAGE_MASK;
	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL(xen_invlpg);

#ifdef CONFIG_SMP

void xen_tlb_flush_all(void)
{
	struct mmuext_op op;

	op.cmd = MMUEXT_TLB_FLUSH_ALL;
	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_tlb_flush_all);

void xen_tlb_flush_mask(const cpumask_t *mask)
{
	struct mmuext_op op;

	if (cpus_empty(*mask))
		return;
	op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	set_xen_guest_handle(op.arg2.vcpumask, cpus_addr(*mask));
	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_tlb_flush_mask);

void xen_invlpg_all(unsigned long ptr)
{
	struct mmuext_op op;

	op.cmd = MMUEXT_INVLPG_ALL;
	op.arg1.linear_addr = ptr & PAGE_MASK;
	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_invlpg_all);

void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr)
{
	struct mmuext_op op;

	if (cpus_empty(*mask))
		return;
	op.cmd = MMUEXT_INVLPG_MULTI;
	op.arg1.linear_addr = ptr & PAGE_MASK;
	set_xen_guest_handle(op.arg2.vcpumask, cpus_addr(*mask));
	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_invlpg_mask);

#endif /* CONFIG_SMP */
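
/*
 * Pinning of top-level page tables. On x86-64 the kernel and user halves of
 * the PGD are pinned and unpinned as a pair, hence NR_PGD_PIN_OPS.
 */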
#ifdef CONFIG_X86_64
#define NR_PGD_PIN_OPS 2
#else
#define NR_PGD_PIN_OPS 1
#endif

void xen_pgd_pin(pgd_t *pgd)
{
	struct mmuext_op op[NR_PGD_PIN_OPS];

	op[0].cmd = MMUEXT_PIN_L3_TABLE;
	op[0].arg1.mfn = virt_to_mfn(pgd);
#ifdef CONFIG_X86_64
	op[1].cmd = op[0].cmd = MMUEXT_PIN_L4_TABLE;
	pgd = __user_pgd(pgd);
	if (pgd)
		op[1].arg1.mfn = virt_to_mfn(pgd);
	else {
		op[1].cmd = MMUEXT_PIN_L3_TABLE;
		op[1].arg1.mfn = pfn_to_mfn(__pa_symbol(level3_user_pgt)
					    >> PAGE_SHIFT);
	}
#endif
	if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0)
		BUG();
}

void xen_pgd_unpin(pgd_t *pgd)
{
	struct mmuext_op op[NR_PGD_PIN_OPS];

	op[0].cmd = MMUEXT_UNPIN_TABLE;
	op[0].arg1.mfn = virt_to_mfn(pgd);
#ifdef CONFIG_X86_64
	pgd = __user_pgd(pgd);
	op[1].cmd = MMUEXT_UNPIN_TABLE;
	op[1].arg1.mfn = virt_to_mfn(pgd);
#endif
	if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0)
		BUG();
}

void xen_set_ldt(const void *ptr, unsigned int ents)
{
	struct mmuext_op op;

	op.cmd = MMUEXT_SET_LDT;
	op.arg1.linear_addr = (unsigned long)ptr;
	op.arg2.nr_ents = ents;
	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
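
/*
 * Scratch space for the contiguous-region operations below: a frame list and
 * a multicall array, sized for the current maximum region order.
 */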
/* Protected by balloon_lock. */
#define INIT_CONTIG_ORDER 6 /* 256kB */
static unsigned int __read_mostly max_contig_order = INIT_CONTIG_ORDER;
static unsigned long __initdata init_df[1U << INIT_CONTIG_ORDER];
static unsigned long *__refdata discontig_frames = init_df;
static multicall_entry_t __initdata init_mc[1U << INIT_CONTIG_ORDER];
static multicall_entry_t *__refdata cr_mcl = init_mc;

static int __init init_contig_order(void)
{
	discontig_frames = vmalloc((sizeof(*discontig_frames)
				    + sizeof(*cr_mcl)) << INIT_CONTIG_ORDER);
	BUG_ON(!discontig_frames);

	cr_mcl = (void *)(discontig_frames + (1U << INIT_CONTIG_ORDER));

	return 0;
}
early_initcall(init_contig_order);
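
/*
 * Make sure the scratch buffers above can hold a region of the requested
 * order, growing them (under balloon_lock) if necessary.
 */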
static int check_contig_order(unsigned int order)
{
	if (unlikely(order >= 32))
		return -ENOMEM;
	if (unlikely(order > BITS_PER_LONG - fls(sizeof(*cr_mcl))))
		return -ENOMEM;

	if (unlikely(order > max_contig_order)) {
		unsigned long *df = __vmalloc((sizeof(*discontig_frames)
					       + sizeof(*cr_mcl)) << order,
					      GFP_ATOMIC, PAGE_KERNEL);
		unsigned long flags;

		if (!df)
			return -ENOMEM;

		balloon_lock(flags);
		if (order > max_contig_order) {
			void *temp = discontig_frames;

			discontig_frames = df;
			cr_mcl = (void *)(df + (1U << order));
			df = temp;

			max_contig_order = order;
		}
		balloon_unlock(flags);

		pr_info("Adjusted maximum contiguous region order to %u\n",
			order);
	}

	return 0;
}

/* Ensure multi-page extents are contiguous in machine memory. */
int xen_create_contiguous_region(
	unsigned long vstart, unsigned int order, unsigned int address_bits)
{
	unsigned long *in_frames, out_frame, frame, flags;
	unsigned int i;
	int rc, success;
	pte_t *ptep = NULL;
	struct xen_memory_exchange exchange = {
		.in = {
			.nr_extents = 1UL << order,
			.extent_order = 0,
			.domid = DOMID_SELF
		},
		.out = {
			.nr_extents = 1,
			.extent_order = order,
			.address_bits = address_bits,
			.domid = DOMID_SELF
		}
	};

	/*
	 * Currently an auto-translated guest will not perform I/O, nor will
	 * it require PAE page directories below 4GB. Therefore any calls to
	 * this function are redundant and can be ignored.
	 */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	rc = check_contig_order(order);
	if (rc)
		return rc;

#ifdef CONFIG_X86_64
	if (unlikely(vstart > PAGE_OFFSET + MAXMEM)) {
		unsigned int level;

		if (vstart < __START_KERNEL_map
		    || vstart + (PAGE_SIZE << order) > _brk_end)
			return -EINVAL;
		ptep = lookup_address((unsigned long)__va(__pa(vstart)),
				      &level);
		if (ptep && pte_none(*ptep))
			ptep = NULL;
		if (vstart < __START_KERNEL && ptep)
			return -EINVAL;
		rc = check_contig_order(order + 1);
		if (rc)
			return rc;
	} else
#endif
	if (unlikely(vstart + (PAGE_SIZE << order) > (unsigned long)high_memory))
		return -EINVAL;

	set_xen_guest_handle(exchange.out.extent_start, &out_frame);

	xen_scrub_pages((void *)vstart, 1 << order);

	balloon_lock(flags);

	in_frames = discontig_frames;
	set_xen_guest_handle(exchange.in.extent_start, in_frames);

	/* 1. Zap current PTEs, remembering MFNs. */
	for (i = 0; i < (1U<<order); i++) {
		in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
		MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
					__pte_ma(0), 0);
		if (ptep)
			MULTI_update_va_mapping(cr_mcl + i + (1U << order),
				(unsigned long)__va(__pa(vstart)) + (i*PAGE_SIZE),
				__pte_ma(0), 0);
		set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
				    INVALID_P2M_ENTRY);
	}
	if (ptep)
		i += 1U << order;
	if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
		BUG();

	/* 2. Get a new contiguous memory extent. */
	out_frame = __pa(vstart) >> PAGE_SHIFT;
	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
	success = (exchange.nr_exchanged == (1UL << order));
	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
	BUG_ON(success && (rc != 0));
#if CONFIG_XEN_COMPAT <= 0x030002
	if (unlikely(rc == -ENOSYS)) {
		/* Compatibility when XENMEM_exchange is unsupported. */
		if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
					 &exchange.in) != (1UL << order))
			BUG();
		success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
						&exchange.out) == 1);
		if (!success) {
			/* Couldn't get special memory: fall back to normal. */
			for (i = 0; i < (1U<<order); i++)
				in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
			if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
						 &exchange.in) != (1UL<<order))
				BUG();
		}
	}
#endif

	/* 3. Map the new extent in place of old pages. */
	for (i = 0; i < (1U<<order); i++) {
		frame = success ? (out_frame + i) : in_frames[i];
		MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
					pfn_pte_ma(frame, PAGE_KERNEL), 0);
		if (ptep)
			MULTI_update_va_mapping(cr_mcl + i + (1U << order),
				(unsigned long)__va(__pa(vstart)) + (i*PAGE_SIZE),
				pfn_pte_ma(frame, PAGE_KERNEL_RO), 0);
		set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
	}
	if (ptep)
		i += 1U << order;
	cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
						   ? UVMF_TLB_FLUSH|UVMF_ALL
						   : UVMF_INVLPG|UVMF_ALL;
	if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
		BUG();

	balloon_unlock(flags);

	return success ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
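
/* Break a previously created contiguous machine extent apart again. */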
void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
{
	unsigned long *out_frames, in_frame, frame, flags;
	unsigned int i;
	int rc, success;
	struct xen_memory_exchange exchange = {
		.in = {
			.nr_extents = 1,
			.extent_order = order,
			.domid = DOMID_SELF
		},
		.out = {
			.nr_extents = 1UL << order,
			.extent_order = 0,
			.domid = DOMID_SELF
		}
	};

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	if (unlikely(order > max_contig_order))
		return;

	set_xen_guest_handle(exchange.in.extent_start, &in_frame);

	xen_scrub_pages((void *)vstart, 1 << order);

	balloon_lock(flags);

	out_frames = discontig_frames;
	set_xen_guest_handle(exchange.out.extent_start, out_frames);

	/* 1. Find start MFN of contiguous extent. */
	in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);

	/* 2. Zap current PTEs. */
	for (i = 0; i < (1U<<order); i++) {
		MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
					__pte_ma(0), 0);
		set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
				    INVALID_P2M_ENTRY);
		out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
	}
	if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
		BUG();

	/* 3. Do the exchange for non-contiguous MFNs. */
	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
	success = (exchange.nr_exchanged == 1);
	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
	BUG_ON(success && (rc != 0));
#if CONFIG_XEN_COMPAT <= 0x030002
	if (unlikely(rc == -ENOSYS)) {
		/* Compatibility when XENMEM_exchange is unsupported. */
		if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
					 &exchange.in) != 1)
			BUG();
		if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
					 &exchange.out) != (1UL << order))
			BUG();
		success = 1;
	}
#endif

	/* 4. Map new pages in place of old pages. */
	for (i = 0; i < (1U<<order); i++) {
		frame = success ? out_frames[i] : (in_frame + i);
		MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
					pfn_pte_ma(frame, PAGE_KERNEL), 0);
		set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
	}

	cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
						   ? UVMF_TLB_FLUSH|UVMF_ALL
						   : UVMF_INVLPG|UVMF_ALL;
	if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
		BUG();

	balloon_unlock(flags);

	if (unlikely(!success)) {
		/* Try hard to get the special memory back to Xen. */
		exchange.in.extent_order = 0;
		set_xen_guest_handle(exchange.in.extent_start, &in_frame);

		for (i = 0; i < (1U<<order); i++) {
			struct page *page = alloc_page(__GFP_HIGHMEM|__GFP_COLD);
			unsigned long pfn;
			mmu_update_t mmu;
			unsigned int j = 0;

			if (!page) {
				pr_warn("Xen and kernel out of memory"
					" while trying to release an order"
					" %u contiguous region\n", order);
				break;
			}
			pfn = page_to_pfn(page);

			balloon_lock(flags);

			if (!PageHighMem(page)) {
				void *v = __va(pfn << PAGE_SHIFT);

				xen_scrub_pages(v, 1);
				MULTI_update_va_mapping(cr_mcl + j, (unsigned long)v,
							__pte_ma(0), UVMF_INVLPG|UVMF_ALL);
				++j;
			}
#ifdef CONFIG_XEN_SCRUB_PAGES
			else {
				xen_scrub_pages(kmap(page), 1);
				kunmap(page);
				kmap_flush_unused();
			}
#endif

			frame = pfn_to_mfn(pfn);
			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);

			MULTI_update_va_mapping(cr_mcl + j, vstart,
						pfn_pte_ma(frame, PAGE_KERNEL),
						UVMF_INVLPG|UVMF_ALL);
			++j;

			pfn = __pa(vstart) >> PAGE_SHIFT;
			set_phys_to_machine(pfn, frame);
			if (!xen_feature(XENFEAT_auto_translated_physmap)) {
				mmu.ptr = ((uint64_t)frame << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
				mmu.val = pfn;
				cr_mcl[j].op = __HYPERVISOR_mmu_update;
				cr_mcl[j].args[0] = (unsigned long)&mmu;
				cr_mcl[j].args[1] = 1;
				cr_mcl[j].args[2] = 0;
				cr_mcl[j].args[3] = DOMID_SELF;
				++j;
			}

			cr_mcl[j].op = __HYPERVISOR_memory_op;
			cr_mcl[j].args[0] = XENMEM_decrease_reservation;
			cr_mcl[j].args[1] = (unsigned long)&exchange.in;

			if (HYPERVISOR_multicall(cr_mcl, j + 1))
				BUG();
			BUG_ON(cr_mcl[j].result != 1);
			while (j--)
				BUG_ON(cr_mcl[j].result != 0);

			balloon_unlock(flags);

			free_empty_pages(&page, 1);

			in_frame++;
			vstart += PAGE_SIZE;
		}
	}
}
EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
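
/*
 * Boot-time variant of xen_create_contiguous_region(): exchange the frames
 * backing pfn ... pfn + 2^order - 1 for a machine-contiguous extent below
 * the given address limit, updating the P2M only (no virtual mappings are
 * touched).
 */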
int __init early_create_contiguous_region(unsigned long pfn,
					  unsigned int order,
					  unsigned int address_bits)
{
	unsigned long *in_frames = discontig_frames, out_frame = pfn;
	unsigned int i;
	int rc, success;
	struct xen_memory_exchange exchange = {
		.in = {
			.nr_extents = 1UL << order,
			.extent_order = 0,
			.domid = DOMID_SELF
		},
		.out = {
			.nr_extents = 1,
			.extent_order = order,
			.address_bits = address_bits,
			.domid = DOMID_SELF
		}
	};

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	if (unlikely(order > max_contig_order))
		return -ENOMEM;

	for (i = 0; i < (1U << order); ++i) {
		in_frames[i] = pfn_to_mfn(pfn + i);
		set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
	}

	set_xen_guest_handle(exchange.in.extent_start, in_frames);
	set_xen_guest_handle(exchange.out.extent_start, &out_frame);

	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
	success = (exchange.nr_exchanged == (1UL << order));
	BUG_ON(!success && (exchange.nr_exchanged || !rc));
	BUG_ON(success && rc);
#if CONFIG_XEN_COMPAT <= 0x030002
	if (unlikely(rc == -ENOSYS)) {
		/* Compatibility when XENMEM_exchange is unavailable. */
		if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
					 &exchange.in) != (1UL << order))
			BUG();
		success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
						&exchange.out) == 1);
		if (!success) {
			for (i = 0; i < (1U << order); ++i)
				in_frames[i] = pfn + i;
			if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
						 &exchange.in) != (1UL << order))
				BUG();
		}
	}
#endif

	for (i = 0; i < (1U << order); ++i, ++out_frame) {
		if (!success)
			out_frame = in_frames[i];
		set_phys_to_machine(pfn + i, out_frame);
	}

	return success ? 0 : -ENOMEM;
}

static void undo_limit_pages(struct page *pages, unsigned int order)
{
	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
	BUG_ON(order > max_contig_order);
	xen_limit_pages_to_max_mfn(pages, order, 0);
	ClearPageForeign(pages);
	init_page_count(pages);
	__free_pages(pages, order);
}
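
/*
 * Exchange any of the 2^order pages whose MFN does not fit below the given
 * address limit for memory that does, remapping lowmem pages and updating
 * the P2M accordingly. With address_bits == 0 this reverses a previous
 * limiting operation (see undo_limit_pages() above).
 */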
int xen_limit_pages_to_max_mfn(
	struct page *pages, unsigned int order, unsigned int address_bits)
{
	unsigned long flags, frame, *limit_map, _limit_map;
	unsigned long *in_frames, *out_frames;
	struct page *page;
	unsigned int i, n, nr_mcl;
	int rc, success;
	struct xen_memory_exchange exchange = {
		.in = {
			.extent_order = 0,
			.domid = DOMID_SELF
		},
		.out = {
			.extent_order = 0,
			.address_bits = address_bits,
			.domid = DOMID_SELF
		}
	};

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	if (address_bits && address_bits < PAGE_SHIFT)
		return -EINVAL;

	rc = check_contig_order(order + 1);
	if (rc)
		return rc;

	/* A single long suffices for small orders; kmalloc a bitmap otherwise. */
	if (!(BITS_PER_LONG >> order)) {
		limit_map = kmalloc(BITS_TO_LONGS(1U << order)
				    * sizeof(*limit_map), GFP_ATOMIC);
		if (unlikely(!limit_map))
			return -ENOMEM;
	} else
		limit_map = &_limit_map;

	if (address_bits)
		bitmap_zero(limit_map, 1U << order);
	else {
		BUILD_BUG_ON(sizeof(pages->index) != sizeof(*limit_map));
		for (i = 0; i < BITS_TO_LONGS(1U << order); ++i)
			limit_map[i] = pages[i + 1].index;
		__set_bit(0, limit_map);
	}

	/* 0. Scrub the pages. */
	for (i = 0, n = 0; i < 1U<<order ; i++) {
		page = pages + i;

		if (address_bits) {
			if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT)))
				continue;
			__set_bit(i, limit_map);
		} else if (!test_bit(i, limit_map))
			continue;

		if (!PageHighMem(page))
			xen_scrub_pages(page_address(page), 1);
#ifdef CONFIG_XEN_SCRUB_PAGES
		else {
			xen_scrub_pages(kmap(page), 1);
			kunmap(page);
		}
#endif
	}

	if (bitmap_empty(limit_map, 1U << order)) {
		if (limit_map != &_limit_map)
			kfree(limit_map);
		return 0;
	}

	kmap_flush_unused();

	balloon_lock(flags);

	in_frames = discontig_frames;
	set_xen_guest_handle(exchange.in.extent_start, in_frames);
	out_frames = in_frames + (1U << order);
	set_xen_guest_handle(exchange.out.extent_start, out_frames);

	/* 1. Zap current PTEs (if any), remembering MFNs. */
	for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
		if (!test_bit(i, limit_map))
			continue;

		page = pages + i;
		out_frames[n] = page_to_pfn(page);
		in_frames[n] = pfn_to_mfn(out_frames[n]);

		if (!PageHighMem(page))
			MULTI_update_va_mapping(cr_mcl + nr_mcl++,
						(unsigned long)page_address(page),
						__pte_ma(0), 0);

		set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY);
		++n;
	}
	if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
		BUG();

	/* 2. Get new memory below the required limit. */
	exchange.in.nr_extents = n;
	exchange.out.nr_extents = n;
	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
	success = (exchange.nr_exchanged == n);
	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
	BUG_ON(success && (rc != 0));
#if CONFIG_XEN_COMPAT <= 0x030002
	if (unlikely(rc == -ENOSYS)) {
		/* Compatibility when XENMEM_exchange is unsupported. */
		if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
					 &exchange.in) != n)
			BUG();
		if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
					 &exchange.out) != n)
			BUG();
		success = 1;
	}
#endif

	/* 3. Map the new pages in place of old pages. */
	for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
		if (!test_bit(i, limit_map))
			continue;

		page = pages + i;
		frame = success ? out_frames[n] : in_frames[n];

		if (!PageHighMem(page))
			MULTI_update_va_mapping(cr_mcl + nr_mcl++,
						(unsigned long)page_address(page),
						pfn_pte_ma(frame, PAGE_KERNEL), 0);

		set_phys_to_machine(page_to_pfn(page), frame);
		++n;
	}
	cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order
							? UVMF_TLB_FLUSH|UVMF_ALL
							: UVMF_INVLPG|UVMF_ALL;
	if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
		BUG();

	balloon_unlock(flags);

	if (success && address_bits) {
		BUILD_BUG_ON(sizeof(*limit_map) != sizeof(pages->index));
		for (i = 0; i < BITS_TO_LONGS(1U << order); ++i)
			pages[i + 1].index = limit_map[i];

		SetPageForeign(pages, undo_limit_pages);
	}

	if (limit_map != &_limit_map)
		kfree(limit_map);

	return success ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);

bool hypervisor_oom(void)
{
	WARN_ONCE(1, "Hypervisor is out of memory");
	return false;
}
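
/*
 * Minimal walk_system_ram_range(): treat everything below max_pfn as one
 * System RAM range and invoke the callback once on the clipped range.
 */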
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
			  void *arg, int (*func)(unsigned long, unsigned long,
						 void *))
{
	return start_pfn < max_pfn && nr_pages
	       ? func(start_pfn, min(max_pfn - start_pfn, nr_pages), arg)
	       : -1;
}

int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
{
	maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);

	return HYPERVISOR_update_descriptor(mach_lp, *(const u64 *)desc);
}

int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
		    int type)
{
	maddr_t mach_gp = arbitrary_virt_to_machine(gdt + entry);

	return HYPERVISOR_update_descriptor(mach_gp, *(const u64 *)desc);
}