/******************************************************************************
 * mm/hypervisor.c
 *
 * Update page tables via the hypervisor.
 *
 * Copyright (c) 2002-2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/hypervisor.h>
#include <xen/balloon.h>
#include <xen/features.h>
#include <xen/interface/memory.h>
#include <xen/interface/vcpu.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <linux/highmem.h>
#ifdef CONFIG_X86_32
#include <linux/bootmem.h> /* for max_pfn */
#endif

EXPORT_SYMBOL(hypercall_page);

shared_info_t *__read_mostly HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
#ifndef CONFIG_XEN_VCPU_INFO_PLACEMENT
EXPORT_SYMBOL(HYPERVISOR_shared_info);
#else
DEFINE_PER_CPU(struct vcpu_info, vcpu_info) __aligned(sizeof(struct vcpu_info));
EXPORT_PER_CPU_SYMBOL(vcpu_info);

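/*
 * Register this CPU's per-CPU vcpu_info with the hypervisor via
 * VCPUOP_register_vcpu_info, so its vcpu state is delivered there rather
 * than through the shared info page. The boot CPU on x86-64 is registered
 * before the normal virt-to-MFN translation works, hence the early variant.
 */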
void __ref setup_vcpu_info(unsigned int cpu)
{
        struct vcpu_info *v = &per_cpu(vcpu_info, cpu);
        struct vcpu_register_vcpu_info info;
#ifdef CONFIG_X86_64
        static bool first = true;

        if (first) {
                first = false;
                info.mfn = early_arbitrary_virt_to_mfn(v);
        } else
#endif
                info.mfn = arbitrary_virt_to_mfn(v);
        info.offset = offset_in_page(v);

        if (HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info))
                BUG();
}

void __init adjust_boot_vcpu_info(void)
{
        unsigned long lpfn, rpfn, lmfn, rmfn;
        pte_t *lpte, *rpte;
        unsigned int level;
        mmu_update_t mmu[2];

        /*
         * setup_vcpu_info() cannot be used more than once for a given (v)CPU,
         * hence we must swap the underlying MFNs of the two pages holding old
         * and new vcpu_info of the boot CPU.
         *
         * Do *not* use __get_cpu_var() or this_cpu_{write,...}() here, as the
         * per-CPU segment didn't get reloaded yet. Using this_cpu_read(), as
         * in arch_use_lazy_mmu_mode(), though undesirable, is safe except for
         * the accesses to variables that were updated in setup_percpu_areas().
         */
        lpte = lookup_address((unsigned long)&vcpu_info
                              + (__per_cpu_load - __per_cpu_start),
                              &level);
        rpte = lookup_address((unsigned long)&per_cpu(vcpu_info, 0), &level);
        BUG_ON(!lpte || !(pte_flags(*lpte) & _PAGE_PRESENT));
        BUG_ON(!rpte || !(pte_flags(*rpte) & _PAGE_PRESENT));
        lmfn = __pte_mfn(*lpte);
        rmfn = __pte_mfn(*rpte);

        if (lmfn == rmfn)
                return;

        lpfn = mfn_to_local_pfn(lmfn);
        rpfn = mfn_to_local_pfn(rmfn);

        pr_info("Swapping MFNs for PFN %lx and %lx (MFN %lx and %lx)\n",
                lpfn, rpfn, lmfn, rmfn);

        xen_l1_entry_update(lpte, pfn_pte_ma(rmfn, pte_pgprot(*lpte)));
        xen_l1_entry_update(rpte, pfn_pte_ma(lmfn, pte_pgprot(*rpte)));
#ifdef CONFIG_X86_64
        if (HYPERVISOR_update_va_mapping((unsigned long)__va(lpfn<<PAGE_SHIFT),
                                         pfn_pte_ma(rmfn, PAGE_KERNEL_RO), 0))
                BUG();
#endif
        if (HYPERVISOR_update_va_mapping((unsigned long)__va(rpfn<<PAGE_SHIFT),
                                         pfn_pte_ma(lmfn, PAGE_KERNEL),
                                         UVMF_TLB_FLUSH))
                BUG();

        set_phys_to_machine(lpfn, rmfn);
        set_phys_to_machine(rpfn, lmfn);

        mmu[0].ptr = ((uint64_t)lmfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
        mmu[0].val = rpfn;
        mmu[1].ptr = ((uint64_t)rmfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
        mmu[1].val = lpfn;
        if (HYPERVISOR_mmu_update(mmu, 2, NULL, DOMID_SELF))
                BUG();

        /*
         * Copy over all contents of the page just replaced, except for the
         * vcpu_info itself, as it may have got updated after having been
         * copied from __per_cpu_load[].
         */
        memcpy(__va(rpfn << PAGE_SHIFT),
               __va(lpfn << PAGE_SHIFT),
               (unsigned long)&vcpu_info & (PAGE_SIZE - 1));
        level = (unsigned long)(&vcpu_info + 1) & (PAGE_SIZE - 1);
        if (level)
                memcpy(__va(rpfn << PAGE_SHIFT) + level,
                       __va(lpfn << PAGE_SHIFT) + level,
                       PAGE_SIZE - level);
}
#endif

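/*
 * Per-CPU batching of page-table hypercalls: while in lazy MMU mode,
 * update_va_mapping, mmu_update, and mmuext_op requests are queued in the
 * buffers below and submitted as a single multicall on flush.
 */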
#define NR_MC     BITS_PER_LONG
#define NR_MMU    BITS_PER_LONG
#define NR_MMUEXT (BITS_PER_LONG / 4)

DEFINE_PER_CPU(bool, xen_lazy_mmu);
struct lazy_mmu {
        unsigned int nr_mc, nr_mmu, nr_mmuext;
        multicall_entry_t mc[NR_MC];
        mmu_update_t mmu[NR_MMU];
        struct mmuext_op mmuext[NR_MMUEXT];
};
static DEFINE_PER_CPU(struct lazy_mmu, lazy_mmu);

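/*
 * Batching is only used when the per-CPU buffers cannot be re-entered:
 * preemption must be disabled (on CONFIG_PREEMPT kernels) and we must not
 * be in interrupt context.
 */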
static inline bool use_lazy_mmu_mode(void)
{
#ifdef CONFIG_PREEMPT
        if (!preempt_count())
                return false;
#endif
        return !irq_count();
}

static void multicall_failed(const multicall_entry_t *mc, int rc)
{
        pr_emerg("hypercall#%lu(%lx, %lx, %lx, %lx) failed: %d"
                 " (caller %lx)\n",
                 mc->op, mc->args[0], mc->args[1], mc->args[2], mc->args[3],
                 rc, mc->args[5]);
        BUG();
}

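/*
 * Submit all queued operations of this CPU. A single pending entry is
 * issued as a direct hypercall, otherwise a multicall is used. With
 * @ret_last set, the status of the last operation is returned to the
 * caller instead of being treated as fatal.
 */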
static int _xen_multicall_flush(bool ret_last)
{
        struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
        multicall_entry_t *mc = lazy->mc;
        unsigned int count = lazy->nr_mc;

        if (!count)
                return 0;

        lazy->nr_mc = 0;
        lazy->nr_mmu = 0;
        lazy->nr_mmuext = 0;

        if (count == 1) {
                int rc = _hypercall(int, mc->op, mc->args[0], mc->args[1],
                                    mc->args[2], mc->args[3], mc->args[4]);

                if (unlikely(rc)) {
                        if (ret_last)
                                return rc;
                        multicall_failed(mc, rc);
                }
        } else {
                if (HYPERVISOR_multicall(mc, count))
                        BUG();
                while (count-- > ret_last)
                        if (unlikely(mc++->result))
                                multicall_failed(mc - 1, mc[-1].result);
                if (ret_last)
                        return mc->result;
        }

        return 0;
}

void xen_multicall_flush(void)
{
        if (use_lazy_mmu_mode())
                _xen_multicall_flush(false);
}

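/*
 * Queue (or, outside lazy MMU mode, issue directly) an update_va_mapping
 * request for @va. The caller's return address is stashed in args[5] only
 * for the diagnostics printed by multicall_failed().
 */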
int xen_multi_update_va_mapping(unsigned long va, pte_t pte,
                                unsigned long uvmf)
{
        struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
        multicall_entry_t *mc;

        if (unlikely(!use_lazy_mmu_mode()))
#ifdef CONFIG_X86_PAE
                return _hypercall4(int, update_va_mapping, va,
                                   pte.pte_low, pte.pte_high, uvmf);
#else
                return _hypercall3(int, update_va_mapping, va,
                                   pte.pte, uvmf);
#endif

        if (unlikely(lazy->nr_mc == NR_MC))
                _xen_multicall_flush(false);

        mc = lazy->mc + lazy->nr_mc++;
        mc->op = __HYPERVISOR_update_va_mapping;
        mc->args[0] = va;
#ifndef CONFIG_X86_PAE
        mc->args[1] = pte.pte;
#else
        mc->args[1] = pte.pte_low;
        mc->args[2] = pte.pte_high;
#endif
        mc->args[MULTI_UVMFLAGS_INDEX] = uvmf;
        mc->args[5] = (long)__builtin_return_address(0);

        return 0;
}

static inline bool mmu_may_merge(const multicall_entry_t *mc,
                                 unsigned int op, domid_t domid)
{
        return mc->op == op && !mc->args[2] && mc->args[3] == domid;
}

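/*
 * Queue a batch of mmu_update requests, folding them into an immediately
 * preceding queued mmu_update entry when possible. Requests that need a
 * result (@success_count), that do not fit into the buffer, or that are
 * not plain PT updates force a direct call or an immediate flush.
 */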
int xen_multi_mmu_update(mmu_update_t *src, unsigned int count,
                         unsigned int *success_count, domid_t domid)
{
        struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
        multicall_entry_t *mc = lazy->mc + lazy->nr_mc;
        mmu_update_t *dst;
        bool commit, merge;

        if (unlikely(!use_lazy_mmu_mode()))
                return _hypercall4(int, mmu_update, src, count,
                                   success_count, domid);

        commit = (lazy->nr_mmu + count) > NR_MMU || success_count;
        merge = lazy->nr_mc && !commit
                && mmu_may_merge(mc - 1, __HYPERVISOR_mmu_update, domid);
        if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
                _xen_multicall_flush(false);
                mc = lazy->mc;
                commit = count > NR_MMU || success_count;
        }

        if (!lazy->nr_mc && unlikely(commit))
                return _hypercall4(int, mmu_update, src, count,
                                   success_count, domid);

        dst = lazy->mmu + lazy->nr_mmu;
        lazy->nr_mmu += count;
        if (merge) {
                mc[-1].args[1] += count;
                memcpy(dst, src, count * sizeof(*src));
        } else {
                ++lazy->nr_mc;
                mc->op = __HYPERVISOR_mmu_update;
                if (!commit) {
                        mc->args[0] = (unsigned long)dst;
                        memcpy(dst, src, count * sizeof(*src));
                } else
                        mc->args[0] = (unsigned long)src;
                mc->args[1] = count;
                mc->args[2] = (unsigned long)success_count;
                mc->args[3] = domid;
                mc->args[5] = (long)__builtin_return_address(0);
        }

        while (!commit && count--)
                switch (src++->ptr & (sizeof(pteval_t) - 1)) {
                case MMU_NORMAL_PT_UPDATE:
                case MMU_PT_UPDATE_PRESERVE_AD:
                        break;
                default:
                        commit = true;
                        break;
                }

        return commit ? _xen_multicall_flush(true) : 0;
}

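/*
 * Queue a batch of mmuext_op requests, analogous to xen_multi_mmu_update().
 * The disabled block below would fold a leading TLB-flush/INVLPG request
 * into the UVMF flags of an immediately preceding update_va_mapping entry
 * instead of queueing it separately.
 */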
int xen_multi_mmuext_op(struct mmuext_op *src, unsigned int count,
                        unsigned int *success_count, domid_t domid)
{
        struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
        multicall_entry_t *mc;
        struct mmuext_op *dst;
        bool commit, merge;

        if (unlikely(!use_lazy_mmu_mode()))
                return _hypercall4(int, mmuext_op, src, count,
                                   success_count, domid);

        /*
         * While it could be useful in theory, I've never seen the body of
         * this conditional be reached, hence it seems more reasonable to
         * disable it for the time being.
         */
        if (0 && likely(count)
            && likely(!success_count)
            && likely(domid == DOMID_SELF)
            && likely(lazy->nr_mc)
            && lazy->mc[lazy->nr_mc - 1].op == __HYPERVISOR_update_va_mapping) {
                unsigned long oldf, newf = UVMF_NONE;

                switch (src->cmd) {
                case MMUEXT_TLB_FLUSH_ALL:
                        newf = UVMF_TLB_FLUSH | UVMF_ALL;
                        break;
                case MMUEXT_INVLPG_ALL:
                        newf = UVMF_INVLPG | UVMF_ALL;
                        break;
                case MMUEXT_TLB_FLUSH_MULTI:
                        newf = UVMF_TLB_FLUSH | UVMF_MULTI
                               | (unsigned long)src->arg2.vcpumask.p;
                        break;
                case MMUEXT_INVLPG_MULTI:
                        newf = UVMF_INVLPG | UVMF_MULTI
                               | (unsigned long)src->arg2.vcpumask.p;
                        break;
                case MMUEXT_TLB_FLUSH_LOCAL:
                        newf = UVMF_TLB_FLUSH | UVMF_LOCAL;
                        break;
                case MMUEXT_INVLPG_LOCAL:
                        newf = UVMF_INVLPG | UVMF_LOCAL;
                        break;
                }
                mc = lazy->mc + lazy->nr_mc - 1;
                oldf = mc->args[MULTI_UVMFLAGS_INDEX];
                if (newf == UVMF_NONE || oldf == UVMF_NONE
                    || newf == (UVMF_TLB_FLUSH | UVMF_ALL))
                        ;
                else if (oldf == (UVMF_TLB_FLUSH | UVMF_ALL))
                        newf = UVMF_TLB_FLUSH | UVMF_ALL;
                else if ((newf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
                         && (oldf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
                         && ((src->arg1.linear_addr ^ mc->args[0])
                             >> PAGE_SHIFT))
                        newf = UVMF_NONE;
                else if (((oldf | newf) & UVMF_ALL)
                         && !((oldf ^ newf) & UVMF_FLUSHTYPE_MASK))
                        newf |= UVMF_ALL;
                else if ((oldf ^ newf) & ~UVMF_FLUSHTYPE_MASK)
                        newf = UVMF_NONE;
                else if ((oldf & UVMF_FLUSHTYPE_MASK) == UVMF_TLB_FLUSH)
                        newf = (newf & ~UVMF_FLUSHTYPE_MASK) | UVMF_TLB_FLUSH;
                else if ((newf & UVMF_FLUSHTYPE_MASK) != UVMF_TLB_FLUSH
                         && ((newf ^ oldf) & UVMF_FLUSHTYPE_MASK))
                        newf = UVMF_NONE;
                if (newf != UVMF_NONE) {
                        mc->args[MULTI_UVMFLAGS_INDEX] = newf;
                        ++src;
                        if (!--count)
                                return 0;
                }
        }

        mc = lazy->mc + lazy->nr_mc;
        commit = (lazy->nr_mmuext + count) > NR_MMUEXT || success_count;
        merge = lazy->nr_mc && !commit
                && mmu_may_merge(mc - 1, __HYPERVISOR_mmuext_op, domid);
        if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
                _xen_multicall_flush(false);
                mc = lazy->mc;
                commit = count > NR_MMUEXT || success_count;
        }

        if (!lazy->nr_mc && unlikely(commit))
                return _hypercall4(int, mmuext_op, src, count,
                                   success_count, domid);

        dst = lazy->mmuext + lazy->nr_mmuext;
        lazy->nr_mmuext += count;
        if (merge) {
                mc[-1].args[1] += count;
                memcpy(dst, src, count * sizeof(*src));
        } else {
                ++lazy->nr_mc;
                mc->op = __HYPERVISOR_mmuext_op;
                if (!commit) {
                        mc->args[0] = (unsigned long)dst;
                        memcpy(dst, src, count * sizeof(*src));
                } else
                        mc->args[0] = (unsigned long)src;
                mc->args[1] = count;
                mc->args[2] = (unsigned long)success_count;
                mc->args[3] = domid;
                mc->args[5] = (long)__builtin_return_address(0);
        }

        while (!commit && count--)
                switch (src++->cmd) {
                case MMUEXT_PIN_L1_TABLE:
                case MMUEXT_PIN_L2_TABLE:
                case MMUEXT_PIN_L3_TABLE:
                case MMUEXT_PIN_L4_TABLE:
                case MMUEXT_UNPIN_TABLE:
                case MMUEXT_TLB_FLUSH_LOCAL:
                case MMUEXT_INVLPG_LOCAL:
                case MMUEXT_TLB_FLUSH_MULTI:
                case MMUEXT_INVLPG_MULTI:
                case MMUEXT_TLB_FLUSH_ALL:
                case MMUEXT_INVLPG_ALL:
                        break;
                default:
                        commit = true;
                        break;
                }

        return commit ? _xen_multicall_flush(true) : 0;
}

void xen_l1_entry_update(pte_t *ptr, pte_t val)
{
        mmu_update_t u;
        u.ptr = ptep_to_machine(ptr);
        u.val = __pte_val(val);
        BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_l1_entry_update);

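/*
 * Issue a set of mmu_update requests. If @page is given, it is a newly
 * referenced page-table page that is not pinned yet: its direct mapping is
 * made read-only and it is marked pinned in the same multicall.
 */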
static void do_lN_entry_update(mmu_update_t *mmu, unsigned int mmu_count,
                               struct page *page)
{
        if (likely(page)) {
                multicall_entry_t mcl[2];
                unsigned long pfn = page_to_pfn(page);

                MULTI_update_va_mapping(mcl,
                                        (unsigned long)__va(pfn << PAGE_SHIFT),
                                        pfn_pte(pfn, PAGE_KERNEL_RO), 0);
                SetPagePinned(page);
                MULTI_mmu_update(mcl + 1, mmu, mmu_count, NULL, DOMID_SELF);
                if (unlikely(HYPERVISOR_multicall_check(mcl, 2, NULL)))
                        BUG();
        } else if (unlikely(HYPERVISOR_mmu_update(mmu, mmu_count,
                                                  NULL, DOMID_SELF) < 0))
                BUG();
}

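/*
 * For the L2/L3/L4 entry updates below: when the table being written is
 * itself pinned, a newly referenced lower-level table page must be handed
 * to do_lN_entry_update() so it gets made read-only and pinned as part of
 * the update; highmem page tables are flushed out of kmap space first.
 */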
void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
{
        mmu_update_t u;
        struct page *page = NULL;

        if (likely(pmd_present(val)) && likely(!pmd_large(val))
            && likely(mem_map)
            && likely(PagePinned(virt_to_page(ptr)))) {
                page = pmd_page(val);
                if (unlikely(PagePinned(page)))
                        page = NULL;
                else if (PageHighMem(page)) {
#ifndef CONFIG_HIGHPTE
                        BUG();
#endif
                        kmap_flush_unused();
                        page = NULL;
                }
        }
        u.ptr = virt_to_machine(ptr);
        u.val = __pmd_val(val);
        do_lN_entry_update(&u, 1, page);
}

#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
void xen_l3_entry_update(pud_t *ptr, pud_t val)
{
        mmu_update_t u;
        struct page *page = NULL;

        if (likely(pud_present(val))
#ifdef CONFIG_X86_64
            && likely(!pud_large(val))
#endif
            && likely(mem_map)
            && likely(PagePinned(virt_to_page(ptr)))) {
                page = pud_page(val);
                if (unlikely(PagePinned(page)))
                        page = NULL;
        }
        u.ptr = virt_to_machine(ptr);
        u.val = __pud_val(val);
        do_lN_entry_update(&u, 1, page);
}
#endif

#ifdef CONFIG_X86_64
void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
{
        mmu_update_t u[2];
        struct page *page = NULL;

        if (likely(pgd_present(val)) && likely(mem_map)
            && likely(PagePinned(virt_to_page(ptr)))) {
                page = pgd_page(val);
                if (unlikely(PagePinned(page)))
                        page = NULL;
        }
        u[0].ptr = virt_to_machine(ptr);
        u[0].val = __pgd_val(val);
        if (((unsigned long)ptr & ~PAGE_MASK)
            <= pgd_index(TASK_SIZE_MAX) * sizeof(*ptr)) {
                ptr = __user_pgd(ptr);
                BUG_ON(!ptr);
                u[1].ptr = virt_to_machine(ptr);
                u[1].val = __pgd_val(val);
                do_lN_entry_update(u, 2, page);
        } else
                do_lN_entry_update(u, 1, page);
}
#endif /* CONFIG_X86_64 */

#ifdef CONFIG_X86_64
void xen_pt_switch(pgd_t *pgd)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_NEW_BASEPTR;
        op.arg1.mfn = virt_to_mfn(pgd);
        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void xen_new_user_pt(pgd_t *pgd)
{
        struct mmuext_op op;

        pgd = __user_pgd(pgd);
        op.cmd = MMUEXT_NEW_USER_BASEPTR;
        op.arg1.mfn = pgd ? virt_to_mfn(pgd) : 0;
        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
#endif

void xen_tlb_flush(void)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL(xen_tlb_flush);

void xen_invlpg(unsigned long ptr)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_INVLPG_LOCAL;
        op.arg1.linear_addr = ptr & PAGE_MASK;
        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL(xen_invlpg);

#ifdef CONFIG_SMP

void xen_tlb_flush_all(void)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_TLB_FLUSH_ALL;
        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_tlb_flush_all);

void xen_tlb_flush_mask(const cpumask_t *mask)
{
        struct mmuext_op op;
        if (cpus_empty(*mask))
                return;
        op.cmd = MMUEXT_TLB_FLUSH_MULTI;
        set_xen_guest_handle(op.arg2.vcpumask, cpus_addr(*mask));
        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_tlb_flush_mask);

void xen_invlpg_all(unsigned long ptr)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_INVLPG_ALL;
        op.arg1.linear_addr = ptr & PAGE_MASK;
        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_invlpg_all);

void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr)
{
        struct mmuext_op op;
        if (cpus_empty(*mask))
                return;
        op.cmd = MMUEXT_INVLPG_MULTI;
        op.arg1.linear_addr = ptr & PAGE_MASK;
        set_xen_guest_handle(op.arg2.vcpumask, cpus_addr(*mask));
        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}
EXPORT_SYMBOL_GPL(xen_invlpg_mask);

#endif /* CONFIG_SMP */

#ifdef CONFIG_X86_64
#define NR_PGD_PIN_OPS 2
#else
#define NR_PGD_PIN_OPS 1
#endif

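/*
 * Pin/unpin a top-level page table with the hypervisor. On x86-64 the
 * kernel and user halves of the PGD are separate pages, so both are
 * pinned and unpinned together.
 */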
void xen_pgd_pin(pgd_t *pgd)
{
        struct mmuext_op op[NR_PGD_PIN_OPS];

        op[0].cmd = MMUEXT_PIN_L3_TABLE;
        op[0].arg1.mfn = virt_to_mfn(pgd);
#ifdef CONFIG_X86_64
        op[1].cmd = op[0].cmd = MMUEXT_PIN_L4_TABLE;
        pgd = __user_pgd(pgd);
        if (pgd)
                op[1].arg1.mfn = virt_to_mfn(pgd);
        else {
                op[1].cmd = MMUEXT_PIN_L3_TABLE;
                op[1].arg1.mfn = pfn_to_mfn(__pa_symbol(level3_user_pgt)
                                            >> PAGE_SHIFT);
        }
#endif
        if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0)
                BUG();
}

void xen_pgd_unpin(pgd_t *pgd)
{
        struct mmuext_op op[NR_PGD_PIN_OPS];

        op[0].cmd = MMUEXT_UNPIN_TABLE;
        op[0].arg1.mfn = virt_to_mfn(pgd);
#ifdef CONFIG_X86_64
        pgd = __user_pgd(pgd);
        BUG_ON(!pgd);
        op[1].cmd = MMUEXT_UNPIN_TABLE;
        op[1].arg1.mfn = virt_to_mfn(pgd);
#endif
        if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0)
                BUG();
}

void xen_set_ldt(const void *ptr, unsigned int ents)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_SET_LDT;
        op.arg1.linear_addr = (unsigned long)ptr;
        op.arg2.nr_ents     = ents;
        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

/* Protected by balloon_lock. */
#define INIT_CONTIG_ORDER 6 /* 256kB */
static unsigned int __read_mostly max_contig_order = INIT_CONTIG_ORDER;
static unsigned long __initdata init_df[1U << INIT_CONTIG_ORDER];
static unsigned long *__refdata discontig_frames = init_df;
static multicall_entry_t __initdata init_mc[1U << INIT_CONTIG_ORDER];
static multicall_entry_t *__refdata cr_mcl = init_mc;

static int __init init_contig_order(void)
{
        discontig_frames = vmalloc((sizeof(*discontig_frames)
                                    + sizeof(*cr_mcl)) << INIT_CONTIG_ORDER);
        BUG_ON(!discontig_frames);

        cr_mcl = (void *)(discontig_frames + (1U << INIT_CONTIG_ORDER));

        return 0;
}
early_initcall(init_contig_order);

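/*
 * Make sure the scratch arrays used for contiguous-region exchanges can
 * hold a 2^order extent, growing them (under balloon_lock) if necessary.
 */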
static int check_contig_order(unsigned int order)
{
#ifdef CONFIG_64BIT
        if (unlikely(order >= 32))
#else
        if (unlikely(order > BITS_PER_LONG - fls(sizeof(*cr_mcl))))
#endif
                return -ENOMEM;

        if (unlikely(order > max_contig_order)) {
                unsigned long *df = __vmalloc((sizeof(*discontig_frames)
                                               + sizeof(*cr_mcl)) << order,
                                              GFP_ATOMIC, PAGE_KERNEL);
                unsigned long flags;

                if (!df)
                        return -ENOMEM;
                balloon_lock(flags);
                if (order > max_contig_order) {
                        void *temp = discontig_frames;

                        discontig_frames = df;
                        cr_mcl = (void *)(df + (1U << order));
                        df = temp;

                        wmb();
                        max_contig_order = order;
                }
                balloon_unlock(flags);
                vfree(df);
                pr_info("Adjusted maximum contiguous region order to %u\n",
                        order);
        }

        return 0;
}

/* Ensure multi-page extents are contiguous in machine memory. */
int xen_create_contiguous_region(
        unsigned long vstart, unsigned int order, unsigned int address_bits)
{
        unsigned long *in_frames, out_frame, frame, flags;
        unsigned int   i;
        int            rc, success;
#ifdef CONFIG_64BIT
        pte_t         *ptep = NULL;
#endif
        struct xen_memory_exchange exchange = {
                .in = {
                        .nr_extents   = 1UL << order,
                        .extent_order = 0,
                        .domid        = DOMID_SELF
                },
                .out = {
                        .nr_extents   = 1,
                        .extent_order = order,
                        .address_bits = address_bits,
                        .domid        = DOMID_SELF
                }
        };

        /*
         * Currently an auto-translated guest will not perform I/O, nor will
         * it require PAE page directories below 4GB. Therefore any calls to
         * this function are redundant and can be ignored.
         */
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        rc = check_contig_order(order);
        if (unlikely(rc))
                return rc;

#ifdef CONFIG_64BIT
        if (unlikely(vstart > PAGE_OFFSET + MAXMEM)) {
                unsigned int level;

                if (vstart < __START_KERNEL_map
                    || vstart + (PAGE_SIZE << order) > _brk_end)
                        return -EINVAL;
                ptep = lookup_address((unsigned long)__va(__pa(vstart)),
                                      &level);
                if (ptep && pte_none(*ptep))
                        ptep = NULL;
                if (vstart < __START_KERNEL && ptep)
                        return -EINVAL;
                rc = check_contig_order(order + 1);
                if (unlikely(rc))
                        return rc;
        }
#else
        if (unlikely(vstart + (PAGE_SIZE << order) > (unsigned long)high_memory))
                return -EINVAL;
#endif

        set_xen_guest_handle(exchange.out.extent_start, &out_frame);

        xen_scrub_pages((void *)vstart, 1 << order);

        balloon_lock(flags);

        in_frames = discontig_frames;
        set_xen_guest_handle(exchange.in.extent_start, in_frames);

        /* 1. Zap current PTEs, remembering MFNs. */
        for (i = 0; i < (1U<<order); i++) {
                in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
                MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
                                        __pte_ma(0), 0);
#ifdef CONFIG_64BIT
                if (ptep)
                        MULTI_update_va_mapping(cr_mcl + i + (1U << order),
                                (unsigned long)__va(__pa(vstart)) + (i*PAGE_SIZE),
                                __pte_ma(0), 0);
#endif
                set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
                        INVALID_P2M_ENTRY);
        }
#ifdef CONFIG_64BIT
        if (ptep)
                i += i;
#endif
        if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
                BUG();

        /* 2. Get a new contiguous memory extent. */
        out_frame = __pa(vstart) >> PAGE_SHIFT;
        rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
        success = (exchange.nr_exchanged == (1UL << order));
        BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
        BUG_ON(success && (rc != 0));
#if CONFIG_XEN_COMPAT <= 0x030002
        if (unlikely(rc == -ENOSYS)) {
                /* Compatibility when XENMEM_exchange is unsupported. */
                if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
                                         &exchange.in) != (1UL << order))
                        BUG();
                success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
                                                &exchange.out) == 1);
                if (!success) {
                        /* Couldn't get special memory: fall back to normal. */
                        for (i = 0; i < (1U<<order); i++)
                                in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
                        if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
                                                 &exchange.in) != (1UL<<order))
                                BUG();
                }
        }
#endif

        /* 3. Map the new extent in place of old pages. */
        for (i = 0; i < (1U<<order); i++) {
                frame = success ? (out_frame + i) : in_frames[i];
                MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
                                        pfn_pte_ma(frame, PAGE_KERNEL), 0);
#ifdef CONFIG_64BIT
                if (ptep)
                        MULTI_update_va_mapping(cr_mcl + i + (1U << order),
                                (unsigned long)__va(__pa(vstart)) + (i*PAGE_SIZE),
                                pfn_pte_ma(frame, PAGE_KERNEL_RO), 0);
#endif
                set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
        }
#ifdef CONFIG_64BIT
        if (ptep)
                i += i;
#endif
        cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
                                                   ? UVMF_TLB_FLUSH|UVMF_ALL
                                                   : UVMF_INVLPG|UVMF_ALL;
        if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
                BUG();

        balloon_unlock(flags);

        return success ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(xen_create_contiguous_region);

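/*
 * Break a machine-contiguous extent set up by xen_create_contiguous_region()
 * back into arbitrary machine frames. If the exchange fails, the code below
 * falls back to returning the special memory to Xen page by page.
 */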
void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
{
        unsigned long *out_frames, in_frame, frame, flags;
        unsigned int   i;
        int            rc, success;
        struct xen_memory_exchange exchange = {
                .in = {
                        .nr_extents   = 1,
                        .extent_order = order,
                        .domid        = DOMID_SELF
                },
                .out = {
                        .nr_extents   = 1UL << order,
                        .extent_order = 0,
                        .domid        = DOMID_SELF
                }
        };

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return;

        if (unlikely(order > max_contig_order))
                return;

        set_xen_guest_handle(exchange.in.extent_start, &in_frame);

        xen_scrub_pages((void *)vstart, 1 << order);

        balloon_lock(flags);

        out_frames = discontig_frames;
        set_xen_guest_handle(exchange.out.extent_start, out_frames);

        /* 1. Find start MFN of contiguous extent. */
        in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);

        /* 2. Zap current PTEs. */
        for (i = 0; i < (1U<<order); i++) {
                MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
                                        __pte_ma(0), 0);
                set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
                        INVALID_P2M_ENTRY);
                out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
        }
        if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
                BUG();

        /* 3. Do the exchange for non-contiguous MFNs. */
        rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
        success = (exchange.nr_exchanged == 1);
        BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
        BUG_ON(success && (rc != 0));
#if CONFIG_XEN_COMPAT <= 0x030002
        if (unlikely(rc == -ENOSYS)) {
                /* Compatibility when XENMEM_exchange is unsupported. */
                if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
                                         &exchange.in) != 1)
                        BUG();
                if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
                                         &exchange.out) != (1UL << order))
                        BUG();
                success = 1;
        }
#endif

        /* 4. Map new pages in place of old pages. */
        for (i = 0; i < (1U<<order); i++) {
                frame = success ? out_frames[i] : (in_frame + i);
                MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
                                        pfn_pte_ma(frame, PAGE_KERNEL), 0);
                set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
        }

        cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
                                                   ? UVMF_TLB_FLUSH|UVMF_ALL
                                                   : UVMF_INVLPG|UVMF_ALL;
        if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
                BUG();

        balloon_unlock(flags);

        if (unlikely(!success)) {
                /* Try hard to get the special memory back to Xen. */
                exchange.in.extent_order = 0;
                set_xen_guest_handle(exchange.in.extent_start, &in_frame);

                for (i = 0; i < (1U<<order); i++) {
                        struct page *page = alloc_page(__GFP_HIGHMEM|__GFP_COLD);
                        unsigned long pfn;
                        mmu_update_t mmu;
                        unsigned int j = 0;

                        if (!page) {
                                pr_warn("Xen and kernel out of memory"
                                        " while trying to release an order"
                                        " %u contiguous region\n", order);
                                break;
                        }
                        pfn = page_to_pfn(page);

                        balloon_lock(flags);

                        if (!PageHighMem(page)) {
                                void *v = __va(pfn << PAGE_SHIFT);

                                xen_scrub_pages(v, 1);
                                MULTI_update_va_mapping(cr_mcl + j, (unsigned long)v,
                                                        __pte_ma(0), UVMF_INVLPG|UVMF_ALL);
                                ++j;
                        }
#ifdef CONFIG_XEN_SCRUB_PAGES
                        else {
                                xen_scrub_pages(kmap(page), 1);
                                kunmap(page);
                                kmap_flush_unused();
                        }
#endif

                        frame = pfn_to_mfn(pfn);
                        set_phys_to_machine(pfn, INVALID_P2M_ENTRY);

                        MULTI_update_va_mapping(cr_mcl + j, vstart,
                                                pfn_pte_ma(frame, PAGE_KERNEL),
                                                UVMF_INVLPG|UVMF_ALL);
                        ++j;

                        pfn = __pa(vstart) >> PAGE_SHIFT;
                        set_phys_to_machine(pfn, frame);
                        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                                mmu.ptr = ((uint64_t)frame << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
                                mmu.val = pfn;
                                cr_mcl[j].op = __HYPERVISOR_mmu_update;
                                cr_mcl[j].args[0] = (unsigned long)&mmu;
                                cr_mcl[j].args[1] = 1;
                                cr_mcl[j].args[2] = 0;
                                cr_mcl[j].args[3] = DOMID_SELF;
                                ++j;
                        }

                        cr_mcl[j].op = __HYPERVISOR_memory_op;
                        cr_mcl[j].args[0] = XENMEM_decrease_reservation;
                        cr_mcl[j].args[1] = (unsigned long)&exchange.in;

                        if (HYPERVISOR_multicall(cr_mcl, j + 1))
                                BUG();
                        BUG_ON(cr_mcl[j].result != 1);
                        while (j--)
                                BUG_ON(cr_mcl[j].result != 0);

                        balloon_unlock(flags);

                        free_empty_pages(&page, 1);

                        in_frame++;
                        vstart += PAGE_SIZE;
                }
        }
}
EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);

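/*
 * Boot-time variant of xen_create_contiguous_region(): the PFN range is not
 * mapped yet, so only the memory exchange and the p2m updates are needed.
 */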
int __init early_create_contiguous_region(unsigned long pfn,
                                          unsigned int order,
                                          unsigned int address_bits)
{
        unsigned long *in_frames = discontig_frames, out_frame = pfn;
        unsigned int i;
        int rc, success;
        struct xen_memory_exchange exchange = {
                .in = {
                        .nr_extents   = 1UL << order,
                        .extent_order = 0,
                        .domid        = DOMID_SELF
                },
                .out = {
                        .nr_extents   = 1,
                        .extent_order = order,
                        .address_bits = address_bits,
                        .domid        = DOMID_SELF
                }
        };

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        if (unlikely(order > max_contig_order))
                return -ENOMEM;

        for (i = 0; i < (1U << order); ++i) {
                in_frames[i] = pfn_to_mfn(pfn + i);
                set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
        }

        set_xen_guest_handle(exchange.in.extent_start, in_frames);
        set_xen_guest_handle(exchange.out.extent_start, &out_frame);

        rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
        success = (exchange.nr_exchanged == (1UL << order));
        BUG_ON(!success && (exchange.nr_exchanged || !rc));
        BUG_ON(success && rc);
#if CONFIG_XEN_COMPAT <= 0x030002
        if (unlikely(rc == -ENOSYS)) {
                /* Compatibility when XENMEM_exchange is unavailable. */
                if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
                                         &exchange.in) != (1UL << order))
                        BUG();
                success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
                                                &exchange.out) == 1);
                if (!success) {
                        for (i = 0; i < (1U << order); ++i)
                                in_frames[i] = pfn + i;
                        if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
                                                 &exchange.in) != (1UL << order))
                                BUG();
                }
        }
#endif

        for (i = 0; i < (1U << order); ++i, ++out_frame) {
                if (!success)
                        out_frame = in_frames[i];
                set_phys_to_machine(pfn + i, out_frame);
        }

        return success ? 0 : -ENOMEM;
}

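/*
 * Callback installed via SetPageForeign() by xen_limit_pages_to_max_mfn():
 * presumably invoked when the pages are finally released, it lifts the
 * address restriction again and hands the pages back to the allocator.
 */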
static void undo_limit_pages(struct page *pages, unsigned int order)
{
        BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
        BUG_ON(order > max_contig_order);
        xen_limit_pages_to_max_mfn(pages, order, 0);
        ClearPageForeign(pages);
        init_page_count(pages);
        __free_pages(pages, order);
}

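/*
 * Exchange any of the 2^order pages whose machine address is at or above
 * 1 << @address_bits for frames below that boundary (e.g. for DMA),
 * scrubbing them first. A bitmap records which pages actually had to be
 * replaced.
 */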
int xen_limit_pages_to_max_mfn(
        struct page *pages, unsigned int order, unsigned int address_bits)
{
        unsigned long flags, frame, *limit_map, _limit_map;
        unsigned long *in_frames, *out_frames;
        struct page *page;
        unsigned int i, n, nr_mcl;
        int rc, success;

        struct xen_memory_exchange exchange = {
                .in = {
                        .extent_order = 0,
                        .domid        = DOMID_SELF
                },
                .out = {
                        .extent_order = 0,
                        .address_bits = address_bits,
                        .domid        = DOMID_SELF
                }
        };

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        if (address_bits && address_bits < PAGE_SHIFT)
                return -EINVAL;

        rc = check_contig_order(order + 1);
        if (unlikely(rc))
                return rc;

        if (BITS_PER_LONG >> order) {
                limit_map = kmalloc(BITS_TO_LONGS(1U << order)
                                    * sizeof(*limit_map), GFP_ATOMIC);
                if (unlikely(!limit_map))
                        return -ENOMEM;
        } else
                limit_map = &_limit_map;

        if (address_bits)
                bitmap_zero(limit_map, 1U << order);
        else if (order) {
                BUILD_BUG_ON(sizeof(pages->index) != sizeof(*limit_map));
                for (i = 0; i < BITS_TO_LONGS(1U << order); ++i)
                        limit_map[i] = pages[i + 1].index;
        } else
                __set_bit(0, limit_map);

        /* 0. Scrub the pages. */
        for (i = 0, n = 0; i < 1U << order; i++) {
                page = &pages[i];
                if (address_bits) {
                        if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT)))
                                continue;
                        __set_bit(i, limit_map);
                }

                if (!PageHighMem(page))
                        xen_scrub_pages(page_address(page), 1);
#ifdef CONFIG_XEN_SCRUB_PAGES
                else {
                        xen_scrub_pages(kmap(page), 1);
                        kunmap(page);
                        ++n;
                }
#endif
        }
        if (bitmap_empty(limit_map, 1U << order)) {
                if (limit_map != &_limit_map)
                        kfree(limit_map);
                return 0;
        }

        if (n)
                kmap_flush_unused();

        balloon_lock(flags);

        in_frames = discontig_frames;
        set_xen_guest_handle(exchange.in.extent_start, in_frames);
        out_frames = in_frames + (1U << order);
        set_xen_guest_handle(exchange.out.extent_start, out_frames);

        /* 1. Zap current PTEs (if any), remembering MFNs. */
        for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
                if (!test_bit(i, limit_map))
                        continue;
                page = &pages[i];

                out_frames[n] = page_to_pfn(page);
                in_frames[n] = pfn_to_mfn(out_frames[n]);

                if (!PageHighMem(page))
                        MULTI_update_va_mapping(cr_mcl + nr_mcl++,
                                                (unsigned long)page_address(page),
                                                __pte_ma(0), 0);

                set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY);
                ++n;
        }
        if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
                BUG();

        /* 2. Get new memory below the required limit. */
        exchange.in.nr_extents = n;
        exchange.out.nr_extents = n;
        rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
        success = (exchange.nr_exchanged == n);
        BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
        BUG_ON(success && (rc != 0));
#if CONFIG_XEN_COMPAT <= 0x030002
        if (unlikely(rc == -ENOSYS)) {
                /* Compatibility when XENMEM_exchange is unsupported. */
                if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
                                         &exchange.in) != n)
                        BUG();
                if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
                                         &exchange.out) != n)
                        BUG();
                success = 1;
        }
#endif

        /* 3. Map the new pages in place of old pages. */
        for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
                if (!test_bit(i, limit_map))
                        continue;
                page = &pages[i];

                frame = success ? out_frames[n] : in_frames[n];

                if (!PageHighMem(page))
                        MULTI_update_va_mapping(cr_mcl + nr_mcl++,
                                                (unsigned long)page_address(page),
                                                pfn_pte_ma(frame, PAGE_KERNEL), 0);

                set_phys_to_machine(page_to_pfn(page), frame);
                ++n;
        }
        if (nr_mcl) {
                cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order
                                                                ? UVMF_TLB_FLUSH|UVMF_ALL
                                                                : UVMF_INVLPG|UVMF_ALL;
                if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
                        BUG();
        }

        balloon_unlock(flags);

        if (success && address_bits) {
                if (order) {
                        BUILD_BUG_ON(sizeof(*limit_map) != sizeof(pages->index));
                        for (i = 0; i < BITS_TO_LONGS(1U << order); ++i)
                                pages[i + 1].index = limit_map[i];
                }
                SetPageForeign(pages, undo_limit_pages);
        }

        if (limit_map != &_limit_map)
                kfree(limit_map);

        return success ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);

bool hypervisor_oom(void)
{
        WARN_ONCE(1, "Hypervisor is out of memory");
        return false; /* temp */
}

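/*
 * Minimal walk_system_ram_range() for this configuration: system RAM is
 * assumed to form a single range up to max_pfn, so the callback is invoked
 * at most once.
 */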
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                          void *arg, int (*func)(unsigned long, unsigned long,
                                                 void *))
{
        return start_pfn < max_pfn && nr_pages
               ? func(start_pfn, min(max_pfn - start_pfn, nr_pages), arg)
               : -1;
}

int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
{
        maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry);
        return HYPERVISOR_update_descriptor(mach_lp, *(const u64 *)desc);
}

int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
                    int type)
{
        maddr_t mach_gp = arbitrary_virt_to_machine(gdt + entry);
        return HYPERVISOR_update_descriptor(mach_gp, *(const u64 *)desc);
}