- Update Xen patches to 3.3-rc5 and c/s 1157.
[linux-flexiantxendom0-3.2.10.git] / arch/x86/mm/pgtable-xen.c
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/smp.h>
#include <xen/features.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/hypervisor.h>
#include <asm/mmu_context.h>

#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO

#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
#else
#define PGALLOC_USER_GFP 0
#endif

gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;

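/*
 * Allocate a zeroed page for a kernel page table.  The new page is
 * converted to a read-only mapping in the direct map (skipped when the
 * hypervisor provides XENFEAT_writable_page_tables) so that Xen will
 * accept it as a page-table page.
 */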
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
        pte_t *pte = (pte_t *)__get_free_page(PGALLOC_GFP);
        if (pte)
                make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
        return pte;
}

static void _pte_free(struct page *page, unsigned int order)
{
        BUG_ON(order);
        __pte_free(page);
}

pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
        struct page *pte;

        pte = alloc_pages(__userpte_alloc_gfp, 0);
        if (pte) {
                pgtable_page_ctor(pte);
                SetPageForeign(pte, _pte_free);
                init_page_count(pte);
        }
        return pte;
}

static int __init setup_userpte(char *arg)
{
        if (!arg)
                return -EINVAL;

        /*
         * "userpte=nohigh" disables allocation of user pagetables in
         * high memory.
         */
        if (strcmp(arg, "nohigh") == 0)
                __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
        else
                return -EINVAL;
        return 0;
}
early_param("userpte", setup_userpte);

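/*
 * Free a pte page previously handed out by pte_alloc_one().  A pinned
 * lowmem page is first remapped read/write (PAGE_KERNEL) through the
 * hypervisor; a pinned highmem page only needs its flag cleared.
 */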
void __pte_free(pgtable_t pte)
{
        if (!PageHighMem(pte)) {
                if (PagePinned(pte)) {
                        unsigned long pfn = page_to_pfn(pte);

                        if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
                                                         pfn_pte(pfn,
                                                                 PAGE_KERNEL),
                                                         0))
                                BUG();
                        ClearPagePinned(pte);
                }
        } else
#ifdef CONFIG_HIGHPTE
                ClearPagePinned(pte);
#else
                BUG();
#endif

        ClearPageForeign(pte);
        init_page_count(pte);
        pgtable_page_dtor(pte);
        __free_page(pte);
}

void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        pgtable_page_dtor(pte);
        paravirt_release_pte(page_to_pfn(pte));
        tlb_remove_page(tlb, pte);
}

#if PAGETABLE_LEVELS > 2
static void _pmd_free(struct page *page, unsigned int order)
{
        BUG_ON(order);
        __pmd_free(page);
}

pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
        struct page *pmd;

        pmd = alloc_pages(PGALLOC_GFP, 0);
        if (!pmd)
                return NULL;
        SetPageForeign(pmd, _pmd_free);
        init_page_count(pmd);
        return page_address(pmd);
}

void __pmd_free(pgtable_t pmd)
{
        if (PagePinned(pmd)) {
                unsigned long pfn = page_to_pfn(pmd);

                if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
                                                 pfn_pte(pfn, PAGE_KERNEL),
                                                 0))
                        BUG();
                ClearPagePinned(pmd);
        }

        ClearPageForeign(pmd);
        init_page_count(pmd);
        __free_page(pmd);
}

void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        tlb_remove_page(tlb, virt_to_page(pmd));
}

#if PAGETABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        tlb_remove_page(tlb, virt_to_page(pud));
}
#endif  /* PAGETABLE_LEVELS > 3 */
#endif  /* PAGETABLE_LEVELS > 2 */

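/*
 * Take (lock != 0) or drop (lock == 0) mm->page_table_lock together with
 * every split pte lock of the mm, so that pinning or unpinning the page
 * tables cannot race with pte updates.  See the comment in the body for
 * why this cannot deadlock.
 */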
static void _pin_lock(struct mm_struct *mm, int lock)
{
        if (lock)
                spin_lock(&mm->page_table_lock);
#if USE_SPLIT_PTLOCKS
        /* While mm->page_table_lock protects us against insertions and
         * removals of higher level page table pages, it doesn't protect
         * against updates of pte-s. Such updates, however, require the
         * pte pages to be in consistent state (unpinned+writable or
         * pinned+readonly). The pinning and attribute changes, however,
         * cannot be done atomically, which is why such updates must be
         * prevented from happening concurrently.
         * Note that no pte lock can ever elsewhere be acquired nesting
         * with an already acquired one in the same mm, or with the mm's
         * page_table_lock already acquired, as that would break in the
         * non-split case (where all these are actually resolving to the
         * one page_table_lock). Thus acquiring all of them here is not
         * going to result in dead locks, and the order of acquires
         * doesn't matter.
         */
        {
                pgd_t *pgd = mm->pgd;
                unsigned g;

                for (g = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
                        pud_t *pud;
                        unsigned u;

                        if (pgd_none(*pgd))
                                continue;
                        pud = pud_offset(pgd, 0);
                        for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                                pmd_t *pmd;
                                unsigned m;

                                if (pud_none(*pud))
                                        continue;
                                pmd = pmd_offset(pud, 0);
                                for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
                                        spinlock_t *ptl;

                                        if (pmd_none(*pmd))
                                                continue;
                                        ptl = pte_lockptr(0, pmd);
                                        if (lock)
                                                spin_lock(ptl);
                                        else
                                                spin_unlock(ptl);
                                }
                        }
                }
        }
#endif
        if (!lock)
                spin_unlock(&mm->page_table_lock);
}
#define pin_lock(mm) _pin_lock(mm, 1)
#define pin_unlock(mm) _pin_lock(mm, 0)

#define PIN_BATCH sizeof(void *)
static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);

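/*
 * Queue an update_va_mapping multicall that remaps one page-table page
 * with the given protection, tracking the pinned state via the Pinned
 * page flag.  The per-CPU batch is flushed once PIN_BATCH entries have
 * accumulated; the (possibly reset) batch index is returned.  Highmem
 * pages only get their flag updated here.
 */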
static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
                                             unsigned int cpu, unsigned int seq)
{
        unsigned long pfn = page_to_pfn(page);

        if (pgprot_val(flags) & _PAGE_RW)
                ClearPagePinned(page);
        else
                SetPagePinned(page);
        if (PageHighMem(page))
                return seq;
        MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
                                (unsigned long)__va(pfn << PAGE_SHIFT),
                                pfn_pte(pfn, flags), 0);
        if (unlikely(++seq == PIN_BATCH)) {
                if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
                                                        PIN_BATCH, NULL)))
                        BUG();
                seq = 0;
        }

        return seq;
}

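/*
 * Walk the page-table pages reachable from pgd_base (the user range,
 * plus the kernel pmds on PAE) and remap each with 'flags' (read-only
 * when pinning, writable when unpinning), finishing with the pgd
 * page(s) themselves and a TLB flush.
 */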
static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
{
        pgd_t       *pgd = pgd_base;
        pud_t       *pud;
        pmd_t       *pmd;
        int          g, u, m;
        unsigned int cpu, seq;
        multicall_entry_t *mcl;

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return;

        cpu = get_cpu();

        /*
         * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
         * may not be the 'current' task's pagetables (e.g., current may be
         * 32-bit, but the pagetables may be for a 64-bit task).
         * Subtracting 1 from TASK_SIZE_MAX means the loop limit is correct
         * regardless of whether TASK_SIZE_MAX is a multiple of PGDIR_SIZE.
         */
        for (g = 0, seq = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
                if (PTRS_PER_PUD > 1) /* not folded */
                        seq = pgd_walk_set_prot(virt_to_page(pud), flags, cpu, seq);
                for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                        if (pud_none(*pud))
                                continue;
                        pmd = pmd_offset(pud, 0);
                        if (PTRS_PER_PMD > 1) /* not folded */
                                seq = pgd_walk_set_prot(virt_to_page(pmd), flags, cpu, seq);
                        for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
                                if (pmd_none(*pmd))
                                        continue;
                                seq = pgd_walk_set_prot(pmd_page(*pmd), flags, cpu, seq);
                        }
                }
        }

#ifdef CONFIG_X86_PAE
        for (; g < PTRS_PER_PGD; g++, pgd++) {
                BUG_ON(pgd_none(*pgd));
                pud = pud_offset(pgd, 0);
                BUG_ON(pud_none(*pud));
                pmd = pmd_offset(pud, 0);
                seq = pgd_walk_set_prot(virt_to_page(pmd), flags, cpu, seq);
        }
#endif

        mcl = per_cpu(pb_mcl, cpu);
#ifdef CONFIG_X86_64
        if (unlikely(seq > PIN_BATCH - 2)) {
                if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
                        BUG();
                seq = 0;
        }
        pgd = __user_pgd(pgd_base);
        BUG_ON(!pgd);
        MULTI_update_va_mapping(mcl + seq,
               (unsigned long)pgd,
               pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, flags),
               0);
        MULTI_update_va_mapping(mcl + seq + 1,
               (unsigned long)pgd_base,
               pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
               UVMF_TLB_FLUSH);
        if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
                BUG();
#else
        if (likely(seq != 0)) {
                MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
                        (unsigned long)pgd_base,
                        pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
                        UVMF_TLB_FLUSH);
                if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
                                                        seq + 1, NULL)))
                        BUG();
        } else if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
                        pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
                        UVMF_TLB_FLUSH))
                BUG();
#endif

        put_cpu();
}

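/*
 * Mark the page-table pages of init_mm as pinned, skipping the
 * hypervisor-owned range of the address space.  On x86-64 the user-mode
 * level3 table is marked as well.
 */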
void __init xen_init_pgd_pin(void)
{
        pgd_t       *pgd = init_mm.pgd;
        pud_t       *pud;
        pmd_t       *pmd;
        unsigned int g, u, m;

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return;

        SetPagePinned(virt_to_page(pgd));
        for (g = 0; g < PTRS_PER_PGD; g++, pgd++) {
#ifndef CONFIG_X86_PAE
                if (g >= pgd_index(HYPERVISOR_VIRT_START)
                    && g <= pgd_index(HYPERVISOR_VIRT_END - 1))
                        continue;
#endif
                if (!pgd_present(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
                if (PTRS_PER_PUD > 1) /* not folded */
                        SetPagePinned(virt_to_page(pud));
                for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                        if (!pud_present(*pud) || pud_large(*pud))
                                continue;
                        pmd = pmd_offset(pud, 0);
                        if (PTRS_PER_PMD > 1) /* not folded */
                                SetPagePinned(virt_to_page(pmd));
                        for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
#ifdef CONFIG_X86_PAE
                                if (g == pgd_index(HYPERVISOR_VIRT_START)
                                    && m >= pmd_index(HYPERVISOR_VIRT_START))
                                        continue;
#endif
                                if (!pmd_present(*pmd) || pmd_large(*pmd))
                                        continue;
                                SetPagePinned(pmd_page(*pmd));
                        }
                }
        }
#ifdef CONFIG_X86_64
        SetPagePinned(virt_to_page(level3_user_pgt));
#endif
}

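/*
 * Pin a page-table tree: write-protect every constituent page via
 * pgd_walk(), drop unused highmem kmaps, and hand the pgd to the
 * hypervisor for pinning (xen_pgd_pin()).  __pgd_unpin() is the exact
 * reverse.
 */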
static void __pgd_pin(pgd_t *pgd)
{
        pgd_walk(pgd, PAGE_KERNEL_RO);
        kmap_flush_unused();
        xen_pgd_pin(pgd);
        SetPagePinned(virt_to_page(pgd));
}

static void __pgd_unpin(pgd_t *pgd)
{
        xen_pgd_unpin(pgd);
        pgd_walk(pgd, PAGE_KERNEL);
        ClearPagePinned(virt_to_page(pgd));
}

static void pgd_test_and_unpin(pgd_t *pgd)
{
        if (PagePinned(virt_to_page(pgd)))
                __pgd_unpin(pgd);
}

void mm_pin(struct mm_struct *mm)
{
        if (xen_feature(XENFEAT_writable_page_tables))
                return;

        pin_lock(mm);
        __pgd_pin(mm->pgd);
        pin_unlock(mm);
}

void mm_unpin(struct mm_struct *mm)
{
        if (xen_feature(XENFEAT_writable_page_tables))
                return;

        pin_lock(mm);
        __pgd_unpin(mm->pgd);
        pin_unlock(mm);
}

void mm_pin_all(void)
{
        struct page *page;

        if (xen_feature(XENFEAT_writable_page_tables))
                return;

        /*
         * Allow uninterrupted access to the pgd_list. Also protects
         * __pgd_pin() by ensuring preemption is disabled.
         * All other CPUs must be at a safe point (e.g., in stop_machine
         * or offlined entirely).
         */
        BUG_ON(!irqs_disabled());
        spin_lock(&pgd_lock);
        list_for_each_entry(page, &pgd_list, lru) {
                if (!PagePinned(page))
                        __pgd_pin((pgd_t *)page_address(page));
        }
        spin_unlock(&pgd_lock);
}

void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        if (!PagePinned(virt_to_page(mm->pgd)))
                mm_pin(mm);
}

/*
 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() *much*
 * faster this way, as no hypercalls are needed for the page table updates.
 */
static void leave_active_mm(struct task_struct *tsk, struct mm_struct *mm)
        __releases(tsk->alloc_lock)
{
        if (tsk->active_mm == mm) {
                tsk->active_mm = &init_mm;
                atomic_inc(&init_mm.mm_count);

                switch_mm(mm, &init_mm, tsk);

                if (atomic_dec_and_test(&mm->mm_count))
                        BUG();
        }

        task_unlock(tsk);
}

static void _leave_active_mm(void *mm)
{
        struct task_struct *tsk = current;

        if (spin_trylock(&tsk->alloc_lock))
                leave_active_mm(tsk, mm);
}

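/*
 * Called when an mm is being torn down: force every CPU still running
 * on this mm over to init_mm, then unpin the pgd if nothing else holds
 * a reference and no foreign mappings are present, so the remaining
 * teardown can modify the page tables without hypercalls.
 */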
void arch_exit_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        task_lock(tsk);
        leave_active_mm(tsk, mm);

        preempt_disable();
        smp_call_function_many(mm_cpumask(mm), _leave_active_mm, mm, 1);
        preempt_enable();

        if (PagePinned(virt_to_page(mm->pgd))
            && atomic_read(&mm->mm_count) == 1
            && !mm->context.has_foreign_mappings)
                mm_unpin(mm);
}

static inline void pgd_list_add(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_add(&page->lru, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_del(&page->lru);
}

#define UNSHARED_PTRS_PER_PGD                           \
        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)

static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
        virt_to_page(pgd)->index = (pgoff_t)mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return (struct mm_struct *)page->index;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_test_and_unpin(pgd);

        /* If the pgd points to a shared pagetable level (either the
           ptes in non-PAE, or shared PMD in PAE), then just copy the
           references from swapper_pg_dir. */
        if (PAGETABLE_LEVELS == 2 ||
            (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
            PAGETABLE_LEVELS == 4) {
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);
        }

#ifdef CONFIG_X86_64
        /* set level3_user_pgt for vsyscall area */
        __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
                __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
#endif

        /* list required to sync kernel mapping updates */
        if (!SHARED_KERNEL_PMD) {
                pgd_set_mm(pgd, mm);
                pgd_list_add(pgd);
        }
}

static void pgd_dtor(pgd_t *pgd)
{
        if (!SHARED_KERNEL_PMD) {
                spin_lock(&pgd_lock);
                pgd_list_del(pgd);
                spin_unlock(&pgd_lock);
        }

        pgd_test_and_unpin(pgd);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- wli
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
#define PREALLOCATED_PMDS       UNSHARED_PTRS_PER_PGD

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        pud_t pud = __pud(__pa(pmd) | _PAGE_PRESENT);

        paravirt_alloc_pmd(mm, page_to_pfn(virt_to_page(pmd)));

        if (likely(!PagePinned(virt_to_page(pudp)))) {
                *pudp = pud;
                return;
        }

        set_pud(pudp, pud);

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS       0

#endif  /* CONFIG_X86_PAE */

static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
{
        int i;

#ifdef CONFIG_X86_PAE
        if (contig)
                xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
#endif

        for (i = 0; i < PREALLOCATED_PMDS; i++)
                if (pmds[i])
                        pmd_free(mm, pmds[i]);
}

static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
{
        int i;
        bool failed = false;

        for (i = 0; i < PREALLOCATED_PMDS; i++) {
                pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
                if (pmd == NULL)
                        failed = true;
                pmds[i] = pmd;
        }

        if (failed) {
                free_pmds(pmds, mm, false);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++) {
                pgd_t pgd = pgdp[i];

                if (__pgd_val(pgd) != 0) {
                        pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

                        pgdp[i] = xen_make_pgd(0);

                        paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
                        pmd_free(mm, pmd);
                }
        }

#ifdef CONFIG_X86_PAE
        if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
                xen_destroy_contiguous_region((unsigned long)pgdp, 0);
#endif
}

static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        pud_t *pud;
        unsigned long addr;
        int i;

        if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
                return;

        pud = pud_offset(pgd, 0);
        for (addr = i = 0; i < PREALLOCATED_PMDS;
             i++, pud++, addr += PUD_SIZE) {
                pmd_t *pmd = pmds[i];

                if (i >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd,
                               (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                /* It is safe to poke machine addresses of pmds under the pgd_lock. */
                pud_populate(mm, pud, pmd);
        }
}

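/*
 * On x86-64 every pgd needs a second, user-mode pgd page alongside the
 * kernel one.  It is allocated here and stored in the kernel pgd page's
 * page_private field; if the allocation fails the kernel pgd is freed
 * and NULL is returned.  On 32-bit both helpers are no-ops.
 */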
static inline pgd_t *user_pgd_alloc(pgd_t *pgd)
{
#ifdef CONFIG_X86_64
        if (pgd) {
                pgd_t *upgd = (void *)__get_free_page(PGALLOC_GFP);

                if (upgd)
                        set_page_private(virt_to_page(pgd),
                                         (unsigned long)upgd);
                else {
                        free_page((unsigned long)pgd);
                        pgd = NULL;
                }
        }
#endif
        return pgd;
}

static inline void user_pgd_free(pgd_t *pgd)
{
#ifdef CONFIG_X86_64
        free_page(page_private(virt_to_page(pgd)));
#endif
}

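/*
 * Allocate and initialise a new pgd: allocate the kernel (and, on
 * x86-64, the user) pgd page, preallocate PAE pmds if required, then,
 * under pgd_lock, move a PAE pgd below 4GB when the hypervisor lacks
 * XENFEAT_pae_pgdir_above_4gb and construct/prepopulate the pgd so that
 * walkers of pgd_list never see it half-built.
 */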
pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *pmds[PREALLOCATED_PMDS];

        pgd = user_pgd_alloc((void *)__get_free_page(PGALLOC_GFP));

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        if (preallocate_pmds(pmds, mm) != 0)
                goto out_free_pgd;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

#ifdef CONFIG_X86_PAE
        /* Protect against save/restore: move below 4GB under pgd_lock. */
        if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
            && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
                spin_unlock(&pgd_lock);
                goto out_free_pmds;
        }
#endif

        pgd_ctor(mm, pgd);
        pgd_prepopulate_pmd(mm, pgd, pmds);

        spin_unlock(&pgd_lock);

        return pgd;

out_free_pmds:
        free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
out_free_pgd:
        user_pgd_free(pgd);
        free_page((unsigned long)pgd);
out:
        return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        /*
         * After this the pgd should not be pinned for the duration of this
         * function's execution. We should never sleep and thus never race:
         *  1. User pmds will not become write-protected under our feet due
         *     to a concurrent mm_pin_all().
         *  2. The machine addresses in PGD entries will not become invalid
         *     due to a concurrent save/restore.
         */
        pgd_dtor(pgd);

        pgd_mop_up_pmds(mm, pgd);
        paravirt_pgd_free(mm, pgd);
        user_pgd_free(pgd);
        free_page((unsigned long)pgd);
}

/* blktap and gntdev need this, as otherwise they would implicitly (and
 * needlessly, as they never use it) reference init_mm. */
pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
                                  unsigned long addr, pte_t *ptep, int full)
{
        return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm,
                                       addr, ptep, full);
}
EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);

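/*
 * Set access/dirty flags on a pte.  For the current mm the new entry is
 * installed with a single update_va_mapping hypercall that also
 * invalidates the address on the CPUs using the mm; otherwise the entry
 * is updated via xen_l1_entry_update() followed by a flush_tlb_page().
 */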
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        int changed = !pte_same(*ptep, entry);

        if (changed && dirty) {
                if (likely(vma->vm_mm == current->mm)) {
                        if (HYPERVISOR_update_va_mapping(address,
                                entry,
                                uvm_multi(mm_cpumask(vma->vm_mm))|UVMF_INVLPG))
                                BUG();
                } else {
                        xen_l1_entry_update(ptep, entry);
                        flush_tlb_page(vma, address);
                }
        }

        return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (changed && dirty) {
                *pmdp = entry;
                pmd_update_defer(vma->vm_mm, address, pmdp);
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        }

        return changed;
}
#endif

int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        int ret = 0;

        if (pte_young(*ptep))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *) &ptep->pte);

        if (ret)
                pte_update(vma->vm_mm, addr, ptep);

        return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        int ret = 0;

        if (pmd_young(*pmdp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pmdp);

        if (ret)
                pmd_update(vma->vm_mm, addr, pmdp);

        return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        pte_t pte = *ptep;
        int young = pte_young(pte);

        pte = pte_mkold(pte);
        if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
                ptep_set_access_flags(vma, address, ptep, pte, young);
        else if (young)
                ptep->pte_low = pte.pte_low;

        return young;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return young;
}

void pmdp_splitting_flush(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp)
{
        int set;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
                                (unsigned long *)pmdp);
        if (set) {
                pmd_update(vma->vm_mm, address, pmdp);
                /* need tlb flush only to serialize against gup-fast */
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        }
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        BUG_ON(fixmaps_set > 0);
        printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
               (int)-reserve);
        __FIXADDR_TOP = -reserve - PAGE_SIZE;
#endif
}

int fixmaps_set;

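/*
 * Install a fixmap entry.  On 64-bit the vsyscall/VVAR pages are entered
 * into the user-visible level3_user_pgt and a few early fixmaps are
 * written straight into level1_fixmap_pgt via xen_l1_entry_update();
 * everything else goes through set_pte_vaddr(), with the default case
 * treating 'phys' as a machine address (pfn_pte_ma()).
 */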
void xen_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
{
        unsigned long address = __fix_to_virt(idx);
        pte_t pte;

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }

        switch (idx) {
#ifdef CONFIG_X86_64
        extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];

        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
        case VVAR_PAGE:
                pte = pfn_pte(phys >> PAGE_SHIFT, flags);
                set_pte_vaddr_pud(level3_user_pgt, address, pte);
                break;
        case FIX_EARLYCON_MEM_BASE:
        case FIX_SHARED_INFO:
        case FIX_ISAMAP_END ... FIX_ISAMAP_BEGIN:
                xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
                                    pfn_pte_ma(phys >> PAGE_SHIFT, flags));
                fixmaps_set++;
                return;
#else
        case FIX_WP_TEST:
        case FIX_VDSO:
                pte = pfn_pte(phys >> PAGE_SHIFT, flags);
                break;
#endif
        default:
                pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
                break;
        }
        set_pte_vaddr(address, pte);
        fixmaps_set++;
}