- Update Xen patches to 3.3-rc5 and c/s 1157.
[linux-flexiantxendom0-3.2.10.git] / arch/x86/mm/pgtable-xen.c
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/smp.h>
#include <xen/features.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/hypervisor.h>
#include <asm/mmu_context.h>

#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO

#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
#else
#define PGALLOC_USER_GFP 0
#endif

gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;

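/*
 * Allocate a zeroed page for a kernel page table.  The new page is
 * converted to a read-only mapping in the direct map (skipped when the
 * hypervisor provides XENFEAT_writable_page_tables) so that Xen will
 * accept it as a page-table page.
 */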
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
        pte_t *pte = (pte_t *)__get_free_page(PGALLOC_GFP);
        if (pte)
                make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
        return pte;
}

static void _pte_free(struct page *page, unsigned int order)
{
        BUG_ON(order);
        __pte_free(page);
}

pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
        struct page *pte;

        pte = alloc_pages(__userpte_alloc_gfp, 0);
        if (pte) {
                pgtable_page_ctor(pte);
                SetPageForeign(pte, _pte_free);
                init_page_count(pte);
        }
        return pte;
}

static int __init setup_userpte(char *arg)
{
        if (!arg)
                return -EINVAL;

        /*
         * "userpte=nohigh" disables allocation of user pagetables in
         * high memory.
         */
        if (strcmp(arg, "nohigh") == 0)
                __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
        else
                return -EINVAL;
        return 0;
}
early_param("userpte", setup_userpte);

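/*
 * Free a pte page previously handed out by pte_alloc_one().  A pinned
 * lowmem page is first remapped read/write (PAGE_KERNEL) through the
 * hypervisor; a pinned highmem page only needs its flag cleared.
 */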
void __pte_free(pgtable_t pte)
{
        if (!PageHighMem(pte)) {
                if (PagePinned(pte)) {
                        unsigned long pfn = page_to_pfn(pte);

                        if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
                                                         pfn_pte(pfn,
                                                                 PAGE_KERNEL),
                                                         0))
                                BUG();
                        ClearPagePinned(pte);
                }
        } else
#ifdef CONFIG_HIGHPTE
                ClearPagePinned(pte);
#else
                BUG();
#endif

        ClearPageForeign(pte);
        init_page_count(pte);
        pgtable_page_dtor(pte);
        __free_page(pte);
}

void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        pgtable_page_dtor(pte);
        paravirt_release_pte(page_to_pfn(pte));
        tlb_remove_page(tlb, pte);
}

#if PAGETABLE_LEVELS > 2
static void _pmd_free(struct page *page, unsigned int order)
{
        BUG_ON(order);
        __pmd_free(page);
}

pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
        struct page *pmd;

        pmd = alloc_pages(PGALLOC_GFP, 0);
        if (!pmd)
                return NULL;
        SetPageForeign(pmd, _pmd_free);
        init_page_count(pmd);
        return page_address(pmd);
}

void __pmd_free(pgtable_t pmd)
{
        if (PagePinned(pmd)) {
                unsigned long pfn = page_to_pfn(pmd);

                if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
                                                 pfn_pte(pfn, PAGE_KERNEL),
                                                 0))
                        BUG();
                ClearPagePinned(pmd);
        }

        ClearPageForeign(pmd);
        init_page_count(pmd);
        __free_page(pmd);
}

void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        tlb_remove_page(tlb, virt_to_page(pmd));
}

#if PAGETABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        tlb_remove_page(tlb, virt_to_page(pud));
}
#endif  /* PAGETABLE_LEVELS > 3 */
#endif  /* PAGETABLE_LEVELS > 2 */

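/*
 * Take (lock != 0) or drop (lock == 0) mm->page_table_lock together with
 * every split pte lock of the mm, so that pinning or unpinning the page
 * tables cannot race with pte updates.  See the comment in the body for
 * why this cannot deadlock.
 */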
static void _pin_lock(struct mm_struct *mm, int lock)
{
        if (lock)
                spin_lock(&mm->page_table_lock);
#if USE_SPLIT_PTLOCKS
        /* While mm->page_table_lock protects us against insertions and
         * removals of higher level page table pages, it doesn't protect
         * against updates of pte-s. Such updates, however, require the
         * pte pages to be in consistent state (unpinned+writable or
         * pinned+readonly). The pinning and attribute changes, however,
         * cannot be done atomically, which is why such updates must be
         * prevented from happening concurrently.
         * Note that no pte lock can ever elsewhere be acquired nesting
         * with an already acquired one in the same mm, or with the mm's
         * page_table_lock already acquired, as that would break in the
         * non-split case (where all these are actually resolving to the
         * one page_table_lock). Thus acquiring all of them here is not
         * going to result in dead locks, and the order of acquires
         * doesn't matter.
         */
        {
                pgd_t *pgd = mm->pgd;
                unsigned g;

                for (g = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
                        pud_t *pud;
                        unsigned u;

                        if (pgd_none(*pgd))
                                continue;
                        pud = pud_offset(pgd, 0);
                        for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                                pmd_t *pmd;
                                unsigned m;

                                if (pud_none(*pud))
                                        continue;
                                pmd = pmd_offset(pud, 0);
                                for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
                                        spinlock_t *ptl;

                                        if (pmd_none(*pmd))
                                                continue;
                                        ptl = pte_lockptr(0, pmd);
                                        if (lock)
                                                spin_lock(ptl);
                                        else
                                                spin_unlock(ptl);
                                }
                        }
                }
        }
#endif
        if (!lock)
                spin_unlock(&mm->page_table_lock);
}
#define pin_lock(mm) _pin_lock(mm, 1)
#define pin_unlock(mm) _pin_lock(mm, 0)

#define PIN_BATCH sizeof(void *)
static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);

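/*
 * Queue an update_va_mapping multicall that remaps one page-table page
 * with the given protection, tracking the pinned state via the Pinned
 * page flag.  The per-CPU batch is flushed once PIN_BATCH entries have
 * accumulated; the (possibly reset) batch index is returned.  Highmem
 * pages only get their flag updated here.
 */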
static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
                                             unsigned int cpu, unsigned int seq)
{
        unsigned long pfn = page_to_pfn(page);

        if (pgprot_val(flags) & _PAGE_RW)
                ClearPagePinned(page);
        else
                SetPagePinned(page);
        if (PageHighMem(page))
                return seq;
        MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
                                (unsigned long)__va(pfn << PAGE_SHIFT),
                                pfn_pte(pfn, flags), 0);
        if (unlikely(++seq == PIN_BATCH)) {
                if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
                                                        PIN_BATCH, NULL)))
                        BUG();
                seq = 0;
        }

        return seq;
}

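/*
 * Walk the page-table pages reachable from pgd_base (the user range,
 * plus the kernel pmds on PAE) and remap each with 'flags' (read-only
 * when pinning, writable when unpinning), finishing with the pgd
 * page(s) themselves and a TLB flush.
 */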
static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
{
        pgd_t       *pgd = pgd_base;
        pud_t       *pud;
        pmd_t       *pmd;
        int          g, u, m;
        unsigned int cpu, seq;
        multicall_entry_t *mcl;

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return;

        cpu = get_cpu();

        /*
         * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
         * may not be the 'current' task's pagetables (e.g., current may be
         * 32-bit, but the pagetables may be for a 64-bit task).
         * Subtracting 1 from TASK_SIZE_MAX means the loop limit is correct
         * regardless of whether TASK_SIZE_MAX is a multiple of PGDIR_SIZE.
         */
        for (g = 0, seq = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
                if (PTRS_PER_PUD > 1) /* not folded */
                        seq = pgd_walk_set_prot(virt_to_page(pud), flags, cpu, seq);
                for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                        if (pud_none(*pud))
                                continue;
                        pmd = pmd_offset(pud, 0);
                        if (PTRS_PER_PMD > 1) /* not folded */
                                seq = pgd_walk_set_prot(virt_to_page(pmd), flags, cpu, seq);
                        for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
                                if (pmd_none(*pmd))
                                        continue;
                                seq = pgd_walk_set_prot(pmd_page(*pmd), flags, cpu, seq);
                        }
                }
        }

#ifdef CONFIG_X86_PAE
        for (; g < PTRS_PER_PGD; g++, pgd++) {
                BUG_ON(pgd_none(*pgd));
                pud = pud_offset(pgd, 0);
                BUG_ON(pud_none(*pud));
                pmd = pmd_offset(pud, 0);
                seq = pgd_walk_set_prot(virt_to_page(pmd), flags, cpu, seq);
        }
#endif

        mcl = per_cpu(pb_mcl, cpu);
#ifdef CONFIG_X86_64
        if (unlikely(seq > PIN_BATCH - 2)) {
                if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
                        BUG();
                seq = 0;
        }
        pgd = __user_pgd(pgd_base);
        BUG_ON(!pgd);
        MULTI_update_va_mapping(mcl + seq,
               (unsigned long)pgd,
               pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, flags),
               0);
        MULTI_update_va_mapping(mcl + seq + 1,
               (unsigned long)pgd_base,
               pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
               UVMF_TLB_FLUSH);
        if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
                BUG();
#else
        if (likely(seq != 0)) {
                MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
                        (unsigned long)pgd_base,
                        pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
                        UVMF_TLB_FLUSH);
                if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
                                                        seq + 1, NULL)))
                        BUG();
        } else if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
                        pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
                        UVMF_TLB_FLUSH))
                BUG();
#endif

        put_cpu();
}

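/*
 * Mark the page-table pages of init_mm as pinned, skipping the
 * hypervisor-owned range of the address space.  On x86-64 the user-mode
 * level3 table is marked as well.
 */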
void __init xen_init_pgd_pin(void)
{
        pgd_t       *pgd = init_mm.pgd;
        pud_t       *pud;
        pmd_t       *pmd;
        unsigned int g, u, m;

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return;

        SetPagePinned(virt_to_page(pgd));
        for (g = 0; g < PTRS_PER_PGD; g++, pgd++) {
#ifndef CONFIG_X86_PAE
                if (g >= pgd_index(HYPERVISOR_VIRT_START)
                    && g <= pgd_index(HYPERVISOR_VIRT_END - 1))
                        continue;
#endif
                if (!pgd_present(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
                if (PTRS_PER_PUD > 1) /* not folded */
                        SetPagePinned(virt_to_page(pud));
                for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                        if (!pud_present(*pud) || pud_large(*pud))
                                continue;
                        pmd = pmd_offset(pud, 0);
                        if (PTRS_PER_PMD > 1) /* not folded */
                                SetPagePinned(virt_to_page(pmd));
                        for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
#ifdef CONFIG_X86_PAE
                                if (g == pgd_index(HYPERVISOR_VIRT_START)
                                    && m >= pmd_index(HYPERVISOR_VIRT_START))
                                        continue;
#endif
                                if (!pmd_present(*pmd) || pmd_large(*pmd))
                                        continue;
                                SetPagePinned(pmd_page(*pmd));
                        }
                }
        }
#ifdef CONFIG_X86_64
        SetPagePinned(virt_to_page(level3_user_pgt));
#endif
}

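/*
 * Pin a page-table tree: write-protect every constituent page via
 * pgd_walk(), drop unused highmem kmaps, and hand the pgd to the
 * hypervisor for pinning (xen_pgd_pin()).  __pgd_unpin() is the exact
 * reverse.
 */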
static void __pgd_pin(pgd_t *pgd)
{
        pgd_walk(pgd, PAGE_KERNEL_RO);
        kmap_flush_unused();
        xen_pgd_pin(pgd);
        SetPagePinned(virt_to_page(pgd));
}

static void __pgd_unpin(pgd_t *pgd)
{
        xen_pgd_unpin(pgd);
        pgd_walk(pgd, PAGE_KERNEL);
        ClearPagePinned(virt_to_page(pgd));
}

static void pgd_test_and_unpin(pgd_t *pgd)
{
        if (PagePinned(virt_to_page(pgd)))
                __pgd_unpin(pgd);
}

void mm_pin(struct mm_struct *mm)
{
        if (xen_feature(XENFEAT_writable_page_tables))
                return;

        pin_lock(mm);
        __pgd_pin(mm->pgd);
        pin_unlock(mm);
}

void mm_unpin(struct mm_struct *mm)
{
        if (xen_feature(XENFEAT_writable_page_tables))
                return;

        pin_lock(mm);
        __pgd_unpin(mm->pgd);
        pin_unlock(mm);
}

void mm_pin_all(void)
{
        struct page *page;

        if (xen_feature(XENFEAT_writable_page_tables))
                return;

        /*
         * Allow uninterrupted access to the pgd_list. Also protects
         * __pgd_pin() by ensuring preemption is disabled.
         * All other CPUs must be at a safe point (e.g., in stop_machine
         * or offlined entirely).
         */
        BUG_ON(!irqs_disabled());
        spin_lock(&pgd_lock);
        list_for_each_entry(page, &pgd_list, lru) {
                if (!PagePinned(page))
                        __pgd_pin((pgd_t *)page_address(page));
        }
        spin_unlock(&pgd_lock);
}

void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        if (!PagePinned(virt_to_page(mm->pgd)))
                mm_pin(mm);
}

/*
 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() *much*
 * faster this way, as no hypercalls are needed for the page table updates.
 */
static void leave_active_mm(struct task_struct *tsk, struct mm_struct *mm)
        __releases(tsk->alloc_lock)
{
        if (tsk->active_mm == mm) {
                tsk->active_mm = &init_mm;
                atomic_inc(&init_mm.mm_count);

                switch_mm(mm, &init_mm, tsk);

                if (atomic_dec_and_test(&mm->mm_count))
                        BUG();
        }

        task_unlock(tsk);
}

static void _leave_active_mm(void *mm)
{
        struct task_struct *tsk = current;

        if (spin_trylock(&tsk->alloc_lock))
                leave_active_mm(tsk, mm);
}

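/*
 * Called when an mm is being torn down: force every CPU still running
 * on this mm over to init_mm, then unpin the pgd if nothing else holds
 * a reference and no foreign mappings are present, so the remaining
 * teardown can modify the page tables without hypercalls.
 */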
void arch_exit_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        task_lock(tsk);
        leave_active_mm(tsk, mm);

        preempt_disable();
        smp_call_function_many(mm_cpumask(mm), _leave_active_mm, mm, 1);
        preempt_enable();

        if (PagePinned(virt_to_page(mm->pgd))
            && atomic_read(&mm->mm_count) == 1
            && !mm->context.has_foreign_mappings)
                mm_unpin(mm);
}

static inline void pgd_list_add(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_add(&page->lru, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_del(&page->lru);
}

#define UNSHARED_PTRS_PER_PGD                           \
        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)

static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
        virt_to_page(pgd)->index = (pgoff_t)mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return (struct mm_struct *)page->index;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_test_and_unpin(pgd);

        /* If the pgd points to a shared pagetable level (either the
           ptes in non-PAE, or shared PMD in PAE), then just copy the
           references from swapper_pg_dir. */
        if (PAGETABLE_LEVELS == 2 ||
            (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
            PAGETABLE_LEVELS == 4) {
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);
        }

#ifdef CONFIG_X86_64
        /* set level3_user_pgt for vsyscall area */
        __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
                __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
#endif

        /* list required to sync kernel mapping updates */
        if (!SHARED_KERNEL_PMD) {
                pgd_set_mm(pgd, mm);
                pgd_list_add(pgd);
        }
}

static void pgd_dtor(pgd_t *pgd)
{
        if (!SHARED_KERNEL_PMD) {
                spin_lock(&pgd_lock);
                pgd_list_del(pgd);
                spin_unlock(&pgd_lock);
        }

        pgd_test_and_unpin(pgd);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- wli
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
#define PREALLOCATED_PMDS       UNSHARED_PTRS_PER_PGD

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        pud_t pud = __pud(__pa(pmd) | _PAGE_PRESENT);

        paravirt_alloc_pmd(mm, page_to_pfn(virt_to_page(pmd)));

        if (likely(!PagePinned(virt_to_page(pudp)))) {
                *pudp = pud;
                return;
        }

        set_pud(pudp, pud);

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS       0

#endif  /* CONFIG_X86_PAE */

static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
{
        int i;

#ifdef CONFIG_X86_PAE
        if (contig)
                xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
#endif

        for (i = 0; i < PREALLOCATED_PMDS; i++)
                if (pmds[i])
                        pmd_free(mm, pmds[i]);
}

static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
{
        int i;
        bool failed = false;

        for (i = 0; i < PREALLOCATED_PMDS; i++) {
                pmd_t *pmd = pmd_alloc_one(mm, i << PUD_SHIFT);
                if (pmd == NULL)
                        failed = true;
                pmds[i] = pmd;
        }

        if (failed) {
                free_pmds(pmds, mm, false);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++) {
                pgd_t pgd = pgdp[i];

                if (__pgd_val(pgd) != 0) {
                        pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

                        pgdp[i] = xen_make_pgd(0);

                        paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
                        pmd_free(mm, pmd);
                }
        }

#ifdef CONFIG_X86_PAE
        if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
                xen_destroy_contiguous_region((unsigned long)pgdp, 0);
#endif
}

static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        pud_t *pud;
        unsigned long addr;
        int i;

        if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
                return;

        pud = pud_offset(pgd, 0);
        for (addr = i = 0; i < PREALLOCATED_PMDS;
             i++, pud++, addr += PUD_SIZE) {
                pmd_t *pmd = pmds[i];

                if (i >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd,
                               (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                /* It is safe to poke machine addresses of pmds under the pgd_lock. */
                pud_populate(mm, pud, pmd);
        }
}

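/*
 * On x86-64 every pgd needs a second, user-mode pgd page alongside the
 * kernel one.  It is allocated here and stored in the kernel pgd page's
 * page_private field; if the allocation fails the kernel pgd is freed
 * and NULL is returned.  On 32-bit both helpers are no-ops.
 */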
static inline pgd_t *user_pgd_alloc(pgd_t *pgd)
{
#ifdef CONFIG_X86_64
        if (pgd) {
                pgd_t *upgd = (void *)__get_free_page(PGALLOC_GFP);

                if (upgd)
                        set_page_private(virt_to_page(pgd),
                                         (unsigned long)upgd);
                else {
                        free_page((unsigned long)pgd);
                        pgd = NULL;
                }
        }
#endif
        return pgd;
}

static inline void user_pgd_free(pgd_t *pgd)
{
#ifdef CONFIG_X86_64
        free_page(page_private(virt_to_page(pgd)));
#endif
}

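/*
 * Allocate and initialise a new pgd: allocate the kernel (and, on
 * x86-64, the user) pgd page, preallocate PAE pmds if required, then,
 * under pgd_lock, move a PAE pgd below 4GB when the hypervisor lacks
 * XENFEAT_pae_pgdir_above_4gb and construct/prepopulate the pgd so that
 * walkers of pgd_list never see it half-built.
 */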
pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *pmds[PREALLOCATED_PMDS];

        pgd = user_pgd_alloc((void *)__get_free_page(PGALLOC_GFP));

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        if (preallocate_pmds(pmds, mm) != 0)
                goto out_free_pgd;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

#ifdef CONFIG_X86_PAE
        /* Protect against save/restore: move below 4GB under pgd_lock. */
        if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
            && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
                spin_unlock(&pgd_lock);
                goto out_free_pmds;
        }
#endif

        pgd_ctor(mm, pgd);
        pgd_prepopulate_pmd(mm, pgd, pmds);

        spin_unlock(&pgd_lock);

        return pgd;

out_free_pmds:
        free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
out_free_pgd:
        user_pgd_free(pgd);
        free_page((unsigned long)pgd);
out:
        return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        /*
         * After this the pgd should not be pinned for the duration of this
         * function's execution. We should never sleep and thus never race:
         *  1. User pmds will not become write-protected under our feet due
         *     to a concurrent mm_pin_all().
         *  2. The machine addresses in PGD entries will not become invalid
         *     due to a concurrent save/restore.
         */
        pgd_dtor(pgd);

        pgd_mop_up_pmds(mm, pgd);
        paravirt_pgd_free(mm, pgd);
        user_pgd_free(pgd);
        free_page((unsigned long)pgd);
}

/* blktap and gntdev need this, as otherwise they would implicitly (and
 * needlessly, as they never use it) reference init_mm. */
pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
                                  unsigned long addr, pte_t *ptep, int full)
{
        return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm,
                                       addr, ptep, full);
}
EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);

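/*
 * Set access/dirty flags on a pte.  For the current mm the new entry is
 * installed with a single update_va_mapping hypercall that also
 * invalidates the address on the CPUs using the mm; otherwise the entry
 * is updated via xen_l1_entry_update() followed by a flush_tlb_page().
 */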
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        int changed = !pte_same(*ptep, entry);

        if (changed && dirty) {
                if (likely(vma->vm_mm == current->mm)) {
                        if (HYPERVISOR_update_va_mapping(address,
                                entry,
                                uvm_multi(mm_cpumask(vma->vm_mm))|UVMF_INVLPG))
                                BUG();
                } else {
                        xen_l1_entry_update(ptep, entry);
                        flush_tlb_page(vma, address);
                }
        }

        return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (changed && dirty) {
                *pmdp = entry;
                pmd_update_defer(vma->vm_mm, address, pmdp);
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        }

        return changed;
}
#endif

int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        int ret = 0;

        if (pte_young(*ptep))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *) &ptep->pte);

        if (ret)
                pte_update(vma->vm_mm, addr, ptep);

        return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        int ret = 0;

        if (pmd_young(*pmdp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pmdp);

        if (ret)
                pmd_update(vma->vm_mm, addr, pmdp);

        return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        pte_t pte = *ptep;
        int young = pte_young(pte);

        pte = pte_mkold(pte);
        if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
                ptep_set_access_flags(vma, address, ptep, pte, young);
        else if (young)
                ptep->pte_low = pte.pte_low;

        return young;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return young;
}

void pmdp_splitting_flush(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp)
{
        int set;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
                                (unsigned long *)pmdp);
        if (set) {
                pmd_update(vma->vm_mm, address, pmdp);
                /* need tlb flush only to serialize against gup-fast */
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        }
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        BUG_ON(fixmaps_set > 0);
        printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
               (int)-reserve);
        __FIXADDR_TOP = -reserve - PAGE_SIZE;
#endif
}

int fixmaps_set;

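/*
 * Install a fixmap entry.  On 64-bit the vsyscall/VVAR pages are entered
 * into the user-visible level3_user_pgt and a few early fixmaps are
 * written straight into level1_fixmap_pgt via xen_l1_entry_update();
 * everything else goes through set_pte_vaddr(), with the default case
 * treating 'phys' as a machine address (pfn_pte_ma()).
 */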
void xen_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
{
        unsigned long address = __fix_to_virt(idx);
        pte_t pte;

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }

        switch (idx) {
#ifdef CONFIG_X86_64
        extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];

        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
        case VVAR_PAGE:
                pte = pfn_pte(phys >> PAGE_SHIFT, flags);
                set_pte_vaddr_pud(level3_user_pgt, address, pte);
                break;
        case FIX_EARLYCON_MEM_BASE:
        case FIX_SHARED_INFO:
        case FIX_ISAMAP_END ... FIX_ISAMAP_BEGIN:
                xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
                                    pfn_pte_ma(phys >> PAGE_SHIFT, flags));
                fixmaps_set++;
                return;
#else
        case FIX_WP_TEST:
        case FIX_VDSO:
                pte = pfn_pte(phys >> PAGE_SHIFT, flags);
                break;
#endif
        default:
                pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
                break;
        }
        set_pte_vaddr(address, pte);
        fixmaps_set++;
}