arch/x86/mm/init_64-xen.c
1 /*
2  *  linux/arch/x86_64/mm/init.c
3  *
4  *  Copyright (C) 1995  Linus Torvalds
5  *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
6  *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7  *
8  *  Jun Nakajima <jun.nakajima@intel.com>
9  *      Modified for Xen.
10  */
11
12 #include <linux/signal.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/errno.h>
16 #include <linux/string.h>
17 #include <linux/types.h>
18 #include <linux/ptrace.h>
19 #include <linux/mman.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/smp.h>
23 #include <linux/init.h>
24 #include <linux/initrd.h>
25 #include <linux/pagemap.h>
26 #include <linux/bootmem.h>
27 #include <linux/memblock.h>
28 #include <linux/proc_fs.h>
29 #include <linux/pci.h>
30 #include <linux/pfn.h>
31 #include <linux/poison.h>
32 #include <linux/dma-mapping.h>
33 #include <linux/module.h>
34 #include <linux/memory.h>
35 #include <linux/memory_hotplug.h>
36 #include <linux/nmi.h>
37 #include <linux/gfp.h>
38
39 #include <asm/processor.h>
40 #include <asm/bios_ebda.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/pgalloc.h>
44 #include <asm/dma.h>
45 #include <asm/fixmap.h>
46 #include <asm/e820.h>
47 #include <asm/apic.h>
48 #include <asm/tlb.h>
49 #include <asm/mmu_context.h>
50 #include <asm/proto.h>
51 #include <asm/smp.h>
52 #include <asm/sections.h>
53 #include <asm/kdebug.h>
54 #include <asm/numa.h>
55 #include <asm/cacheflush.h>
56 #include <asm/init.h>
57 #include <asm/setup.h>
58
59 #include <xen/features.h>
60
61 #if CONFIG_XEN_COMPAT <= 0x030002
62 unsigned int __kernel_page_user;
63 EXPORT_SYMBOL(__kernel_page_user);
64 #endif
65
66 extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
67 extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
68
69 /*
70  * Use this until direct mapping is established, i.e. before __va() is
71  * available in init_memory_mapping().
72  */
73
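/*
 * addr_to_page() takes a page-table entry value (a machine address under
 * Xen), masks off the flag bits, converts the MFN to a PFN and yields the
 * __START_KERNEL_map-based virtual address of the table page it references.
 * Note it is a statement-like macro that assigns to both of its arguments.
 */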
74 #define addr_to_page(addr, page)                                \
75         (addr) &= PHYSICAL_PAGE_MASK;                           \
76         (page) = ((unsigned long *) ((unsigned long)            \
77         (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
78         __START_KERNEL_map)))
79
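/*
 * Walk the boot page tables (init_level4_pgt), following the machine
 * addresses stored in the entries, and return a pointer to the PMD entry
 * covering va.
 */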
80 pmd_t *__init early_get_pmd(unsigned long va)
81 {
82         unsigned long addr;
83         unsigned long *page = (unsigned long *)init_level4_pgt;
84
85         addr = page[pgd_index(va)];
86         addr_to_page(addr, page);
87
88         addr = page[pud_index(va)];
89         addr_to_page(addr, page);
90
91         return (pmd_t *)&page[pmd_index(va)];
92 }
93
94 void __meminit early_make_page_readonly(void *va, unsigned int feature)
95 {
96         unsigned long addr, _va = (unsigned long)va;
97         pte_t pte, *ptep;
98         unsigned long *page = (unsigned long *) init_level4_pgt;
99
100         BUG_ON(after_bootmem);
101
102         if (xen_feature(feature))
103                 return;
104
105         addr = (unsigned long) page[pgd_index(_va)];
106         addr_to_page(addr, page);
107
108         addr = page[pud_index(_va)];
109         addr_to_page(addr, page);
110
111         addr = page[pmd_index(_va)];
112         addr_to_page(addr, page);
113
114         ptep = (pte_t *) &page[pte_index(_va)];
115
116         pte.pte = ptep->pte & ~_PAGE_RW;
117         if (HYPERVISOR_update_va_mapping(_va, pte, 0))
118                 BUG();
119 }
120
121 unsigned long __init early_arbitrary_virt_to_mfn(void *v)
122 {
123         unsigned long va = (unsigned long)v, addr, *page;
124
125         BUG_ON(va < __START_KERNEL_map);
126
127         page = (void *)(xen_read_cr3() + __START_KERNEL_map);
128
129         addr = page[pgd_index(va)];
130         addr_to_page(addr, page);
131
132         addr = page[pud_index(va)];
133         addr_to_page(addr, page);
134
135         addr = page[pmd_index(va)];
136         addr_to_page(addr, page);
137
138         return (page[pte_index(va)] & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT;
139 }
140
141 #ifndef CONFIG_XEN
142 static int __init parse_direct_gbpages_off(char *arg)
143 {
144         direct_gbpages = 0;
145         return 0;
146 }
147 early_param("nogbpages", parse_direct_gbpages_off);
148
149 static int __init parse_direct_gbpages_on(char *arg)
150 {
151         direct_gbpages = 1;
152         return 0;
153 }
154 early_param("gbpages", parse_direct_gbpages_on);
155 #endif
156
157 /*
158  * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
159  * in physical space, so we can cache the place of the first one and move
160  * around without checking the pgd every time.
161  */
162
163 pteval_t __supported_pte_mask __read_mostly = ~0UL;
164 EXPORT_SYMBOL_GPL(__supported_pte_mask);
165
166 int force_personality32;
167
168 /*
169  * noexec32=on|off
170  * Control non executable heap for 32bit processes.
171  * To control the stack too use noexec=off
172  *
173  * on   PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
174  * off  PROT_READ implies PROT_EXEC
175  */
176 static int __init nonx32_setup(char *str)
177 {
178         if (!strcmp(str, "on"))
179                 force_personality32 &= ~READ_IMPLIES_EXEC;
180         else if (!strcmp(str, "off"))
181                 force_personality32 |= READ_IMPLIES_EXEC;
182         return 1;
183 }
184 __setup("noexec32=", nonx32_setup);
185
186 /*
187  * When memory is added or removed, make sure all the processes' MMs have
188  * suitable PGD entries in their local PGD-level page.
189  */
190 void sync_global_pgds(unsigned long start, unsigned long end)
191 {
192         unsigned long address;
193
194         for (address = start; address <= end; address += PGDIR_SIZE) {
195                 const pgd_t *pgd_ref = pgd_offset_k(address);
196                 struct page *page;
197
198                 if (pgd_none(*pgd_ref))
199                         continue;
200
201                 spin_lock(&pgd_lock);
202                 list_for_each_entry(page, &pgd_list, lru) {
203                         pgd_t *pgd;
204                         spinlock_t *pgt_lock;
205
206                         pgd = (pgd_t *)page_address(page) + pgd_index(address);
207                         /* the pgt_lock is only needed for Xen */
208                         pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
209                         spin_lock(pgt_lock);
210
211                         if (pgd_none(*pgd))
212                                 set_pgd(pgd, *pgd_ref);
213                         else
214                                 BUG_ON(pgd_page_vaddr(*pgd)
215                                        != pgd_page_vaddr(*pgd_ref));
216
217                         spin_unlock(pgt_lock);
218                 }
219                 spin_unlock(&pgd_lock);
220         }
221 }
222
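/*
 * PFN ranges that the early page-table allocator (pgt_buf_*) must not hand
 * out.  reserve_pfn_range() keeps the array sorted by PFN, BUGs on overlap
 * or when the array is full, and also marks the range reserved in memblock.
 * reserve_pgtable_low() and get_table_end() skip over these ranges when
 * carving page-table pages out of the buffer.
 */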
223 static struct reserved_pfn_range {
224         unsigned long pfn, nr;
225 } reserved_pfn_ranges[3] __meminitdata;
226
227 void __init reserve_pfn_range(unsigned long pfn, unsigned long nr)
228 {
229         unsigned int i;
230
231         for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
232                 struct reserved_pfn_range *range = reserved_pfn_ranges + i;
233
234                 if (!range->nr) {
235                         range->pfn = pfn;
236                         range->nr = nr;
237                         break;
238                 }
239                 BUG_ON(range->pfn < pfn + nr && pfn < range->pfn + range->nr);
240                 if (range->pfn > pfn) {
241                         i = ARRAY_SIZE(reserved_pfn_ranges) - 1;
242                         if (reserved_pfn_ranges[i].nr)
243                                 continue;
244                         for (; reserved_pfn_ranges + i > range; --i)
245                                 reserved_pfn_ranges[i]
246                                          = reserved_pfn_ranges[i - 1];
247                         range->pfn = pfn;
248                         range->nr = nr;
249                         break;
250                 }
251         }
252         BUG_ON(i >= ARRAY_SIZE(reserved_pfn_ranges));
253         memblock_reserve(PFN_PHYS(pfn), PFN_PHYS(nr));
254 }
255
256 void __init reserve_pgtable_low(void)
257 {
258         unsigned int i;
259
260         for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
261                 struct reserved_pfn_range *range = reserved_pfn_ranges + i;
262
263                 if (!range->nr)
264                         break;
265                 if (pgt_buf_start <= range->pfn && pgt_buf_top > range->pfn) {
266                         x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
267                                         PFN_PHYS(range->pfn));
268                         pgt_buf_start = range->pfn + range->nr;
269                 }
270         }
271 }
272
273 static __init unsigned long get_table_end(void)
274 {
275         unsigned int i;
276
277         BUG_ON(!pgt_buf_end);
278         for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
279                 struct reserved_pfn_range *range = reserved_pfn_ranges + i;
280
281                 if (!range->nr)
282                         break;
283                 if (pgt_buf_end == range->pfn) {
284                         pgt_buf_end += range->nr;
285                         pgt_buf_top += range->nr;
286                 }
287         }
288         return pgt_buf_end++;
289 }
290
291 /*
292  * NOTE: This function is marked __ref because it calls the __init function
293  * alloc_bootmem_pages(). Doing so is safe ONLY while after_bootmem == 0.
294  */
295 static __ref void *spp_getpage(void)
296 {
297         void *ptr;
298
299         if (after_bootmem)
300                 ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
301         else if (pgt_buf_end < pgt_buf_top) {
302                 ptr = __va(get_table_end() << PAGE_SHIFT);
303                 clear_page(ptr);
304         } else
305                 ptr = alloc_bootmem_pages(PAGE_SIZE);
306
307         if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
308                 panic("set_pte_phys: cannot allocate page data %s\n",
309                         after_bootmem ? "after bootmem" : "");
310         }
311
312         pr_debug("spp_getpage %p\n", ptr);
313
314         return ptr;
315 }
316
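/*
 * fill_pud()/fill_pmd()/fill_pte() allocate a missing lower page-table level
 * via spp_getpage() and hook it up.  New page-table pages are made read-only
 * where Xen requires it, and during early boot the upper-level entries are
 * installed through the xen_lN_entry_update() interfaces instead of the
 * generic *_populate() helpers.
 */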
317 static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
318 {
319         if (pgd_none(*pgd)) {
320                 pud_t *pud = (pud_t *)spp_getpage();
321                 if (!after_bootmem) {
322                         make_page_readonly(pud, XENFEAT_writable_page_tables);
323                         xen_l4_entry_update(pgd, __pgd(__pa(pud) | _PAGE_TABLE));
324                 } else
325                         pgd_populate(&init_mm, pgd, pud);
326                 if (pud != pud_offset(pgd, 0))
327                         printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
328                                pud, pud_offset(pgd, 0));
329         }
330         return pud_offset(pgd, vaddr);
331 }
332
333 static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
334 {
335         if (pud_none(*pud)) {
336                 pmd_t *pmd = (pmd_t *) spp_getpage();
337                 if (!after_bootmem) {
338                         make_page_readonly(pmd, XENFEAT_writable_page_tables);
339                         xen_l3_entry_update(pud, __pud(__pa(pmd) | _PAGE_TABLE));
340                 } else
341                         pud_populate(&init_mm, pud, pmd);
342                 if (pmd != pmd_offset(pud, 0))
343                         printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
344                                pmd, pmd_offset(pud, 0));
345         }
346         return pmd_offset(pud, vaddr);
347 }
348
349 static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
350 {
351         if (pmd_none(*pmd)) {
352                 pte_t *pte = (pte_t *) spp_getpage();
353                 make_page_readonly(pte, XENFEAT_writable_page_tables);
354                 pmd_populate_kernel(&init_mm, pmd, pte);
355                 if (pte != pte_offset_kernel(pmd, 0))
356                         printk(KERN_ERR "PAGETABLE BUG #02!\n");
357         }
358         return pte_offset_kernel(pmd, vaddr);
359 }
360
361 void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
362 {
363         pud_t *pud;
364         pmd_t *pmd;
365         pte_t *pte;
366
367         pud = pud_page + pud_index(vaddr);
368         pmd = fill_pmd(pud, vaddr);
369         pte = fill_pte(pmd, vaddr);
370
371         set_pte(pte, new_pte);
372
373         /*
374          * It's enough to flush this one mapping.
375          * (PGE mappings get flushed as well)
376          */
377         __flush_tlb_one(vaddr);
378 }
379
380 void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
381 {
382         pgd_t *pgd;
383         pud_t *pud_page;
384
385         pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
386
387         pgd = pgd_offset_k(vaddr);
388         if (pgd_none(*pgd)) {
389                 printk(KERN_ERR
390                         "PGD FIXMAP MISSING, it should be setup in head.S!\n");
391                 return;
392         }
393         pud_page = (pud_t*)pgd_page_vaddr(*pgd);
394         set_pte_vaddr_pud(pud_page, vaddr, pteval);
395 }
396
397 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
398 {
399         pgd_t *pgd;
400         pud_t *pud;
401
402         pgd = pgd_offset_k(vaddr);
403         pud = fill_pud(pgd, vaddr);
404         return fill_pmd(pud, vaddr);
405 }
406
407 pte_t * __init populate_extra_pte(unsigned long vaddr)
408 {
409         pmd_t *pmd;
410
411         pmd = populate_extra_pmd(vaddr);
412         return fill_pte(pmd, vaddr);
413 }
414
415 #ifndef CONFIG_XEN
416 /*
417  * Create large page table mappings for a range of physical addresses.
418  */
419 static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
420                                                 pgprot_t prot)
421 {
422         pgd_t *pgd;
423         pud_t *pud;
424         pmd_t *pmd;
425
426         BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
427         for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
428                 pgd = pgd_offset_k((unsigned long)__va(phys));
429                 if (pgd_none(*pgd)) {
430                         pud = (pud_t *) spp_getpage();
431                         set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
432                                                 _PAGE_USER));
433                 }
434                 pud = pud_offset(pgd, (unsigned long)__va(phys));
435                 if (pud_none(*pud)) {
436                         pmd = (pmd_t *) spp_getpage();
437                         set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
438                                                 _PAGE_USER));
439                 }
440                 pmd = pmd_offset(pud, phys);
441                 BUG_ON(!pmd_none(*pmd));
442                 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
443         }
444 }
445
446 void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
447 {
448         __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
449 }
450
451 void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
452 {
453         __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
454 }
455
456 /*
457  * The head.S code sets up the kernel high mapping:
458  *
459  *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
460  *
461  * phys_addr holds the negative offset to the kernel, which is added
462  * to the compile time generated pmds. This results in invalid pmds up
463  * to the point where we hit the physaddr 0 mapping.
464  *
465  * We limit the mappings to the region from _text to _brk_end.  _brk_end
466  * is rounded up to the 2MB boundary. This catches the invalid pmds as
467  * well, as they are located before _text:
468  */
469 void __init cleanup_highmap(void)
470 {
471         unsigned long vaddr = __START_KERNEL_map;
472         unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
473         unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
474         pmd_t *pmd = level2_kernel_pgt;
475
476         for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
477                 if (pmd_none(*pmd))
478                         continue;
479                 if (vaddr < (unsigned long) _text || vaddr > end)
480                         set_pmd(pmd, __pmd(0));
481         }
482 }
483 #endif
484
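/*
 * alloc_low_page() returns a zeroed page for page-table use: from the
 * pgt_buf_* range (accessed via early_memremap(), since the direct mapping
 * may not cover it yet) while still in early boot, or from the page
 * allocator once after_bootmem is set.  map_low_page()/unmap_low_page()
 * give temporary access to an existing page-table page in the same way.
 */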
485 static __ref void *alloc_low_page(unsigned long *phys)
486 {
487         unsigned long pfn;
488         void *adr;
489
490         if (after_bootmem) {
491                 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
492                 *phys = __pa(adr);
493
494                 return adr;
495         }
496
497         pfn = get_table_end();
498         if (pfn >= pgt_buf_top)
499                 panic("alloc_low_page: ran out of memory");
500
501         adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
502         clear_page(adr);
503         *phys  = pfn * PAGE_SIZE;
504         return adr;
505 }
506
507 static __ref void *map_low_page(void *virt)
508 {
509         void *adr;
510         unsigned long phys, left;
511
512         if (after_bootmem)
513                 return virt;
514
515         phys = __pa(virt);
516         left = phys & (PAGE_SIZE - 1);
517         adr = early_memremap_ro(phys & PAGE_MASK, PAGE_SIZE);
518         adr = (void *)(((unsigned long)adr) | left);
519
520         return adr;
521 }
522
523 static __ref void unmap_low_page(void *adr)
524 {
525         if (after_bootmem)
526                 return;
527
528         early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
529 }
530
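/*
 * Decide whether the direct-mapping PTE for paddr must be read-only under
 * Xen: the page-table pages being built right now (the pgt_buf range, minus
 * the reserved ranges that get_table_end() skips), the boot-time page
 * tables, the P->M table, and the kernel image itself all have to be mapped
 * read-only.
 */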
531 static inline int __meminit make_readonly(unsigned long paddr)
532 {
533         int readonly = 0;
534
535         /* Make new page tables read-only on the first pass. */
536         if (!xen_feature(XENFEAT_writable_page_tables)
537             && !max_pfn_mapped
538             && (paddr >= (pgt_buf_start << PAGE_SHIFT))) {
539                 unsigned long top = pgt_buf_top;
540                 unsigned int i;
541
542                 /* Account for the ranges get_table_end() skips. */
543                 for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
544                         const struct reserved_pfn_range *range;
545
546                         range = reserved_pfn_ranges + i;
547                         if (!range->nr)
548                                 continue;
549                         if (pgt_buf_end <= range->pfn && top > range->pfn) {
550                                 if (paddr > (range->pfn << PAGE_SHIFT)
551                                     && paddr < ((range->pfn + range->nr)
552                                                 << PAGE_SHIFT))
553                                         break;
554                                 top += range->nr;
555                         }
556                 }
557                 if (paddr < (top << PAGE_SHIFT))
558                         readonly = (i >= ARRAY_SIZE(reserved_pfn_ranges));
559         }
560         /* Make old page tables read-only. */
561         if (!xen_feature(XENFEAT_writable_page_tables)
562             && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
563             && (paddr < (pgt_buf_end << PAGE_SHIFT)))
564                 readonly = 1;
565         /* Make P->M table (and its page tables) read-only. */
566         if (!xen_feature(XENFEAT_writable_page_tables)
567             && xen_start_info->mfn_list < __START_KERNEL_map
568             && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
569             && paddr < (xen_start_info->first_p2m_pfn
570                         + xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
571                 readonly = 1;
572
573         /*
574          * No need for writable mapping of kernel image. This also ensures that
575          * page and descriptor tables embedded inside don't have writable
576          * mappings. The range must be in sync with that passed to
577          * reserve_early() (as "TEXT DATA BSS"), since all other regions can be
578          * allocated from under CONFIG_NO_BOOTMEM and thus must be writable.
579          */
580         if ((paddr >= __pa_symbol(&_text))
581             && (paddr < (__pa_symbol(__bss_stop) & PAGE_MASK)))
582                 readonly = 1;
583
584         return readonly;
585 }
586
587 static unsigned long __meminit
588 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
589               pgprot_t prot)
590 {
591         unsigned pages = 0;
592         unsigned long last_map_addr = end;
593         int i;
594
595         pte_t *pte = pte_page + pte_index(addr);
596
597         for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
598                 unsigned long pteval = addr | pgprot_val(prot);
599
600                 if (addr >= end ||
601                     (!after_bootmem &&
602                      (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
603                         break;
604
605                 /*
606                  * We will re-use the existing mapping.
607                  * Xen for example has some special requirements, like mapping
608                  * pagetable pages as RO. So assume that whoever pre-set up
609                  * these mappings knew what they were doing.
610                  */
611                 if (__pte_val(*pte)) {
612                         pages++;
613                         continue;
614                 }
615
616                 if (make_readonly(addr))
617                         pteval &= ~_PAGE_RW;
618                 if (0)
619                         printk("   pte=%p addr=%lx pte=%016lx\n",
620                                pte, addr, pteval);
621                 pages++;
622                 if (!after_bootmem)
623                         *pte = __pte(pteval & __supported_pte_mask);
624                 else
625                         set_pte(pte, __pte(pteval & __supported_pte_mask));
626                 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
627         }
628
629         update_page_count(PG_LEVEL_4K, pages);
630
631         return last_map_addr;
632 }
633
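/*
 * The (1 << PG_LEVEL_NUM) bit in page_size_mask is (ab)used as an internal
 * flag: it tells phys_pmd_init()/phys_pud_init() that the table being
 * filled is a pre-existing one, already hooked into the live (read-only)
 * page-table tree, so a newly allocated lower-level table must be installed
 * into it via HYPERVISOR_mmu_update() rather than a direct store.  The flag
 * is cleared when recursing into a table we allocated ourselves, which is
 * still writable.
 */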
634 static unsigned long __meminit
635 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
636               unsigned long page_size_mask, pgprot_t prot)
637 {
638         unsigned long pages = 0;
639         unsigned long last_map_addr = end;
640
641         int i = pmd_index(address);
642
643         for (; i < PTRS_PER_PMD; i++, address = (address & PMD_MASK) + PMD_SIZE) {
644                 unsigned long pte_phys;
645                 pmd_t *pmd = pmd_page + pmd_index(address);
646                 pte_t *pte;
647                 pgprot_t new_prot = prot;
648
649                 if (address >= end)
650                         break;
651
652                 if (__pmd_val(*pmd)) {
653                         if (!pmd_large(*pmd)) {
654                                 spin_lock(&init_mm.page_table_lock);
655                                 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
656                                 last_map_addr = phys_pte_init(pte, address,
657                                                                 end, prot);
658                                 unmap_low_page(pte);
659                                 spin_unlock(&init_mm.page_table_lock);
660                                 continue;
661                         }
662                         /*
663                          * If we are ok with PG_LEVEL_2M mapping, then we will
664                          * use the existing mapping,
665                          *
666                          * Otherwise, we will split the large page mapping but
667                          * use the same existing protection bits except for
668                          * large page, so that we don't violate Intel's TLB
669                          * Application note (317080) which says, while changing
670                          * the page sizes, new and old translations should
671                          * not differ with respect to page frame and
672                          * attributes.
673                          */
674                         if (page_size_mask & (1 << PG_LEVEL_2M)) {
675                                 pages++;
676                                 continue;
677                         }
678                         new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
679                 }
680
681                 if (page_size_mask & (1<<PG_LEVEL_2M)) {
682                         pages++;
683                         spin_lock(&init_mm.page_table_lock);
684                         set_pte((pte_t *)pmd,
685                                 pfn_pte(address >> PAGE_SHIFT,
686                                         __pgprot(pgprot_val(prot) | _PAGE_PSE)));
687                         spin_unlock(&init_mm.page_table_lock);
688                         last_map_addr = (address & PMD_MASK) + PMD_SIZE;
689                         continue;
690                 }
691
692                 pte = alloc_low_page(&pte_phys);
693                 last_map_addr = phys_pte_init(pte, address, end, new_prot);
694                 unmap_low_page(pte);
695
696                 if (!after_bootmem) {
697                         if (max_pfn_mapped)
698                                 make_page_readonly(__va(pte_phys),
699                                                    XENFEAT_writable_page_tables);
700                         if (page_size_mask & (1 << PG_LEVEL_NUM)) {
701                                 mmu_update_t u;
702
703                                 u.ptr = arbitrary_virt_to_machine(pmd);
704                                 u.val = phys_to_machine(pte_phys) | _PAGE_TABLE;
705                                 if (HYPERVISOR_mmu_update(&u, 1, NULL,
706                                                           DOMID_SELF) < 0)
707                                         BUG();
708                         } else
709                                 *pmd = __pmd(pte_phys | _PAGE_TABLE);
710                 } else {
711                         spin_lock(&init_mm.page_table_lock);
712                         pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
713                         spin_unlock(&init_mm.page_table_lock);
714                 }
715         }
716         update_page_count(PG_LEVEL_2M, pages);
717         return last_map_addr;
718 }
719
720 static unsigned long __meminit
721 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
722                          unsigned long page_size_mask)
723 {
724         unsigned long pages = 0;
725         unsigned long last_map_addr = end;
726         int i = pud_index(addr);
727
728         for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
729                 unsigned long pmd_phys;
730                 pud_t *pud = pud_page + pud_index(addr);
731                 pmd_t *pmd;
732                 pgprot_t prot = PAGE_KERNEL;
733
734                 if (addr >= end)
735                         break;
736
737                 if (__pud_val(*pud)) {
738                         if (!pud_large(*pud)) {
739                                 pmd = map_low_page(pmd_offset(pud, 0));
740                                 last_map_addr = phys_pmd_init(pmd, addr, end,
741                                         page_size_mask | (1 << PG_LEVEL_NUM),
742                                         prot);
743                                 unmap_low_page(pmd);
744                                 __flush_tlb_all();
745                                 continue;
746                         }
747                         /*
748                          * If we are ok with PG_LEVEL_1G mapping, then we will
749                          * use the existing mapping.
750                          *
751                          * Otherwise, we will split the gbpage mapping but use
752                          * the same existing protection bits except for large
753                          * page, so that we don't violate Intel's TLB
754                          * Application note (317080) which says, while changing
755                          * the page sizes, new and old translations should
756                          * not differ with respect to page frame and
757                          * attributes.
758                          */
759                         if (page_size_mask & (1 << PG_LEVEL_1G)) {
760                                 pages++;
761                                 continue;
762                         }
763                         prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
764                 }
765
766                 if (page_size_mask & (1<<PG_LEVEL_1G)) {
767                         pages++;
768                         spin_lock(&init_mm.page_table_lock);
769                         set_pte((pte_t *)pud,
770                                 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
771                         spin_unlock(&init_mm.page_table_lock);
772                         last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
773                         continue;
774                 }
775
776                 pmd = alloc_low_page(&pmd_phys);
777                 last_map_addr = phys_pmd_init(pmd, addr, end,
778                                               page_size_mask & ~(1 << PG_LEVEL_NUM),
779                                               prot);
780                 unmap_low_page(pmd);
781
782                 if (!after_bootmem) {
783                         if (max_pfn_mapped)
784                                 make_page_readonly(__va(pmd_phys),
785                                                    XENFEAT_writable_page_tables);
786                         if (page_size_mask & (1 << PG_LEVEL_NUM)) {
787                                 mmu_update_t u;
788
789                                 u.ptr = arbitrary_virt_to_machine(pud);
790                                 u.val = phys_to_machine(pmd_phys) | _PAGE_TABLE;
791                                 if (HYPERVISOR_mmu_update(&u, 1, NULL,
792                                                           DOMID_SELF) < 0)
793                                         BUG();
794                         } else
795                                 *pud = __pud(pmd_phys | _PAGE_TABLE);
796                 } else {
797                         spin_lock(&init_mm.page_table_lock);
798                         pud_populate(&init_mm, pud, __va(pmd_phys));
799                         spin_unlock(&init_mm.page_table_lock);
800                 }
801         }
802         __flush_tlb_all();
803
804         update_page_count(PG_LEVEL_1G, pages);
805
806         return last_map_addr;
807 }
808
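/*
 * Build the kernel's own initial page tables.  The hypervisor-provided
 * tables at xen_start_info->pt_base are used to find the existing kernel
 * and fixmap mappings, which are copied into init_level4_pgt and friends;
 * the new tables are then made read-only and, unless the hypervisor gives
 * us writable page tables, pinned.
 */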
809 void __init xen_init_pt(void)
810 {
811         unsigned long addr, *page;
812
813         /* Find the initial pte page that was built for us. */
814         page = (unsigned long *)xen_start_info->pt_base;
815         addr = page[pgd_index(__START_KERNEL_map)];
816         addr_to_page(addr, page);
817
818 #if CONFIG_XEN_COMPAT <= 0x030002
819         /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
820            in kernel PTEs. We check that here. */
821         if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
822                 unsigned long *pg;
823                 pte_t pte;
824
825                 /* Mess with the initial mapping of page 0. It's not needed. */
826                 BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
827                 addr = page[pud_index(__START_KERNEL_map)];
828                 addr_to_page(addr, pg);
829                 addr = pg[pmd_index(__START_KERNEL_map)];
830                 addr_to_page(addr, pg);
831                 pte.pte = pg[pte_index(__START_KERNEL_map)];
832                 BUG_ON(!(pte.pte & _PAGE_PRESENT));
833
834                 /* If _PAGE_USER isn't set, we obviously do not need it. */
835                 if (pte.pte & _PAGE_USER) {
836                         /* _PAGE_USER is needed, but is it set implicitly? */
837                         pte.pte &= ~_PAGE_USER;
838                         if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
839                                                           pte, 0) != 0) ||
840                             !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
841                                 /* We need to explicitly specify _PAGE_USER. */
842                                 __kernel_page_user = _PAGE_USER;
843                 }
844         }
845 #endif
846
847         /* Construct mapping of initial pte page in our own directories. */
848                 init_level4_pgt[pgd_index(__START_KERNEL_map)] =
849                 __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
850         memcpy(level3_kernel_pgt + pud_index(__START_KERNEL_map),
851                page + pud_index(__START_KERNEL_map),
852                (PTRS_PER_PUD - pud_index(__START_KERNEL_map))
853                * sizeof(*level3_kernel_pgt));
854
855         /* Copy the initial P->M table mappings if necessary. */
856         addr = pgd_index(xen_start_info->mfn_list);
857         if (addr < pgd_index(__START_KERNEL_map))
858                 init_level4_pgt[addr] =
859                         ((pgd_t *)xen_start_info->pt_base)[addr];
860
861         /* Do an early initialization of the fixmap area. */
862         addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
863         if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
864                 unsigned long adr = page[pud_index(addr)];
865
866                 addr_to_page(adr, page);
867                 copy_page(level2_fixmap_pgt, page);
868         }
869         level3_kernel_pgt[pud_index(addr)] =
870                 __pud(__pa_symbol(level2_fixmap_pgt) | _PAGE_TABLE);
871         level2_fixmap_pgt[pmd_index(addr)] =
872                 __pmd(__pa_symbol(level1_fixmap_pgt) | _PAGE_TABLE);
873
874         early_make_page_readonly(init_level4_pgt,
875                                  XENFEAT_writable_page_tables);
876         early_make_page_readonly(level3_kernel_pgt,
877                                  XENFEAT_writable_page_tables);
878         early_make_page_readonly(level3_user_pgt,
879                                  XENFEAT_writable_page_tables);
880         early_make_page_readonly(level2_fixmap_pgt,
881                                  XENFEAT_writable_page_tables);
882         early_make_page_readonly(level1_fixmap_pgt,
883                                  XENFEAT_writable_page_tables);
884
885         if (!xen_feature(XENFEAT_writable_page_tables))
886                 xen_pgd_pin(init_level4_pgt);
887 }
888
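/*
 * Called once init_memory_mapping() has established the permanent direct
 * mapping: switch xen_start_info, pt_base and the P->M list over to
 * direct-mapping addresses, unpin the boot page tables provided by the
 * hypervisor, and tear down their now-unused mappings beyond the kernel
 * image.
 */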
889 void __init xen_finish_init_mapping(void)
890 {
891         unsigned long start, end;
892         struct mmuext_op mmuext;
893
894         /* Re-vector virtual addresses pointing into the initial
895            mapping to the just-established permanent ones. */
896         xen_start_info = __va(__pa(xen_start_info));
897         xen_start_info->pt_base = (unsigned long)
898                 __va(__pa(xen_start_info->pt_base));
899         if (!xen_feature(XENFEAT_auto_translated_physmap)
900             && xen_start_info->mfn_list >= __START_KERNEL_map)
901                 phys_to_machine_mapping =
902                         __va(__pa(xen_start_info->mfn_list));
903
904         /* Unpin the no longer used Xen provided page tables. */
905         mmuext.cmd = MMUEXT_UNPIN_TABLE;
906         mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base);
907         if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
908                 BUG();
909
910         /* Destroy the Xen-created mappings beyond the kernel image. */
911         start = PAGE_ALIGN(_brk_end);
912         end   = __START_KERNEL_map + (pgt_buf_start << PAGE_SHIFT);
913         for (; start < end; start += PAGE_SIZE)
914                 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
915                         BUG();
916
917         WARN(pgt_buf_end != pgt_buf_top, "start=%lx cur=%lx top=%lx\n",
918              pgt_buf_start, pgt_buf_end, pgt_buf_top);
919         if (pgt_buf_end > pgt_buf_top)
920                 pgt_buf_top = pgt_buf_end;
921 }
922
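/*
 * Create the direct (__va) mapping for the physical range [start, end).
 * Page-table levels that Xen already set up are reused and extended in
 * place (see the PG_LEVEL_NUM note above); newly allocated levels are made
 * read-only and hooked up via the appropriate hypercalls before bootmem is
 * available.
 */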
923 unsigned long __meminit
924 kernel_physical_mapping_init(unsigned long start,
925                              unsigned long end,
926                              unsigned long page_size_mask)
927 {
928         bool pgd_changed = false;
929         unsigned long next, last_map_addr = end;
930         unsigned long addr;
931
932         start = (unsigned long)__va(start);
933         end = (unsigned long)__va(end);
934         addr = start;
935
936         for (; start < end; start = next) {
937                 pgd_t *pgd = pgd_offset_k(start);
938                 unsigned long pud_phys;
939                 pud_t *pud;
940
941                 next = (start + PGDIR_SIZE) & PGDIR_MASK;
942                 if (next > end)
943                         next = end;
944
945                 if (__pgd_val(*pgd)) {
946                         pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
947                         last_map_addr = phys_pud_init(pud, __pa(start),
948                                 __pa(end), page_size_mask | (1 << PG_LEVEL_NUM));
949                         unmap_low_page(pud);
950                         continue;
951                 }
952
953                 pud = alloc_low_page(&pud_phys);
954                 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
955                                                  page_size_mask);
956                 unmap_low_page(pud);
957
958                 if (!after_bootmem) {
959                         if (max_pfn_mapped)
960                                 make_page_readonly(__va(pud_phys),
961                                                    XENFEAT_writable_page_tables);
962                         xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
963                 } else {
964                         spin_lock(&init_mm.page_table_lock);
965                         pgd_populate(&init_mm, pgd, __va(pud_phys));
966                         spin_unlock(&init_mm.page_table_lock);
967                         pgd_changed = true;
968                 }
969         }
970
971         if (pgd_changed)
972                 sync_global_pgds(addr, end);
973
974         return last_map_addr;
975 }
976
977 #ifndef CONFIG_NUMA
978 void __init initmem_init(void)
979 {
980         memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
981 }
982 #endif
983
984 void __init paging_init(void)
985 {
986         sparse_memory_present_with_active_regions(MAX_NUMNODES);
987         sparse_init();
988
989         /*
990          * Clear the default state for node 0.
991          * Note: don't use nodes_clear() here; when NUMA support is not
992          *       compiled in it really clears the state, and a later
993          *       node_set_state() will not set it back.
994          */
995         node_clear_state(0, N_NORMAL_MEMORY);
996
997         zone_sizes_init();
998 }
999
1000 /*
1001  * Memory hotplug specific functions
1002  */
1003 #ifdef CONFIG_MEMORY_HOTPLUG
1004 /*
1005  * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
1006  * updating.
1007  */
1008 static void update_end_of_memory_vars(u64 start, u64 size)
1009 {
1010         unsigned long end_pfn = PFN_UP(start + size);
1011
1012         if (end_pfn > max_pfn) {
1013                 max_pfn = end_pfn;
1014                 max_low_pfn = end_pfn;
1015                 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
1016         }
1017 }
1018
1019 /*
1020  * Memory is always added to the NORMAL zone. This means you will never get
1021  * additional DMA/DMA32 memory.
1022  */
1023 int arch_add_memory(int nid, u64 start, u64 size)
1024 {
1025         struct pglist_data *pgdat = NODE_DATA(nid);
1026         struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
1027         unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
1028         unsigned long nr_pages = size >> PAGE_SHIFT;
1029         int ret;
1030
1031         last_mapped_pfn = init_memory_mapping(start, start + size);
1032         if (last_mapped_pfn > max_pfn_mapped)
1033                 max_pfn_mapped = last_mapped_pfn;
1034
1035         ret = __add_pages(nid, zone, start_pfn, nr_pages);
1036         WARN_ON_ONCE(ret);
1037
1038         /* update max_pfn, max_low_pfn and high_memory */
1039         update_end_of_memory_vars(start, size);
1040
1041         return ret;
1042 }
1043 EXPORT_SYMBOL_GPL(arch_add_memory);
1044
1045 #endif /* CONFIG_MEMORY_HOTPLUG */
1046
1047 static struct kcore_list kcore_vsyscall;
1048
1049 void __init mem_init(void)
1050 {
1051         long codesize, reservedpages, datasize, initsize;
1052         unsigned long absent_pages;
1053         unsigned long pfn;
1054
1055         pci_iommu_alloc();
1056
1057         /* clear_bss() has already cleared the empty_zero_page */
1058
1059         reservedpages = 0;
1060
1061         /* this will put all low memory onto the freelists */
1062 #ifdef CONFIG_NUMA
1063         totalram_pages = numa_free_all_bootmem();
1064 #else
1065         totalram_pages = free_all_bootmem();
1066 #endif
1067
1068         /* XEN: init pages outside initial allocation. */
1069         for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
1070                 ClearPageReserved(pfn_to_page(pfn));
1071                 init_page_count(pfn_to_page(pfn));
1072         }
1073
1074         absent_pages = absent_pages_in_range(0, max_pfn);
1075         reservedpages = max_pfn - totalram_pages - absent_pages;
1076         after_bootmem = 1;
1077
1078         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
1079         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
1080         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
1081
1082         /* Register memory areas for /proc/kcore */
1083         kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
1084                          VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
1085
1086         printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
1087                          "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
1088                 nr_free_pages() << (PAGE_SHIFT-10),
1089                 max_pfn << (PAGE_SHIFT-10),
1090                 codesize >> 10,
1091                 absent_pages << (PAGE_SHIFT-10),
1092                 reservedpages << (PAGE_SHIFT-10),
1093                 datasize >> 10,
1094                 initsize >> 10);
1095 }
1096
1097 #ifdef CONFIG_DEBUG_RODATA
1098 const int rodata_test_data = 0xC3;
1099 EXPORT_SYMBOL_GPL(rodata_test_data);
1100
1101 int kernel_set_to_readonly;
1102
1103 void set_kernel_text_rw(void)
1104 {
1105         unsigned long start = PFN_ALIGN(_text);
1106         unsigned long end = PFN_ALIGN(__stop___ex_table);
1107
1108         if (!kernel_set_to_readonly)
1109                 return;
1110
1111         pr_debug("Set kernel text: %lx - %lx for read write\n",
1112                  start, end);
1113
1114         /*
1115          * Make the kernel identity mapping for text RW. Kernel text
1116          * mapping will always be RO. Refer to the comment in
1117          * static_protections() in pageattr.c
1118          */
1119         set_memory_rw(start, (end - start) >> PAGE_SHIFT);
1120 }
1121
1122 void set_kernel_text_ro(void)
1123 {
1124         unsigned long start = PFN_ALIGN(_text);
1125         unsigned long end = PFN_ALIGN(__stop___ex_table);
1126
1127         if (!kernel_set_to_readonly)
1128                 return;
1129
1130         pr_debug("Set kernel text: %lx - %lx for read only\n",
1131                  start, end);
1132
1133         /*
1134          * Set the kernel identity mapping for text RO.
1135          */
1136         set_memory_ro(start, (end - start) >> PAGE_SHIFT);
1137 }
1138
1139 void mark_rodata_ro(void)
1140 {
1141         unsigned long start = PFN_ALIGN(_text);
1142         unsigned long rodata_start =
1143                 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
1144         unsigned long end = (unsigned long) &__end_rodata;
1145         unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
1146         unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
1147         unsigned long data_start = (unsigned long) &_sdata;
1148
1149         printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
1150                (end - start) >> 10);
1151         set_memory_ro(start, (end - start) >> PAGE_SHIFT);
1152
1153         kernel_set_to_readonly = 1;
1154
1155         /*
1156          * The rodata section (but not the kernel text!) should also be
1157          * non-executable.
1158          */
1159         set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
1160
1161         rodata_test();
1162
1163 #ifdef CONFIG_CPA_DEBUG
1164         printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
1165         set_memory_rw(start, (end-start) >> PAGE_SHIFT);
1166
1167         printk(KERN_INFO "Testing CPA: again\n");
1168         set_memory_ro(start, (end-start) >> PAGE_SHIFT);
1169 #endif
1170
1171         free_init_pages("unused kernel memory",
1172                         (unsigned long) page_address(virt_to_page(text_end)),
1173                         (unsigned long)
1174                                  page_address(virt_to_page(rodata_start)));
1175         free_init_pages("unused kernel memory",
1176                         (unsigned long) page_address(virt_to_page(rodata_end)),
1177                         (unsigned long) page_address(virt_to_page(data_start)));
1178 }
1179
1180 #endif
1181
1182 int kern_addr_valid(unsigned long addr)
1183 {
1184         unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1185         pgd_t *pgd;
1186         pud_t *pud;
1187         pmd_t *pmd;
1188         pte_t *pte;
1189
1190         if (above != 0 && above != -1UL)
1191                 return 0;
1192
1193 #ifdef CONFIG_XEN
1194         /*
1195          * Don't walk page tables for hypervisor addresses, but allow
1196          * the M2P table to be accessed through e.g. /proc/kcore.
1197          */
1198         if (addr >= (unsigned long)machine_to_phys_mapping &&
1199             addr < (unsigned long)(machine_to_phys_mapping +
1200                                    machine_to_phys_nr))
1201                 return 1;
1202         if (addr >= HYPERVISOR_VIRT_START && addr < HYPERVISOR_VIRT_END)
1203                 return 0;
1204 #endif
1205
1206         pgd = pgd_offset_k(addr);
1207         if (pgd_none(*pgd))
1208                 return 0;
1209
1210         pud = pud_offset(pgd, addr);
1211         if (pud_none(*pud))
1212                 return 0;
1213
1214         pmd = pmd_offset(pud, addr);
1215         if (pmd_none(*pmd))
1216                 return 0;
1217
1218         if (pmd_large(*pmd))
1219                 return pfn_valid(pmd_pfn(*pmd));
1220
1221         pte = pte_offset_kernel(pmd, addr);
1222         if (pte_none(*pte))
1223                 return 0;
1224
1225         return pfn_valid(pte_pfn(*pte));
1226 }
1227
1228 /*
1229  * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
1230  * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
1231  * not need special handling anymore:
1232  */
1233 static struct vm_area_struct gate_vma = {
1234         .vm_start       = VSYSCALL_START,
1235         .vm_end         = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
1236         .vm_page_prot   = PAGE_READONLY_EXEC,
1237         .vm_flags       = VM_READ | VM_EXEC
1238 };
1239
1240 struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
1241 {
1242 #ifdef CONFIG_IA32_EMULATION
1243         if (!mm || mm->context.ia32_compat)
1244                 return NULL;
1245 #endif
1246         return &gate_vma;
1247 }
1248
1249 int in_gate_area(struct mm_struct *mm, unsigned long addr)
1250 {
1251         struct vm_area_struct *vma = get_gate_vma(mm);
1252
1253         if (!vma)
1254                 return 0;
1255
1256         return (addr >= vma->vm_start) && (addr < vma->vm_end);
1257 }
1258
1259 /*
1260  * Use this when you have no reliable mm, typically from interrupt
1261  * context. It is less reliable than using a task's mm and may give
1262  * false positives.
1263  */
1264 int in_gate_area_no_mm(unsigned long addr)
1265 {
1266         return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
1267 }
1268
1269 const char *arch_vma_name(struct vm_area_struct *vma)
1270 {
1271         if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
1272                 return "[vdso]";
1273         if (vma == &gate_vma)
1274                 return "[vsyscall]";
1275         return NULL;
1276 }
1277
1278 #ifdef CONFIG_X86_UV
1279 unsigned long memory_block_size_bytes(void)
1280 {
1281         if (is_uv_system()) {
1282                 printk(KERN_INFO "UV: memory block size 2GB\n");
1283                 return 2UL * 1024 * 1024 * 1024;
1284         }
1285         return MIN_MEMORY_BLOCK_SIZE;
1286 }
1287 #endif
1288
1289 #ifdef CONFIG_SPARSEMEM_VMEMMAP
1290 /*
1291  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
1292  */
1293 static long __meminitdata addr_start, addr_end;
1294 static void __meminitdata *p_start, *p_end;
1295 static int __meminitdata node_start;
1296
1297 int __meminit
1298 vmemmap_populate(struct page *start_page, unsigned long size, int node)
1299 {
1300         unsigned long addr = (unsigned long)start_page;
1301         unsigned long end = (unsigned long)(start_page + size);
1302         unsigned long next;
1303         pgd_t *pgd;
1304         pud_t *pud;
1305         pmd_t *pmd;
1306
1307         for (; addr < end; addr = next) {
1308                 void *p = NULL;
1309
1310                 pgd = vmemmap_pgd_populate(addr, node);
1311                 if (!pgd)
1312                         return -ENOMEM;
1313
1314                 pud = vmemmap_pud_populate(pgd, addr, node);
1315                 if (!pud)
1316                         return -ENOMEM;
1317
1318                 if (!cpu_has_pse) {
1319                         next = (addr + PAGE_SIZE) & PAGE_MASK;
1320                         pmd = vmemmap_pmd_populate(pud, addr, node);
1321
1322                         if (!pmd)
1323                                 return -ENOMEM;
1324
1325                         p = vmemmap_pte_populate(pmd, addr, node);
1326
1327                         if (!p)
1328                                 return -ENOMEM;
1329
1330                         addr_end = addr + PAGE_SIZE;
1331                         p_end = p + PAGE_SIZE;
1332                 } else {
1333                         next = pmd_addr_end(addr, end);
1334
1335                         pmd = pmd_offset(pud, addr);
1336                         if (pmd_none(*pmd)) {
1337                                 pte_t entry;
1338
1339                                 p = vmemmap_alloc_block_buf(PMD_SIZE, node);
1340                                 if (!p)
1341                                         return -ENOMEM;
1342
1343                                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
1344                                                 PAGE_KERNEL_LARGE);
1345                                 set_pmd(pmd, __pmd_ma(__pte_val(entry)));
1346
1347                                 /* check to see if we have contiguous blocks */
1348                                 if (p_end != p || node_start != node) {
1349                                         if (p_start)
1350                                                 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1351                                                        addr_start, addr_end-1, p_start, p_end-1, node_start);
1352                                         addr_start = addr;
1353                                         node_start = node;
1354                                         p_start = p;
1355                                 }
1356
1357                                 addr_end = addr + PMD_SIZE;
1358                                 p_end = p + PMD_SIZE;
1359                         } else
1360                                 vmemmap_verify((pte_t *)pmd, node, addr, next);
1361                 }
1362
1363         }
1364         sync_global_pgds((unsigned long)start_page, end);
1365         return 0;
1366 }
1367
1368 void __meminit vmemmap_populate_print_last(void)
1369 {
1370         if (p_start) {
1371                 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
1372                         addr_start, addr_end-1, p_start, p_end-1, node_start);
1373                 p_start = NULL;
1374                 p_end = NULL;
1375                 node_start = 0;
1376         }
1377 }
1378 #endif