- Update Xen patches to 3.3-rc5 and c/s 1157.
[linux-flexiantxendom0-3.2.10.git] arch/x86/mm/init_32-xen.c
1 /*
2  *
3  *  Copyright (C) 1995  Linus Torvalds
4  *
5  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6  */
7
8 #include <linux/module.h>
9 #include <linux/signal.h>
10 #include <linux/sched.h>
11 #include <linux/kernel.h>
12 #include <linux/errno.h>
13 #include <linux/string.h>
14 #include <linux/types.h>
15 #include <linux/ptrace.h>
16 #include <linux/mman.h>
17 #include <linux/mm.h>
18 #include <linux/hugetlb.h>
19 #include <linux/swap.h>
20 #include <linux/smp.h>
21 #include <linux/init.h>
22 #include <linux/highmem.h>
23 #include <linux/pagemap.h>
24 #include <linux/pci.h>
25 #include <linux/pfn.h>
26 #include <linux/poison.h>
27 #include <linux/bootmem.h>
28 #include <linux/memblock.h>
29 #include <linux/proc_fs.h>
30 #include <linux/memory_hotplug.h>
31 #include <linux/initrd.h>
32 #include <linux/cpumask.h>
33 #include <linux/gfp.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/scatterlist.h>
36
37 #include <asm/asm.h>
38 #include <asm/bios_ebda.h>
39 #include <asm/processor.h>
40 #include <asm/system.h>
41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h>
43 #include <asm/dma.h>
44 #include <asm/fixmap.h>
45 #include <asm/e820.h>
46 #include <asm/apic.h>
47 #include <asm/bugs.h>
48 #include <asm/tlb.h>
49 #include <asm/tlbflush.h>
50 #include <asm/olpc_ofw.h>
51 #include <asm/pgalloc.h>
52 #include <asm/sections.h>
53 #include <asm/hypervisor.h>
54 #include <asm/swiotlb.h>
55 #include <asm/setup.h>
56 #include <asm/cacheflush.h>
57 #include <asm/page_types.h>
58 #include <asm/init.h>
59
60 unsigned long highstart_pfn, highend_pfn;
61
62 static noinline int do_test_wp_bit(void);
63
64 bool __read_mostly __vmalloc_start_set = false;
65
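/*
 * Hand out the next page from the early page-table allocation area,
 * advancing pgt_buf_end towards pgt_buf_top, and return it zeroed.
 * Only usable before the bootmem allocator is up.
 */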
66 static __init void *alloc_low_page(void)
67 {
68         unsigned long pfn = pgt_buf_end++;
69         void *adr;
70
71         if (pfn >= pgt_buf_top)
72                 panic("alloc_low_page: ran out of memory");
73
74         adr = __va(pfn * PAGE_SIZE);
75         clear_page(adr);
76         return adr;
77 }
78
79 /*
80  * Creates a middle page table and puts a pointer to it in the
81  * given global directory entry. This only returns the gd entry
82  * in non-PAE compilation mode, since the middle layer is folded.
83  */
84 static pmd_t * __init one_md_table_init(pgd_t *pgd)
85 {
86         pud_t *pud;
87         pmd_t *pmd_table;
88
89 #ifdef CONFIG_X86_PAE
90         if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
91                 if (after_bootmem)
92                         pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
93                 else
94                         pmd_table = (pmd_t *)alloc_low_page();
95                 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
96                 make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
97                 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
98                 pud = pud_offset(pgd, 0);
99                 BUG_ON(pmd_table != pmd_offset(pud, 0));
100
101                 return pmd_table;
102         }
103 #endif
104         pud = pud_offset(pgd, 0);
105         pmd_table = pmd_offset(pud, 0);
106
107         return pmd_table;
108 }
109
110 /*
111  * Create a page table and place a pointer to it in a middle page
112  * directory entry:
113  */
114 static pte_t * __init one_page_table_init(pmd_t *pmd)
115 {
116 #if CONFIG_XEN_COMPAT <= 0x030002
117         if (pmd_none(*pmd)) {
118 #else
119         if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
120 #endif
121                 pte_t *page_table = NULL;
122
123                 if (after_bootmem) {
124 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
125                         page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
126 #endif
127                         if (!page_table)
128                                 page_table =
129                                 (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
130                 } else
131                         page_table = (pte_t *)alloc_low_page();
132
133                 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
134                 make_lowmem_page_readonly(page_table,
135                                           XENFEAT_writable_page_tables);
136                 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
137                 BUG_ON(page_table != pte_offset_kernel(pmd, 0));
138         }
139
140         return pte_offset_kernel(pmd, 0);
141 }
142
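/*
 * Return a pointer to the pmd entry covering 'vaddr' in the kernel
 * (swapper_pg_dir) page tables, allocating the intermediate pmd page
 * if necessary.
 */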
143 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
144 {
145         int pgd_idx = pgd_index(vaddr);
146         int pmd_idx = pmd_index(vaddr);
147
148         return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
149 }
150
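/*
 * Like populate_extra_pmd(), but return the pte entry for 'vaddr',
 * allocating the page table itself if necessary.
 */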
151 pte_t * __init populate_extra_pte(unsigned long vaddr)
152 {
153         int pte_idx = pte_index(vaddr);
154         pmd_t *pmd;
155
156         pmd = populate_extra_pmd(vaddr);
157         return one_page_table_init(pmd) + pte_idx;
158 }
159
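/*
 * Ensure the pte page backing the kmap fixmap range comes from the
 * contiguous pgt_buf area; if an earlier (fixmap) allocation put it
 * elsewhere, relocate it into a freshly allocated low page.
 */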
160 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
161                                            unsigned long vaddr, pte_t *lastpte)
162 {
163 #ifdef CONFIG_HIGHMEM
164         /*
165          * Something (early fixmap) may already have put a pte
166          * page here, which causes the page table allocation
167          * to become nonlinear. Attempt to fix it, and if it
168          * is still nonlinear then we have to bug.
169          * is still nonlinear then we have to BUG().
170         int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
171         int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
172
173         if (pmd_idx_kmap_begin != pmd_idx_kmap_end
174             && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
175             && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
176             && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
177                 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
178                 pte_t *newpte;
179                 int i;
180
181                 BUG_ON(after_bootmem);
182                 newpte = alloc_low_page();
183                 for (i = 0; i < PTRS_PER_PTE; i++)
184                         set_pte(newpte + i, pte[i]);
185
186                 paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
187                 make_lowmem_page_readonly(newpte,
188                                           XENFEAT_writable_page_tables);
189                 set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
190                 BUG_ON(newpte != pte_offset_kernel(pmd, 0));
191                 __flush_tlb_all();
192
193                 paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
194                 make_lowmem_page_writable(pte,
195                                           XENFEAT_writable_page_tables);
196                 pte = newpte;
197         }
198         BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
199                && vaddr > fix_to_virt(FIX_KMAP_END)
200                && lastpte && lastpte + PTRS_PER_PTE != pte);
201 #endif
202         return pte;
203 }
204
205 /*
206  * This function initializes a certain range of kernel virtual memory
207  * with new bootmem page tables wherever page tables are missing in
208  * the given range.
209  *
210  * NOTE: The page tables are allocated contiguously in physical space,
211  * so we can cache the location of the first one and move around
212  * without checking the pgd every time.
213  */
214 static void __init
215 page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
216 {
217         int pgd_idx, pmd_idx;
218         unsigned long vaddr;
219         pgd_t *pgd;
220         pmd_t *pmd;
221         pte_t *pte = NULL;
222
223         vaddr = start;
224         pgd_idx = pgd_index(vaddr);
225         pmd_idx = pmd_index(vaddr);
226         pgd = pgd_base + pgd_idx;
227
228         for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
229                 pmd = one_md_table_init(pgd);
230                 pmd = pmd + pmd_index(vaddr);
231                 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
232                                                         pmd++, pmd_idx++) {
233                         if (vaddr >= hypervisor_virt_start)
234                                 break;
235                         pte = page_table_kmap_check(one_page_table_init(pmd),
236                                                     pmd, vaddr, pte);
237
238                         vaddr += PMD_SIZE;
239                 }
240                 pmd_idx = 0;
241         }
242 }
243
244 static inline int is_kernel_text(unsigned long addr)
245 {
246         if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
247                 return 1;
248         return 0;
249 }
250
251 /*
252  * This maps the physical memory to kernel virtual address space, a total
253  * of max_low_pfn pages, by creating page tables starting from address
254  * PAGE_OFFSET:
255  */
256 unsigned long __init
257 kernel_physical_mapping_init(unsigned long start,
258                              unsigned long end,
259                              unsigned long page_size_mask)
260 {
261         int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
262         unsigned long last_map_addr = end;
263         unsigned long start_pfn, end_pfn;
264         pgd_t *pgd_base = swapper_pg_dir;
265         int pgd_idx, pmd_idx, pte_ofs;
266         unsigned long pfn;
267         pgd_t *pgd;
268         pmd_t *pmd;
269         pte_t *pte;
270         unsigned pages_2m, pages_4k;
271         int mapping_iter;
272
273         start_pfn = start >> PAGE_SHIFT;
274         end_pfn = end >> PAGE_SHIFT;
275
276         /*
277          * The first iteration will set up the identity mapping using large/small
278          * pages based on use_pse, with the other attributes the same as set by
279          * the early code in head_32.S.
280          *
281          * The second iteration will set up the appropriate attributes (NX, GLOBAL..)
282          * as desired for the kernel identity mapping.
283          *
284          * This two-pass mechanism conforms to the TLB app note which says:
285          *
286          *     "Software should not write to a paging-structure entry in a way
287          *      that would change, for any linear address, both the page size
288          *      and either the page frame or attributes."
289          */
290         mapping_iter = 1;
291
292         if (!cpu_has_pse) {
293                 use_pse = 0;
294                 mapping_iter = 0;
295         }
296
297 repeat:
298         pages_2m = pages_4k = 0;
299         pfn = start_pfn;
300         pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
301         pgd = pgd_base + pgd_idx;
302         for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
303 #ifdef CONFIG_XEN
304                 /*
305                  * Native Linux doesn't have PAE paging enabled yet at
306                  * this point.  When running as a Xen domain we are in
307                  * PAE mode already, so we can't simply hook in an empty
308                  * pmd.  That would kill the mappings we are currently
309                  * using ...
310                  */
311                 pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
312 #else
313                 pmd = one_md_table_init(pgd);
314 #endif
315
316                 if (pfn >= end_pfn)
317                         continue;
318 #ifdef CONFIG_X86_PAE
319                 pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
320                 pmd += pmd_idx;
321 #else
322                 pmd_idx = 0;
323 #endif
324                 for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
325                      pmd++, pmd_idx++) {
326                         unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
327
328                         if (addr >= hypervisor_virt_start)
329                                 continue;
330
331                         /*
332                          * Map with big pages if possible, otherwise
333                          * create normal page tables:
334                          */
335                         if (use_pse) {
336                                 unsigned int addr2;
337                                 pgprot_t prot = PAGE_KERNEL_LARGE;
338                                 /*
339                                  * first pass will use the same initial
340                                  * identity mapping attribute + _PAGE_PSE.
341                                  */
342                                 pgprot_t init_prot =
343                                         __pgprot(PTE_IDENT_ATTR |
344                                                  _PAGE_PSE);
345
346                                 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
347                                         PAGE_OFFSET + PAGE_SIZE-1;
348
349                                 if (is_kernel_text(addr) ||
350                                     is_kernel_text(addr2))
351                                         prot = PAGE_KERNEL_LARGE_EXEC;
352
353                                 pages_2m++;
354                                 if (mapping_iter == 1)
355                                         set_pmd(pmd, pfn_pmd(pfn, init_prot));
356                                 else
357                                         set_pmd(pmd, pfn_pmd(pfn, prot));
358
359                                 pfn += PTRS_PER_PTE;
360                                 continue;
361                         }
362                         pte = one_page_table_init(pmd);
363
364                         pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
365                         pte += pte_ofs;
366                         for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
367                              pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
368                                 pgprot_t prot = PAGE_KERNEL;
369                                 /*
370                                  * first pass will use the same initial
371                                  * identity mapping attribute.
372                                  */
373                                 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
374
375                                 /* XEN: Only map initial RAM allocation. */
376                                 if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
377                                         continue;
378                                 if (is_kernel_text(addr))
379                                         prot = PAGE_KERNEL_EXEC;
380
381                                 pages_4k++;
382                                 if (mapping_iter == 1) {
383                                         set_pte(pte, pfn_pte(pfn, init_prot));
384                                         last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
385                                 } else
386                                         set_pte(pte, pfn_pte(pfn, prot));
387                         }
388                 }
389         }
390         if (mapping_iter <= 1) {
391                 /*
392                  * update direct mapping page count only in the first
393                  * iteration.
394                  */
395                 update_page_count(PG_LEVEL_2M, pages_2m);
396                 update_page_count(PG_LEVEL_4K, pages_4k);
397         }
398         if (mapping_iter == 1) {
399                 /*
400                  * local, global-page TLB flush, which will flush the previous
401                  * mappings present in both the small- and large-page TLBs.
402                  */
403                 __flush_tlb_all();
404
405                 /*
406                  * Second iteration will set the actual desired PTE attributes.
407                  */
408                 mapping_iter = 2;
409                 goto repeat;
410         }
411         return last_map_addr;
412 }
413
414 pte_t *kmap_pte;
415 pgprot_t kmap_prot;
416
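/* Walk the kernel page tables and return the pte that maps 'vaddr'. */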
417 static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
418 {
419         return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
420                         vaddr), vaddr), vaddr);
421 }
422
423 static void __init kmap_init(void)
424 {
425         unsigned long kmap_vstart;
426
427         /*
428          * Cache the first kmap pte:
429          */
430         kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
431         kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
432
433         kmap_prot = PAGE_KERNEL;
434 }
435
436 #ifdef CONFIG_HIGHMEM
437 static void __init permanent_kmaps_init(pgd_t *pgd_base)
438 {
439         unsigned long vaddr;
440         pgd_t *pgd;
441         pud_t *pud;
442         pmd_t *pmd;
443         pte_t *pte;
444
445         vaddr = PKMAP_BASE;
446         page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
447
448         pgd = swapper_pg_dir + pgd_index(vaddr);
449         pud = pud_offset(pgd, vaddr);
450         pmd = pmd_offset(pud, vaddr);
451         pte = pte_offset_kernel(pmd, vaddr);
452         pkmap_page_table = pte;
453 }
454
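/* Hand one highmem page over to the buddy allocator and account for it. */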
455 static void __init add_one_highpage_init(struct page *page)
456 {
457         ClearPageReserved(page);
458         init_page_count(page);
459         __free_page(page);
460         totalhigh_pages++;
461 }
462
463 void __init add_highpages_with_active_regions(int nid,
464                          unsigned long start_pfn, unsigned long end_pfn)
465 {
466         phys_addr_t start, end;
467         u64 i;
468
469         for_each_free_mem_range(i, nid, &start, &end, NULL) {
470                 unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
471                                             start_pfn, end_pfn);
472                 unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
473                                               start_pfn, end_pfn);
474                 for ( ; pfn < e_pfn; pfn++)
475                         if (pfn_valid(pfn))
476                                 add_one_highpage_init(pfn_to_page(pfn));
477         }
478 }
479 #else
480 static inline void permanent_kmaps_init(pgd_t *pgd_base)
481 {
482 }
483 #endif /* CONFIG_HIGHMEM */
484
485 pgd_t *swapper_pg_dir;
486
487 /*
488  * Build a proper pagetable for the kernel mappings.  Up until this
489  * point, we've been running on some set of pagetables constructed by
490  * the boot process.
491  *
492  * If we're booting on native hardware, this will be a pagetable
493  * constructed in arch/x86/kernel/head_32.S.  The root of the
494  * pagetable will be swapper_pg_dir.
495  *
496  * If we're booting paravirtualized under a hypervisor, then there are
497  * more options: we may already be running PAE, and the pagetable may
498  * or may not be based in swapper_pg_dir.  In any case,
499  * paravirt_pagetable_setup_start() will set up swapper_pg_dir
500  * appropriately for the rest of the initialization to work.
501  *
502  * In general, pagetable_init() assumes that the pagetable may already
503  * be partially populated, and so it avoids stomping on any existing
504  * mappings.
505  */
506 void __init early_ioremap_page_table_range_init(void)
507 {
508         pgd_t *pgd_base = swapper_pg_dir;
509         unsigned long vaddr, end;
510
511         /*
512          * Fixed mappings, only the page table structure has to be
513          * created - mappings will be set by set_fixmap():
514          */
515         vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
516         end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
517         page_table_range_init(vaddr, end, pgd_base);
518         early_ioremap_reset();
519 }
520
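/*
 * Under Xen the page tables handed over by the domain builder
 * (xen_start_info->pt_base) are already in use as the kernel page
 * tables; only the permanent kmap page tables still need to be set up.
 */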
521 static void __init pagetable_init(void)
522 {
523         pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
524
525         permanent_kmaps_init(pgd_base);
526 }
527
528 pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
529 EXPORT_SYMBOL_GPL(__supported_pte_mask);
530
531 /* user-defined highmem size */
532 static unsigned int highmem_pages = -1;
533
534 /*
535  * highmem=size forces highmem to be exactly 'size' bytes.
536  * This works even on boxes that have no highmem otherwise.
537  * This also works to reduce highmem size on bigger boxes.
538  */
539 static int __init parse_highmem(char *arg)
540 {
541         if (!arg)
542                 return -EINVAL;
543
544         highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
545         return 0;
546 }
547 early_param("highmem", parse_highmem);
548
549 #define MSG_HIGHMEM_TOO_BIG \
550         "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
551
552 #define MSG_LOWMEM_TOO_SMALL \
553         "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
554 /*
555  * All of RAM fits into lowmem - but if the user wants highmem
556  * artificially via the highmem=x boot parameter then create
557  * it:
558  */
559 void __init lowmem_pfn_init(void)
560 {
561         /* max_low_pfn is 0, we already have early_res support */
562         max_low_pfn = max_pfn;
563
564         if (highmem_pages == -1)
565                 highmem_pages = 0;
566 #ifdef CONFIG_HIGHMEM
567         if (highmem_pages >= max_pfn) {
568                 printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
569                         pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
570                 highmem_pages = 0;
571         }
572         if (highmem_pages) {
573                 if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
574                         printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
575                                 pages_to_mb(highmem_pages));
576                         highmem_pages = 0;
577                 }
578                 max_low_pfn -= highmem_pages;
579         }
580 #else
581         if (highmem_pages)
582                 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
583 #endif
584 }
585
586 #define MSG_HIGHMEM_TOO_SMALL \
587         "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
588
589 #define MSG_HIGHMEM_TRIMMED \
590         "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
591 /*
592  * We have more RAM than fits into lowmem - we try to put it into
593  * highmem, also taking the highmem=x boot parameter into account:
594  */
595 void __init highmem_pfn_init(void)
596 {
597         max_low_pfn = MAXMEM_PFN;
598
599         if (highmem_pages == -1)
600                 highmem_pages = max_pfn - MAXMEM_PFN;
601
602         if (highmem_pages + MAXMEM_PFN < max_pfn)
603                 max_pfn = MAXMEM_PFN + highmem_pages;
604
605         if (highmem_pages + MAXMEM_PFN > max_pfn) {
606                 printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
607                         pages_to_mb(max_pfn - MAXMEM_PFN),
608                         pages_to_mb(highmem_pages));
609                 highmem_pages = 0;
610         }
611 #ifndef CONFIG_HIGHMEM
612         /* Maximum memory usable is what is directly addressable */
613         printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
614         if (max_pfn > MAX_NONPAE_PFN)
615                 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
616         else
617                 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
618         max_pfn = MAXMEM_PFN;
619 #else /* !CONFIG_HIGHMEM */
620 #ifndef CONFIG_HIGHMEM64G
621         if (max_pfn > MAX_NONPAE_PFN) {
622                 max_pfn = MAX_NONPAE_PFN;
623                 printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
624         }
625 #endif /* !CONFIG_HIGHMEM64G */
626 #endif /* !CONFIG_HIGHMEM */
627 }
628
629 /*
630  * Determine low and high memory ranges:
631  */
632 void __init find_low_pfn_range(void)
633 {
634         /* it could update max_pfn */
635
636         if (max_pfn <= MAXMEM_PFN)
637                 lowmem_pfn_init();
638         else
639                 highmem_pfn_init();
640 }
641
642 #ifndef CONFIG_NEED_MULTIPLE_NODES
643 void __init initmem_init(void)
644 {
645 #ifdef CONFIG_HIGHMEM
646         highstart_pfn = highend_pfn = max_pfn;
647         if (max_pfn > max_low_pfn)
648                 highstart_pfn = max_low_pfn;
649         printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
650                 pages_to_mb(highend_pfn - highstart_pfn));
651         num_physpages = highend_pfn;
652         high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
653 #else
654         num_physpages = max_low_pfn;
655         high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
656 #endif
657
658         memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
659         sparse_memory_present_with_active_regions(0);
660
661 #ifdef CONFIG_FLATMEM
662         max_mapnr = num_physpages;
663 #endif
664         __vmalloc_start_set = true;
665
666         printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
667                         pages_to_mb(max_low_pfn));
668
669         setup_bootmem_allocator();
670 }
671 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
672
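/*
 * Report the already-mapped low-ram range and flip after_bootmem so
 * that later page-table allocations go through the bootmem allocator.
 */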
673 void __init setup_bootmem_allocator(void)
674 {
675         printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
676                  max_pfn_mapped<<PAGE_SHIFT);
677         printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
678
679         after_bootmem = 1;
680 }
681
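/*
 * XEN: Extend the initial mappings set up by the domain builder so that
 * they cover the kernel image plus 'tables_space' bytes of new page
 * tables, taking frames from just past the initial page tables.  Any
 * leftover initial mappings beyond that range are torn down, the frames
 * consumed here are reserved in memblock, and the first unused pfn is
 * returned.
 */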
682 unsigned long __init extend_init_mapping(unsigned long tables_space)
683 {
684         unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
685                                   + xen_start_info->nr_pt_frames;
686         unsigned long start = start_pfn, va = (unsigned long)&_text;
687         pgd_t *pgd;
688         pud_t *pud;
689         pmd_t *pmd;
690         pte_t *pte;
691
692         /* Ensure init mappings cover kernel text/data and initial tables. */
693         while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
694                 pgd = pgd_offset_k(va);
695                 pud = pud_offset(pgd, va);
696                 pmd = pmd_offset(pud, va);
697                 if (pmd_none(*pmd)) {
698                         unsigned long pa = start_pfn++ << PAGE_SHIFT;
699
700                         clear_page(__va(pa));
701                         make_lowmem_page_readonly(__va(pa),
702                                                   XENFEAT_writable_page_tables);
703                         xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
704                 }
705                 pte = pte_offset_kernel(pmd, va);
706                 if (pte_none(*pte)) {
707                         pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
708
709                         if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
710                                 BUG();
711                 }
712                 va += PAGE_SIZE;
713         }
714
715         /* Finally, blow away any spurious initial mappings. */
716         while (1) {
717                 pgd = pgd_offset_k(va);
718                 pud = pud_offset(pgd, va);
719                 pmd = pmd_offset(pud, va);
720                 if (pmd_none(*pmd))
721                         break;
722                 if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
723                         BUG();
724                 va += PAGE_SIZE;
725         }
726
727         if (start_pfn > start)
728                 memblock_reserve(PFN_PHYS(start), PFN_PHYS(start_pfn - start));
729
730         return start_pfn;
731 }
732
733 /*
734  * paging_init() sets up the page tables - note that the first 8MB are
735  * already mapped by head.S.
736  *
737  * This routine also unmaps the page at virtual kernel address 0, so
738  * that we can trap those pesky NULL-reference errors in the kernel.
739  */
740 void __init paging_init(void)
741 {
742         pagetable_init();
743
744         __flush_tlb_all();
745
746         kmap_init();
747
748         /*
749          * NOTE: at this point the bootmem allocator is fully available.
750          */
751         olpc_dt_build_devicetree();
752         sparse_memory_present_with_active_regions(MAX_NUMNODES);
753         sparse_init();
754         zone_sizes_init();
755 }
756
757 /*
758  * Test if the WP bit works in supervisor mode. It isn't supported on 386's
759  * and also on some strange 486's. All 586+'s are OK. This used to involve
760  * black magic jumps to work around some nasty CPU bugs, but fortunately the
761  * switch to using exceptions got rid of all that.
762  */
763 static void __init test_wp_bit(void)
764 {
765         printk(KERN_INFO
766   "Checking if this processor honours the WP bit even in supervisor mode...");
767
768         /* Any page-aligned address will do, the test is non-destructive */
769         __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
770         boot_cpu_data.wp_works_ok = do_test_wp_bit();
771         clear_fixmap(FIX_WP_TEST);
772
773         if (!boot_cpu_data.wp_works_ok) {
774                 printk(KERN_CONT "No.\n");
775 #ifdef CONFIG_X86_WP_WORKS_OK
776                 panic(
777   "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
778 #endif
779         } else {
780                 printk(KERN_CONT "Ok.\n");
781         }
782 }
783
784 void __init mem_init(void)
785 {
786         int codesize, reservedpages, datasize, initsize;
787         int tmp;
788         unsigned long pfn;
789
790         pci_iommu_alloc();
791
792 #ifdef CONFIG_FLATMEM
793         BUG_ON(!mem_map);
794 #endif
795         /*
796          * With CONFIG_DEBUG_PAGEALLOC, initialization of highmem pages has to
797          * be done before free_all_bootmem(). Memblock uses free low memory for
798          * temporary data (see find_range_array()) and for this purpose can use
799          * pages that were already passed to the buddy allocator, and hence are
800          * marked as not accessible in the page tables when compiled with
801          * CONFIG_DEBUG_PAGEALLOC. Otherwise the order of initialization is not
802          * important here.
803          */
804         set_highmem_pages_init();
805
806         /* this will put all low memory onto the freelists */
807         totalram_pages += free_all_bootmem();
808         /* XEN: init low-mem pages outside initial allocation. */
809         for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
810                 ClearPageReserved(pfn_to_page(pfn));
811                 init_page_count(pfn_to_page(pfn));
812         }
813
814         reservedpages = 0;
815         for (tmp = 0; tmp < max_low_pfn; tmp++)
816                 /*
817                  * Only count reserved RAM pages:
818                  */
819                 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
820                         reservedpages++;
821
822         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
823         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
824         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
825
826         printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
827                         "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
828                 nr_free_pages() << (PAGE_SHIFT-10),
829                 num_physpages << (PAGE_SHIFT-10),
830                 codesize >> 10,
831                 reservedpages << (PAGE_SHIFT-10),
832                 datasize >> 10,
833                 initsize >> 10,
834                 totalhigh_pages << (PAGE_SHIFT-10));
835
836         printk(KERN_INFO "virtual kernel memory layout:\n"
837                 "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
838 #ifdef CONFIG_HIGHMEM
839                 "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
840 #endif
841                 "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
842                 "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
843                 "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
844                 "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
845                 "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
846                 FIXADDR_START, FIXADDR_TOP,
847                 (FIXADDR_TOP - FIXADDR_START) >> 10,
848
849 #ifdef CONFIG_HIGHMEM
850                 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
851                 (LAST_PKMAP*PAGE_SIZE) >> 10,
852 #endif
853
854                 VMALLOC_START, VMALLOC_END,
855                 (VMALLOC_END - VMALLOC_START) >> 20,
856
857                 (unsigned long)__va(0), (unsigned long)high_memory,
858                 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
859
860                 (unsigned long)&__init_begin, (unsigned long)&__init_end,
861                 ((unsigned long)&__init_end -
862                  (unsigned long)&__init_begin) >> 10,
863
864                 (unsigned long)&_etext, (unsigned long)&_edata,
865                 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
866
867                 (unsigned long)&_text, (unsigned long)&_etext,
868                 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
869
870         /*
871          * Check boundaries twice: Some fundamental inconsistencies can
872          * be detected at build time already.
873          */
874 #define __FIXADDR_TOP (-PAGE_SIZE)
875 #ifdef CONFIG_HIGHMEM
876         BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE  > FIXADDR_START);
877         BUILD_BUG_ON(VMALLOC_END                        > PKMAP_BASE);
878 #endif
879 #define high_memory (-128UL << 20)
880         BUILD_BUG_ON(VMALLOC_START                      >= VMALLOC_END);
881 #undef high_memory
882 #undef __FIXADDR_TOP
883
884 #ifdef CONFIG_HIGHMEM
885         BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);
886         BUG_ON(VMALLOC_END                              > PKMAP_BASE);
887 #endif
888         BUG_ON(VMALLOC_START                            >= VMALLOC_END);
889         BUG_ON((unsigned long)high_memory               > VMALLOC_START);
890
891         if (boot_cpu_data.wp_works_ok < 0)
892                 test_wp_bit();
893 }
894
895 #ifdef CONFIG_MEMORY_HOTPLUG
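/* On 32-bit, hot-added memory is always placed in ZONE_HIGHMEM. */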
896 int arch_add_memory(int nid, u64 start, u64 size)
897 {
898         struct pglist_data *pgdata = NODE_DATA(nid);
899         struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
900         unsigned long start_pfn = start >> PAGE_SHIFT;
901         unsigned long nr_pages = size >> PAGE_SHIFT;
902
903         return __add_pages(nid, zone, start_pfn, nr_pages);
904 }
905 #endif
906
907 /*
908  * This function cannot be __init, since exceptions don't work in that
909  * section.  Put this after the callers, so that it cannot be inlined.
910  */
911 static noinline int do_test_wp_bit(void)
912 {
913         char tmp_reg;
914         int flag;
915
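        /*
         * Read a byte from the read-only FIX_WP_TEST mapping and try to
         * write it back.  If the CPU honours WP in supervisor mode, the
         * write faults and the exception fixup skips the xorl, leaving
         * flag at its initial value of 1; otherwise flag is cleared to 0.
         */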
916         __asm__ __volatile__(
917                 "       movb %0, %1     \n"
918                 "1:     movb %1, %0     \n"
919                 "       xorl %2, %2     \n"
920                 "2:                     \n"
921                 _ASM_EXTABLE(1b,2b)
922                 :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
923                  "=q" (tmp_reg),
924                  "=r" (flag)
925                 :"2" (1)
926                 :"memory");
927
928         return flag;
929 }
930
931 #ifdef CONFIG_DEBUG_RODATA
932 const int rodata_test_data = 0xC3;
933 EXPORT_SYMBOL_GPL(rodata_test_data);
934
935 int kernel_set_to_readonly __read_mostly;
936
937 void set_kernel_text_rw(void)
938 {
939         unsigned long start = PFN_ALIGN(_text);
940         unsigned long size = PFN_ALIGN(_etext) - start;
941
942         if (!kernel_set_to_readonly)
943                 return;
944
945         pr_debug("Set kernel text: %lx - %lx for read write\n",
946                  start, start+size);
947
948         set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
949 }
950
951 void set_kernel_text_ro(void)
952 {
953         unsigned long start = PFN_ALIGN(_text);
954         unsigned long size = PFN_ALIGN(_etext) - start;
955
956         if (!kernel_set_to_readonly)
957                 return;
958
959         pr_debug("Set kernel text: %lx - %lx for read only\n",
960                  start, start+size);
961
962         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
963 }
964
965 static void mark_nxdata_nx(void)
966 {
967         /*
968          * When this is called, init has already been executed and released,
969          * so everything past _etext should be NX.
970          */
971         unsigned long start = PFN_ALIGN(_etext);
972         /*
973          * This comes from the is_kernel_text() upper limit, rounded to a huge-page boundary where huge pages are used:
974          */
975         unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
976
977         if (__supported_pte_mask & _PAGE_NX)
978                 printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
979         set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
980 }
981
982 void mark_rodata_ro(void)
983 {
984         unsigned long start = PFN_ALIGN(_text);
985         unsigned long size = PFN_ALIGN(_etext) - start;
986
987         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
988         printk(KERN_INFO "Write protecting the kernel text: %luk\n",
989                 size >> 10);
990
991         kernel_set_to_readonly = 1;
992
993 #ifdef CONFIG_CPA_DEBUG
994         printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
995                 start, start+size);
996         set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
997
998         printk(KERN_INFO "Testing CPA: write protecting again\n");
999         set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
1000 #endif
1001
1002         start += size;
1003         size = (unsigned long)__end_rodata - start;
1004         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1005         printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
1006                 size >> 10);
1007         rodata_test();
1008
1009 #ifdef CONFIG_CPA_DEBUG
1010         printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
1011         set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
1012
1013         printk(KERN_INFO "Testing CPA: write protecting again\n");
1014         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1015 #endif
1016         mark_nxdata_nx();
1017 }
1018 #endif
1019