Update to 3.4-final.
[linux-flexiantxendom0-3.2.10.git] arch/x86/mm/init-xen.c
#include <linux/gfp.h>
#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>

#include <asm/cacheflush.h>
#include <asm/e820.h>
#include <asm/init.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h>            /* for MAX_DMA_PFN */

unsigned long __meminitdata pgt_buf_start;
unsigned long __meminitdata pgt_buf_end;
unsigned long __meminitdata pgt_buf_top;

int after_bootmem;

#if !defined(CONFIG_XEN)
int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
                                = 1
#endif
;
#elif defined(CONFIG_X86_32)
#define direct_gbpages 0
extern unsigned long extend_init_mapping(unsigned long tables_space);
#else
extern void xen_finish_init_mapping(void);
#endif

static void __init find_early_table_space(unsigned long end, int use_pse,
                                          int use_gbpages)
{
        unsigned long puds, pmds, ptes, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);

        if (use_gbpages) {
                unsigned long extra;

                extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
                pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
        } else
                pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;

        tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);

        if (use_pse) {
                unsigned long extra;

                extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
#ifdef CONFIG_X86_32
                extra += PMD_SIZE;
#endif
                ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
        } else
                ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;

        tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);

#ifdef CONFIG_X86_32
        /* for fixmap */
        tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);

        pgt_buf_start = extend_init_mapping(tables);
        pgt_buf_end = pgt_buf_start;
#else /* CONFIG_X86_64 */
        if (!pgt_buf_top) {
                pgt_buf_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
                        xen_start_info->nr_pt_frames;
                pgt_buf_end = pgt_buf_start;
        } else {
                /*
                 * [pgt_buf_start, pgt_buf_top) gets passed to the early
                 * reservation code, so we must not use pgt_buf_end here,
                 * despite continuing to allocate from there. pgt_buf_end
                 * possibly being below pgt_buf_start is, on the other hand,
                 * not a problem.
                 */
                pgt_buf_start = pgt_buf_top;
        }
#endif
        if (pgt_buf_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);

        printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
                end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
}
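
/*
 * Illustrative sizing, not from the original source: for a 4 GiB direct
 * mapping (end = 0x100000000) with neither PSE nor gbpages usable, the
 * estimate above works out to
 *   puds =       4  ->  32 bytes, rounded up to 1 page of PUDs
 *   pmds =    2048  ->  16 KiB,   i.e. 4 pages of PMDs
 *   ptes = 1048576  ->   8 MiB,   i.e. 2048 pages of PTEs
 * With PSE usable, only the tail below the last 2 MiB boundary (plus one
 * extra PMD_SIZE on 32-bit, covering the 4k-mapped head) still needs PTE
 * pages, so "tables" shrinks to a handful of pages.
 */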

void __init xen_pagetable_reserve(u64 start, u64 end)
{
        if (end > start)
                memblock_reserve(start, end - start);
}

struct map_range {
        unsigned long start;
        unsigned long end;
        unsigned page_size_mask;
};

#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
#define NR_RANGE_MR 5
#endif
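
/*
 * Descriptive note: NR_RANGE_MR is the worst-case number of distinct ranges
 * init_memory_mapping() below can generate -- on 32-bit a 4k head, a 2M body
 * and a 4k tail (3); on 64-bit a 4k head, a 2M stretch up to the first 1G
 * boundary, a 1G body, a 2M tail and a 4k tail (5).
 */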

static int __meminit save_mr(struct map_range *mr, int nr_range,
                             unsigned long start_pfn, unsigned long end_pfn,
                             unsigned long page_size_mask)
{
        if (start_pfn < end_pfn) {
                if (nr_range >= NR_RANGE_MR)
                        panic("run out of range for init_memory_mapping\n");
                mr[nr_range].start = start_pfn<<PAGE_SHIFT;
                mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
                mr[nr_range].page_size_mask = page_size_mask;
                nr_range++;
        }

        return nr_range;
}

/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory. To access them they are temporarily mapped.
 */
unsigned long __init_refok init_memory_mapping(unsigned long start,
                                               unsigned long end)
{
        unsigned long page_size_mask = 0;
        unsigned long start_pfn, end_pfn;
        unsigned long ret = 0;
        unsigned long pos;

        struct map_range mr[NR_RANGE_MR];
        int nr_range, i;
        int use_pse, use_gbpages;

        printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);

#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
        /*
         * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
         * This will simplify cpa(), which otherwise needs to support splitting
         * large pages into small in interrupt context, etc.
         */
        use_pse = use_gbpages = 0;
#else
        use_pse = cpu_has_pse;
        use_gbpages = direct_gbpages;
#endif

        /* Enable PSE if available */
        if (cpu_has_pse)
                set_in_cr4(X86_CR4_PSE);

        /* Enable PGE if available */
        if (cpu_has_pge) {
                set_in_cr4(X86_CR4_PGE);
                __supported_pte_mask |= _PAGE_GLOBAL;
        }

        if (use_gbpages)
                page_size_mask |= 1 << PG_LEVEL_1G;
        if (use_pse)
                page_size_mask |= 1 << PG_LEVEL_2M;

        memset(mr, 0, sizeof(mr));
        nr_range = 0;

        /* head if not big page aligned? */
        start_pfn = start >> PAGE_SHIFT;
        pos = start_pfn << PAGE_SHIFT;
#ifdef CONFIG_X86_32
        /*
         * Don't use a large page for the first 2/4MB of memory
         * because there are often fixed size MTRRs in there
         * and overlapping MTRRs into large pages can cause
         * slowdowns.
         */
        if (pos == 0)
                end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
        else
                end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
                                 << (PMD_SHIFT - PAGE_SHIFT);
#else /* CONFIG_X86_64 */
        end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
                        << (PMD_SHIFT - PAGE_SHIFT);
#endif
        if (end_pfn > (end >> PAGE_SHIFT))
                end_pfn = end >> PAGE_SHIFT;
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
                pos = end_pfn << PAGE_SHIFT;
        }

        /* big page (2M) range */
        start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
                         << (PMD_SHIFT - PAGE_SHIFT);
#ifdef CONFIG_X86_32
        end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
#else /* CONFIG_X86_64 */
        end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
                         << (PUD_SHIFT - PAGE_SHIFT);
        if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
                end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
#endif

        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
                pos = end_pfn << PAGE_SHIFT;
        }

#ifdef CONFIG_X86_64
        /* big page (1G) range */
        start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
                         << (PUD_SHIFT - PAGE_SHIFT);
        end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask &
                                 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
                pos = end_pfn << PAGE_SHIFT;
        }

        /* tail that is not big page (1G) aligned */
        start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
                         << (PMD_SHIFT - PAGE_SHIFT);
        end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
                pos = end_pfn << PAGE_SHIFT;
        }
#endif

        /* tail that is not big page (2M) aligned */
        start_pfn = pos>>PAGE_SHIFT;
        end_pfn = end>>PAGE_SHIFT;
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
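
        /*
         * Illustrative split, with made-up numbers (64-bit, PSE and gbpages
         * usable): mapping [0x1000, 0x140201000) ends up as
         *   0000001000 - 0000200000  4k  (head up to the first 2M boundary)
         *   0000200000 - 0040000000  2M  (up to the first 1G boundary)
         *   0040000000 - 0140000000  1G
         *   0140000000 - 0140200000  2M
         *   0140200000 - 0140201000  4k  (tail)
         * which is also the format of the KERN_DEBUG lines printed below.
         */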

        /* try to merge contiguous ranges with the same page size */
        for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
                unsigned long old_start;
                if (mr[i].end != mr[i+1].start ||
                    mr[i].page_size_mask != mr[i+1].page_size_mask)
                        continue;
                /* move it */
                old_start = mr[i].start;
                memmove(&mr[i], &mr[i+1],
                        (nr_range - 1 - i) * sizeof(struct map_range));
                mr[i--].start = old_start;
                nr_range--;
        }

        for (i = 0; i < nr_range; i++)
                printk(KERN_DEBUG " %010lx - %010lx page %s\n",
                                mr[i].start, mr[i].end,
                        (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
                         (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));

        /*
         * Find space for the kernel direct mapping tables.
         *
         * Later we should allocate these tables in the local node of the
         * memory mapped. Unfortunately this is done currently before the
         * nodes are discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end, use_pse, use_gbpages);

#ifdef CONFIG_X86_64
#define addr_to_page(addr)                                              \
        ((unsigned long *)                                              \
         ((mfn_to_pfn(((addr) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)      \
           << PAGE_SHIFT) + __START_KERNEL_map))
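
        /*
         * Descriptive note: addr_to_page() takes a pgd/pud entry (which under
         * Xen holds a machine address), masks off the flag bits, translates
         * the machine frame back to a pseudo-physical one via mfn_to_pfn()
         * and returns the referenced table as a virtual address inside the
         * __START_KERNEL_map alias, so the initial page tables set up by the
         * domain builder can be walked directly below.
         */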

        if (!start) {
                unsigned long addr, va = __START_KERNEL_map;
                unsigned long *page = (unsigned long *)init_level4_pgt;

                /* Kill mapping of memory below _text. */
                while (va < (unsigned long)&_text) {
                        if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
                                BUG();
                        va += PAGE_SIZE;
                }

                /* Blow away any spurious initial mappings. */
                va = __START_KERNEL_map + (pgt_buf_start << PAGE_SHIFT);

                addr = page[pgd_index(va)];
                page = addr_to_page(addr);
                addr = page[pud_index(va)];
                page = addr_to_page(addr);
                while (pmd_index(va) | pte_index(va)) {
                        if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
                                break;
                        if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
                                BUG();
                        va += PAGE_SIZE;
                }
        }
#undef addr_to_page
#endif

        for (i = 0; i < nr_range; i++)
                ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
                                                   mr[i].page_size_mask);

#ifdef CONFIG_X86_32
        early_ioremap_page_table_range_init();
#endif

#ifdef CONFIG_X86_64
        BUG_ON(pgt_buf_end > pgt_buf_top);
        if (!start)
                xen_finish_init_mapping();
        else
#endif
        if (pgt_buf_end < pgt_buf_top)
                /* Disable the 'pgt_buf_end' allocator. */
                pgt_buf_top = pgt_buf_end;

        __flush_tlb_all();

        /*
         * Reserve the kernel pagetable pages we used (pgt_buf_start -
         * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
         * so that they can be reused for other purposes.
         *
         * On native it just means calling memblock_reserve, on Xen it also
         * means marking RW the pagetable pages that we allocated before
         * but that haven't been used.
         *
         * In fact on Xen we mark RO the whole range pgt_buf_start -
         * pgt_buf_top, because we have to make sure that when
         * init_memory_mapping reaches the pagetable pages area, it maps
         * RO all the pagetable pages, including the ones that are beyond
         * pgt_buf_end at that time.
         */
        if (!after_bootmem && pgt_buf_top > pgt_buf_start) {
#ifdef CONFIG_X86_64
                reserve_pgtable_low();
#endif
                x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
                                PFN_PHYS(pgt_buf_top));
        }

        if (!after_bootmem)
                early_memtest(start, end);

        return ret >> PAGE_SHIFT;
}
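
/*
 * Hypothetical usage sketch, kept under "#if 0" so it is never built: the
 * early boot code (arch/x86/kernel/setup.c in this kernel series) typically
 * establishes the lowmem direct mapping with a call of roughly this shape,
 * recording how far the mapping extends.
 */
#if 0
        max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn << PAGE_SHIFT);
        max_pfn_mapped = max_low_pfn_mapped;
#endif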

/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of RAM because that
 * area contains BIOS code and data regions used by X and dosemu and similar
 * apps. Access has to be given to non-kernel-RAM areas as well; these
 * contain the PCI MMIO resources as well as potential BIOS/ACPI data
 * regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
        if (pagenr <= 256)
                return 1;
        if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
                return 0;
        if (mfn_to_local_pfn(pagenr) >= max_pfn)
                return 1;
        return 0;
}
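
/*
 * Descriptive note: under Xen the frame number passed in is treated as a
 * machine frame; mfn_to_local_pfn() yielding a pfn at or beyond max_pfn
 * means the frame is not part of this domain's RAM, so access is granted
 * much like for non-RAM ranges on native kernels.
 */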

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
        unsigned long addr;
        unsigned long begin_aligned, end_aligned;

        /* Make sure boundaries are page aligned */
        begin_aligned = PAGE_ALIGN(begin);
        end_aligned   = end & PAGE_MASK;

        if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
                begin = begin_aligned;
                end   = end_aligned;
        }

        if (begin >= end)
                return;

        addr = begin;

        /*
         * If debugging page accesses then do not free these pages but
         * mark them not present - any buggy init-section access will
         * create a kernel page fault:
         */
#ifdef CONFIG_DEBUG_PAGEALLOC
        printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
                begin, end);
        set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
        /*
         * We just marked the kernel text read-only above; now that we are
         * going to free part of it, we need to make it writable and
         * non-executable first.
         */
        set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
        set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);

        printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);

        for (; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
#ifdef CONFIG_X86_64
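                /*
                 * Descriptive note: for pages that also live in the high
                 * kernel mapping, re-establish a plain writable PAGE_KERNEL
                 * pte at the lowmem alias and drop the __START_KERNEL_map
                 * alias before the page is handed back to the allocator.
                 */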
                if (addr >= __START_KERNEL_map) {
                        /* make_readonly() reports all kernel addresses. */
                        if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
                                                         pfn_pte(__pa(addr) >> PAGE_SHIFT,
                                                                 PAGE_KERNEL),
                                                         0))
                                BUG();
                        if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
                                BUG();
                }
#endif
                free_page(addr);
                totalram_pages++;
        }
#endif
}

void free_initmem(void)
{
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        /*
         * end may not be page aligned, and we cannot align it here: the
         * decompressor could be confused by an aligned initrd_end. The
         * trailing partial page has already been reserved in
         *   - i386_start_kernel()
         *   - x86_64_start_kernel()
         *   - relocate_initrd()
         * so it is safe to PAGE_ALIGN(end) here and free that partial page.
         */
#ifdef CONFIG_ACPI_INITRD_TABLE_OVERRIDE
        if (acpi_initrd_offset)
                free_init_pages("initrd memory", start - acpi_initrd_offset,
                                PAGE_ALIGN(end));
        else
#endif
        free_init_pages("initrd memory", start, PAGE_ALIGN(end));
}
#endif

void __init zone_sizes_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

#ifdef CONFIG_ZONE_DMA
        max_zone_pfns[ZONE_DMA]         = MAX_DMA_PFN;
#endif
#ifdef CONFIG_ZONE_DMA32
        max_zone_pfns[ZONE_DMA32]       = MAX_DMA32_PFN;
#endif
        max_zone_pfns[ZONE_NORMAL]      = max_low_pfn;
#ifdef CONFIG_HIGHMEM
        max_zone_pfns[ZONE_HIGHMEM]     = max_pfn;
#endif

        free_area_init_nodes(max_zone_pfns);

        xen_init_pgd_pin();
}