arch/x86/kernel/machine_kexec_64.c
/*
 * handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>

#ifdef CONFIG_XEN

/*
 * Under Xen, override the hypervisor-aware page-table functions so that
 * we can build a regular identity-mapping page table.
 */

#include <xen/interface/kexec.h>
#include <xen/interface/memory.h>

#define x__pmd(x) ((pmd_t) { (x) } )
#define x__pud(x) ((pud_t) { (x) } )
#define x__pgd(x) ((pgd_t) { (x) } )

#define x_pmd_val(x)   ((x).pmd)
#define x_pud_val(x)   ((x).pud)
#define x_pgd_val(x)   ((x).pgd)

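/*
 * In the table built below, pud/pgd entries reference kernel-allocated
 * table pages, so their pseudo-physical addresses must be translated to
 * machine addresses with phys_to_machine() before the hypervisor can use
 * them.  pmd entries are copied verbatim: their values are already
 * machine addresses (see __ma() and init_level2_page() below).
 */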
static inline void x_set_pmd(pmd_t *dst, pmd_t val)
{
        x_pmd_val(*dst) = x_pmd_val(val);
}

static inline void x_set_pud(pud_t *dst, pud_t val)
{
        x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
}

static inline void x_pud_clear(pud_t *pud)
{
        x_pud_val(*pud) = 0;
}

static inline void x_set_pgd(pgd_t *dst, pgd_t val)
{
        x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
}

static inline void x_pgd_clear(pgd_t *pgd)
{
        x_pgd_val(*pgd) = 0;
}

#define X__PAGE_KERNEL_LARGE_EXEC \
        (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE)
#define X_KERNPG_TABLE \
        (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)

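/* Kernel virtual address -> machine address (pfn -> mfn translation). */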
#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)

#if PAGES_NR > KEXEC_XEN_NO_PAGES
#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
#endif

#if PA_CONTROL_PAGE != 0
#error PA_CONTROL_PAGE is non-zero - Xen support will break
#endif

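/*
 * Hand the hypervisor its view of the loaded image: the control page
 * (holding a copy of relocate_kernel) and the identity-map table page,
 * both as machine addresses.
 */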
void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
{
        void *control_page;
        void *table_page;

        memset(xki->page_list, 0, sizeof(xki->page_list));

        control_page = page_address(image->control_code_page) + PAGE_SIZE;
        memcpy(control_page, relocate_kernel, PAGE_SIZE);

        table_page = page_address(image->control_code_page);

        xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
        xki->page_list[PA_TABLE_PAGE] = __ma(table_page);

        if (image->type == KEXEC_TYPE_DEFAULT)
                xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
}

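/*
 * Under Xen the actual relocation is driven by the hypervisor; the
 * Xen-specific kexec entry points are textually included from a shared
 * file rather than duplicated per architecture width.
 */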
#include "machine_kexec_xen.c"

#else /* CONFIG_XEN */

#define x__pmd(x) __pmd(x)
#define x__pud(x) __pud(x)
#define x__pgd(x) __pgd(x)

#define x_set_pmd(x, y) set_pmd(x, y)
#define x_set_pud(x, y) set_pud(x, y)
#define x_set_pgd(x, y) set_pgd(x, y)

#define x_pud_clear(x) pud_clear(x)
#define x_pgd_clear(x) pgd_clear(x)

#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
#define X_KERNPG_TABLE _KERNPG_TABLE

#endif /* CONFIG_XEN */

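/*
 * Map the single 2MB large page covering @addr, allocating any missing
 * intermediate table pages from the image's control pages.  Needed for
 * image->start, which may lie outside the RAM identity map built below.
 */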
static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
                                unsigned long addr)
{
        pud_t *pud;
        pmd_t *pmd;
        struct page *page;
        int result = -ENOMEM;

        addr &= PMD_MASK;
        pgd += pgd_index(addr);
        if (!pgd_present(*pgd)) {
                page = kimage_alloc_control_pages(image, 0);
                if (!page)
                        goto out;
                pud = (pud_t *)page_address(page);
                clear_page(pud);
                set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
        }
        pud = pud_offset(pgd, addr);
        if (!pud_present(*pud)) {
                page = kimage_alloc_control_pages(image, 0);
                if (!page)
                        goto out;
                pmd = (pmd_t *)page_address(page);
                clear_page(pmd);
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
        }
        pmd = pmd_offset(pud, addr);
        if (!pmd_present(*pmd))
                x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
        result = 0;
out:
        return result;
}

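/*
 * Fill one pmd page with 2MB identity mappings covering the PUD_SIZE
 * region starting at @addr.
 */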
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
        unsigned long end_addr;

        addr &= PAGE_MASK;
        end_addr = addr + PUD_SIZE;
        while (addr < end_addr) {
                x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
                addr += PMD_SIZE;
        }
}

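/*
 * Populate one pud page: allocate and fill a pmd page for each PUD_SIZE
 * chunk below @last_addr, then clear any remaining entries.
 */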
static int init_level3_page(struct kimage *image, pud_t *level3p,
                                unsigned long addr, unsigned long last_addr)
{
        unsigned long end_addr;
        int result;

        result = 0;
        addr &= PAGE_MASK;
        end_addr = addr + PGDIR_SIZE;
        while ((addr < last_addr) && (addr < end_addr)) {
                struct page *page;
                pmd_t *level2p;

                page = kimage_alloc_control_pages(image, 0);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                level2p = (pmd_t *)page_address(page);
                init_level2_page(level2p, addr);
                x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
                addr += PUD_SIZE;
        }
        /* clear the unused entries */
        while (addr < end_addr) {
                x_pud_clear(level3p++);
                addr += PUD_SIZE;
        }
out:
        return result;
}

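/*
 * Top level of the identity-map builder: allocate and populate a pud
 * page for each PGDIR_SIZE chunk below @last_addr, clearing the rest of
 * the pgd.
 */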
static int init_level4_page(struct kimage *image, pgd_t *level4p,
                                unsigned long addr, unsigned long last_addr)
{
        unsigned long end_addr;
        int result;

        result = 0;
        addr &= PAGE_MASK;
        end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
        while ((addr < last_addr) && (addr < end_addr)) {
                struct page *page;
                pud_t *level3p;

                page = kimage_alloc_control_pages(image, 0);
                if (!page) {
                        result = -ENOMEM;
                        goto out;
                }
                level3p = (pud_t *)page_address(page);
                result = init_level3_page(image, level3p, addr, last_addr);
                if (result)
                        goto out;
                x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
                addr += PGDIR_SIZE;
        }
        /* clear the unused entries */
        while (addr < end_addr) {
                x_pgd_clear(level4p++);
                addr += PGDIR_SIZE;
        }
out:
        return result;
}

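/*
 * free_page() ignores a zero address, so this is safe to call even when
 * init_transition_pgtable() failed before all three levels were
 * allocated.
 */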
static void free_transition_pgtable(struct kimage *image)
{
        free_page((unsigned long)image->arch.pud);
        free_page((unsigned long)image->arch.pmd);
        free_page((unsigned long)image->arch.pte);
}

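/*
 * Map relocate_kernel's kernel virtual address to its physical copy in
 * the control page, so execution can continue at the same virtual
 * address after relocate_kernel switches to the identity page table.
 */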
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        unsigned long vaddr, paddr;
        int result = -ENOMEM;

        vaddr = (unsigned long)relocate_kernel;
        paddr = __pa(page_address(image->control_code_page) + PAGE_SIZE);
        pgd += pgd_index(vaddr);
        if (!pgd_present(*pgd)) {
                pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
                if (!pud)
                        goto err;
                image->arch.pud = pud;
                set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
        }
        pud = pud_offset(pgd, vaddr);
        if (!pud_present(*pud)) {
                pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
                if (!pmd)
                        goto err;
                image->arch.pmd = pmd;
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
        }
        pmd = pmd_offset(pud, vaddr);
        if (!pmd_present(*pmd)) {
                pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
                if (!pte)
                        goto err;
                image->arch.pte = pte;
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
        }
        pte = pte_offset_kernel(pmd, vaddr);
        set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
        return 0;
err:
        free_transition_pgtable(image);
        return result;
}

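/*
 * Build the complete relocation page table: an identity map of all RAM,
 * one large page covering image->start, and the transition mapping for
 * relocate_kernel.  Under Xen, RAM is identity-mapped by machine
 * address, so the limit comes from the hypervisor rather than max_pfn.
 */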
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
        pgd_t *level4p;
        int result;
        unsigned long x_max_pfn = max_pfn;

#ifdef CONFIG_XEN
        x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
#endif

        level4p = (pgd_t *)__va(start_pgtable);
        result = init_level4_page(image, level4p, 0, x_max_pfn << PAGE_SHIFT);
        if (result)
                return result;
        /*
         * image->start may be outside the 0 ~ max_pfn range, for example
         * when jumping back to the original kernel from the kexeced kernel.
         */
        result = init_one_level2_page(image, level4p, image->start);
        if (result)
                return result;
        return init_transition_pgtable(image, level4p);
}

int machine_kexec_prepare(struct kimage *image)
{
        unsigned long start_pgtable;
        int result;

        /* Calculate the offsets */
        start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;

        /* Set up the identity-mapped 64-bit page table */
        result = init_pgtable(image, start_pgtable);
        if (result)
                return result;

        return 0;
}

void machine_kexec_cleanup(struct kimage *image)
{
        free_transition_pgtable(image);
}

#ifndef CONFIG_XEN
/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
        unsigned long page_list[PAGES_NR];
        void *control_page;
        int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                save_processor_state();
#endif

        save_ftrace_enabled = __ftrace_enabled_save();

        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
        hw_breakpoint_disable();

        if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
                /*
                 * We need to put APICs in legacy mode so that we can
                 * get timer interrupts in the second kernel.  The
                 * kexec/kdump paths already call disable_IO_APIC() in
                 * one form or another; the kexec jump path needs one
                 * too.
                 */
                disable_IO_APIC();
#endif
        }

        control_page = page_address(image->control_code_page) + PAGE_SIZE;
        memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

        page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
        page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
        page_list[PA_TABLE_PAGE] =
          (unsigned long)__pa(page_address(image->control_code_page));

        if (image->type == KEXEC_TYPE_DEFAULT)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);

        /* now call it */
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
                                       image->start,
                                       image->preserve_context);

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                restore_processor_state();
#endif

        __ftrace_enabled_restore(save_ftrace_enabled);
}
#endif

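/*
 * Record the symbols that dump-analysis tools need to make sense of a
 * crash dump: phys_base for relocated kernels, the top-level page table,
 * and the NUMA node data when present.
 */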
void arch_crash_save_vmcoreinfo(void)
{
#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
        VMCOREINFO_SYMBOL(phys_base);
#endif
        VMCOREINFO_SYMBOL(init_level4_pgt);

#ifdef CONFIG_NUMA
        VMCOREINFO_SYMBOL(node_data);
        VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
}