Update to 3.4-final.
[linux-flexiantxendom0-3.2.10.git] / arch / x86 / kernel / process_64-xen.c
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *
4  *  Pentium III FXSR, SSE support
5  *      Gareth Hughes <gareth@valinux.com>, May 2000
6  *
7  *  X86-64 port
8  *      Andi Kleen.
9  *
10  *      CPU hotplug support - ashok.raj@intel.com
11  * 
12  *  Jun Nakajima <jun.nakajima@intel.com> 
13  *     Modified for Xen
14  */
15
16 /*
17  * This file handles the architecture-dependent parts of process handling..
18  */
19
20 #include <linux/cpu.h>
21 #include <linux/errno.h>
22 #include <linux/sched.h>
23 #include <linux/fs.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/notifier.h>
35 #include <linux/kprobes.h>
36 #include <linux/kdebug.h>
37 #include <linux/prctl.h>
38 #include <linux/uaccess.h>
39 #include <linux/io.h>
40 #include <linux/ftrace.h>
41
42 #include <asm/pgtable.h>
43 #include <asm/processor.h>
44 #include <asm/i387.h>
45 #include <asm/fpu-internal.h>
46 #include <asm/mmu_context.h>
47 #include <asm/prctl.h>
48 #include <xen/interface/physdev.h>
49 #include <asm/desc.h>
50 #include <asm/proto.h>
51 #include <asm/hardirq.h>
52 #include <asm/ia32.h>
53 #include <asm/idle.h>
54 #include <asm/syscalls.h>
55 #include <asm/debugreg.h>
56 #include <asm/switch_to.h>
57
58 asmlinkage extern void ret_from_fork(void);
59
/*
 * __show_regs - dump a task's register state to the console.
 * @regs: saved register frame to display
 * @all:  when non-zero, additionally dump control and debug registers
 *
 * Prints also some state that isn't saved in the pt_regs: the live
 * segment selectors and the FS/GS/shadow-GS bases (read from MSRs),
 * plus CR0/CR2/CR3/CR4 and DR0-DR7 when @all is set.
 */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	show_regs_common();
	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* Read the live segment selectors straight off the CPU. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("mov %%fs,%0" : "=r" (fsindex));
	asm("mov %%gs,%0" : "=r" (gsindex));

	/* On x86-64 the FS/GS bases live in MSRs, not in the descriptors. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
118
/*
 * xen_load_gs_index - load the user GS selector via a Xen hypercall.
 *
 * A PV guest cannot swap/load the user GS itself, so the selector change
 * is delegated to the hypervisor; failure is only warned about.
 */
void xen_load_gs_index(unsigned gs)
{
	WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
}
EXPORT_SYMBOL(xen_load_gs_index);
124
125 void release_thread(struct task_struct *dead_task)
126 {
127         if (dead_task->mm) {
128                 if (dead_task->mm->context.size) {
129                         printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
130                                         dead_task->comm,
131                                         dead_task->mm->context.ldt,
132                                         dead_task->mm->context.size);
133                         BUG();
134                 }
135         }
136 }
137
138 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
139 {
140         struct user_desc ud = {
141                 .base_addr = addr,
142                 .limit = 0xfffff,
143                 .seg_32bit = 1,
144                 .limit_in_pages = 1,
145                 .useable = 1,
146         };
147         struct desc_struct *desc = t->thread.tls_array;
148         desc += tls;
149         fill_ldt(desc, &ud);
150 }
151
/* Return the base address stored in TLS descriptor slot @tls of @t. */
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
156
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it: flush @tsk's live FPU state to memory so
 * that the copied task_struct carries a consistent snapshot.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
165
/*
 * copy_thread - set up the architecture-specific thread state of a new
 * task during fork/clone.
 * @clone_flags: clone(2) flags; CLONE_SETTLS triggers TLS setup below
 * @sp:          new user stack pointer (used when forking from user mode)
 * @unused:      unused on x86-64
 * @p:           the freshly allocated child task
 * @regs:        parent's register frame, copied to the child
 *
 * Returns 0 on success or a negative errno. On error any io_bitmap
 * allocated here is freed again before returning.
 */
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	/* Child's register frame sits at the top of its kernel stack. */
	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	/* The child sees 0 as the return value of fork/clone. */
	childregs->ax = 0;
	if (user_mode(regs))
		childregs->sp = sp;
	else
		/* Kernel thread: stack starts right below the frame. */
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);

	set_tsk_thread_flag(p, TIF_FORK);

	p->fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	/*
	 * Snapshot the parent's segment selectors/bases. A non-zero
	 * selector implies the base is held in the descriptor, so the
	 * saved 64-bit base is cleared in that case.
	 */
	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	/* Duplicate the parent's I/O permission bitmap, if it has one. */
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			/* 32-bit ABI passes a user_desc pointer in %esi. */
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			/* 64-bit ABI passes the new FS base in %r8. */
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	p->thread.iopl = current->thread.iopl;

	err = 0;
out:
	/* On failure, release the io_bitmap duplicated above. */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}
237
/*
 * start_thread_common - reset register and segment state for exec().
 * @regs:   register frame that will be restored on return to user mode
 * @new_ip: user-mode entry point
 * @new_sp: user-mode stack pointer
 * @_cs/@_ss/@_ds: code/stack/data segment selectors for the new image
 *
 * Clears FS/GS and loads fresh DS/ES, then rewrites the return frame so
 * the task resumes at the new program's entry with interrupts enabled.
 */
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
257
/* Start a new 64-bit user program at @new_ip with stack @new_sp. */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
264
265 #ifdef CONFIG_IA32_EMULATION
/*
 * Start a compat (ia32 or x32) user program. x32 tasks run with the
 * 64-bit code segment; plain ia32 tasks get the 32-bit one.
 */
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
273 #endif
274
275 /*
276  *      switch_to(x,y) should switch tasks from x to y.
277  *
278  * This could still be optimized:
279  * - fold all the options into a flag word and test it with a single test.
280  * - could test fs/gs bitsliced
281  *
282  * Kprobes not supported here. Set the probe on schedule instead.
283  * Function graph tracer not supported too.
284  */
/*
 * __switch_to - perform the x86-64 part of a context switch under Xen.
 * @prev_p: task being switched out
 * @next_p: task being switched in
 *
 * Unlike the native version, privileged state changes (sp0, TLS
 * descriptors, iopl, I/O bitmap) are batched into a single Xen
 * multicall to minimise hypercall overhead, then segment registers and
 * FPU state are switched. Returns @prev_p as required by switch_to().
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
#ifndef CONFIG_X86_NO_TSS
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
#endif
	fpu_switch_t fpu;
#if CONFIG_XEN_COMPAT > 0x030002
	struct physdev_set_iopl iopl_op;
	struct physdev_set_iobitmap iobmp_op;
#else
	/* Pre-3.0.3 hypervisors take physdev ops via a compat struct. */
	struct physdev_op _pdo[2], *pdo = _pdo;
#define iopl_op pdo->u.set_iopl
#define iobmp_op pdo->u.set_iobitmap
#endif
	/* Multicall batch; entries are appended below and flushed once. */
	multicall_entry_t _mcl[8], *mcl = _mcl;

	fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);

	/*
	 * Reload sp0.
	 * This is load_sp0(tss, next) with a multicall.
	 */
	mcl->op      = __HYPERVISOR_stack_switch;
	mcl->args[0] = __KERNEL_DS;
	mcl->args[1] = next->sp0;
	mcl++;

	/*
	 * Load the per-thread Thread-Local Storage descriptor.
	 * This is load_TLS(next, cpu) with multicalls.
	 * Only descriptors that actually differ are queued.
	 */
#define C(i) do {							\
	if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||	\
		     next->tls_array[i].b != prev->tls_array[i].b)) {	\
		mcl->op      = __HYPERVISOR_update_descriptor;		\
		mcl->args[0] = arbitrary_virt_to_machine(		\
			&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
		mcl->args[1] = *(u64 *)&next->tls_array[i];		\
		mcl++;							\
	}								\
} while (0)
	C(0); C(1); C(2);
#undef C

	/* Queue an iopl change only when prev and next differ. */
	if (unlikely(prev->iopl != next->iopl)) {
		iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
#if CONFIG_XEN_COMPAT > 0x030002
		mcl->op      = __HYPERVISOR_physdev_op;
		mcl->args[0] = PHYSDEVOP_set_iopl;
		mcl->args[1] = (unsigned long)&iopl_op;
#else
		mcl->op      = __HYPERVISOR_physdev_op_compat;
		pdo->cmd     = PHYSDEVOP_set_iopl;
		mcl->args[0] = (unsigned long)pdo++;
#endif
		mcl++;
	}

	/*
	 * Install next's I/O bitmap (or clear it, if only prev had one).
	 */
	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
		set_xen_guest_handle(iobmp_op.bitmap,
				     (char *)next->io_bitmap_ptr);
		iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
#if CONFIG_XEN_COMPAT > 0x030002
		mcl->op      = __HYPERVISOR_physdev_op;
		mcl->args[0] = PHYSDEVOP_set_iobitmap;
		mcl->args[1] = (unsigned long)&iobmp_op;
#else
		mcl->op      = __HYPERVISOR_physdev_op_compat;
		pdo->cmd     = PHYSDEVOP_set_iobitmap;
		mcl->args[0] = (unsigned long)pdo++;
#endif
		mcl++;
	}

	/* Guard against overrunning the fixed-size batches above. */
#if CONFIG_XEN_COMPAT <= 0x030002
	BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
#endif
	BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
	if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
		BUG();

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	if (unlikely(next->es))
		loadsegment(es, next->es);

	if (unlikely(next->ds))
		loadsegment(ds, next->ds);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload.  Also
	 * reload when it has changed.  When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(next->fsindex))
		loadsegment(fs, next->fsindex);

	if (next->fs)
		WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));

	if (unlikely(next->gsindex))
		load_gs_index(next->gsindex);

	if (next->gs)
		WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA context.
	 */
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p);

	return prev_p;
}
428
/*
 * set_personality_64bit - configure the current task as a native 64-bit
 * process (called at exec of a 64-bit binary).
 */
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit childs are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
448
449 void set_personality_ia32(bool x32)
450 {
451         /* inherit personality from parent */
452
453         /* Make sure to be in 32bit mode */
454         set_thread_flag(TIF_ADDR32);
455
456         /* Mark the associated mm as containing 32-bit tasks. */
457         if (current->mm)
458                 current->mm->context.ia32_compat = 1;
459
460         if (x32) {
461                 clear_thread_flag(TIF_IA32);
462                 set_thread_flag(TIF_X32);
463                 current->personality &= ~READ_IMPLIES_EXEC;
464                 /* is_compat_task() uses the presence of the x32
465                    syscall bit flag to determine compat status */
466                 current_thread_info()->status &= ~TS_COMPAT;
467         } else {
468                 set_thread_flag(TIF_IA32);
469                 clear_thread_flag(TIF_X32);
470                 current->personality |= force_personality32;
471                 /* Prepare the first "return" to user space */
472                 current_thread_info()->status |= TS_COMPAT;
473         }
474 }
475 EXPORT_SYMBOL_GPL(set_personality_ia32);
476
477 unsigned long get_wchan(struct task_struct *p)
478 {
479         unsigned long stack;
480         u64 fp, ip;
481         int count = 0;
482
483         if (!p || p == current || p->state == TASK_RUNNING)
484                 return 0;
485         stack = (unsigned long)task_stack_page(p);
486         if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
487                 return 0;
488         fp = *(u64 *)(p->thread.sp);
489         do {
490                 if (fp < (unsigned long)stack ||
491                     fp >= (unsigned long)stack+THREAD_SIZE)
492                         return 0;
493                 ip = *(u64 *)(fp+8);
494                 if (!in_sched_functions(ip))
495                         return ip;
496                 fp = *(u64 *)fp;
497         } while (count++ < 16);
498         return 0;
499 }
500
/*
 * do_arch_prctl - get/set the FS or GS segment base for @task.
 * @task: task to operate on (may differ from current, e.g. via ptrace
 *        or during copy_thread)
 * @code: ARCH_SET_FS / ARCH_SET_GS / ARCH_GET_FS / ARCH_GET_GS
 * @addr: new base for SET; user pointer to store the base for GET
 *
 * Bases that fit in 32 bits are installed via a GDT TLS slot (cheaper
 * to reload on context switch); larger bases go through the Xen
 * set_segment_base hypercall. Returns 0 or a negative errno.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	/* Only touch live CPU state when operating on ourselves. */
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = HYPERVISOR_set_segment_base(
					SEGBASE_GS_USER, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
								  addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		/* Prefer the live value for current; fall back to the
		   value saved in thread_struct for other tasks. */
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			/* The user GS base sits in the kernel-GS MSR
			   while we run in the kernel. */
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
597
/* arch_prctl(2) entry point: operate on the calling task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
602