2 * Copyright (C) 1995 Linus Torvalds
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
10 * CPU hotplug support - ashok.raj@intel.com
12 * Jun Nakajima <jun.nakajima@intel.com>
17 * This file handles the architecture-dependent parts of process handling.
20 #include <linux/stackprotector.h>
21 #include <linux/cpu.h>
22 #include <linux/errno.h>
23 #include <linux/sched.h>
25 #include <linux/kernel.h>
27 #include <linux/elfcore.h>
28 #include <linux/smp.h>
29 #include <linux/slab.h>
30 #include <linux/user.h>
31 #include <linux/interrupt.h>
32 #include <linux/delay.h>
33 #include <linux/module.h>
34 #include <linux/ptrace.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39 #include <linux/prctl.h>
40 #include <linux/uaccess.h>
42 #include <linux/ftrace.h>
43 #include <linux/cpuidle.h>
45 #include <asm/pgtable.h>
46 #include <asm/system.h>
47 #include <asm/processor.h>
49 #include <asm/mmu_context.h>
50 #include <asm/prctl.h>
51 #include <xen/interface/physdev.h>
53 #include <asm/proto.h>
54 #include <asm/hardirq.h>
57 #include <asm/syscalls.h>
58 #include <asm/debugreg.h>
/* Assembly-side return path a freshly forked task resumes through. */
61 asmlinkage extern void ret_from_fork(void);
/* Per-CPU flag: non-zero while this CPU sits in its idle loop (see __exit_idle). */
63 static DEFINE_PER_CPU(unsigned char, is_idle);
/* Atomic notifier chain fired with IDLE_START / IDLE_END events below. */
65 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
/*
 * Register a callback on the idle notifier chain.  The chain is an
 * atomic notifier chain, so callbacks must be safe in atomic context.
 */
67 void idle_notifier_register(struct notifier_block *n)
69 atomic_notifier_chain_register(&idle_notifier, n);
71 EXPORT_SYMBOL_GPL(idle_notifier_register);
/*
 * Remove a callback previously added with idle_notifier_register().
 */
73 void idle_notifier_unregister(struct notifier_block *n)
75 atomic_notifier_chain_unregister(&idle_notifier, n);
77 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
/* Body of enter_idle(): mark this CPU idle and fire IDLE_START notifiers.
 * NOTE(review): the enclosing function header is not visible in this fragment. */
81 percpu_write(is_idle, 1);
82 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
/*
 * Clear the per-CPU idle flag and fire IDLE_END notifiers.  The
 * test-and-clear makes repeated calls idempotent: only the first call
 * after enter_idle() sees the bit set and emits the notification.
 */
85 static void __exit_idle(void)
87 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
89 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
92 /* Called from interrupts to signify idle end */
95 /* idle loop has pid 0 */
/* NOTE(review): body not visible here -- presumably parks an offlined CPU;
 * confirm against the full source. */
102 static inline void play_dead(void)
109 * The idle thread. There's no useful work to be
110 * done, so just try to conserve power and have a
111 * low exit latency (ie sit in a loop waiting for
112 * somebody to say that they'd like to reschedule)
/* Advertise polling so remote CPUs can wake us by setting need_resched
 * instead of sending an IPI. */
116 current_thread_info()->status |= TS_POLLING;
119 * If we're the non-boot CPU, nothing set the stack canary up
120 * for us. CPU0 already has it initialized but no harm in
121 * doing it again. This is a good place for updating it, as
122 * we wont ever return from this function (so the invalid
123 * canaries already on the stack wont ever trigger).
125 boot_init_stack_canary();
127 /* endless idle loop with no priority at all */
129 tick_nohz_idle_enter();
130 while (!need_resched()) {
/* Offlined CPUs fall through to play_dead() (call site not visible). */
134 if (cpu_is_offline(smp_processor_id()))
137 * Idle routines should keep interrupts disabled
138 * from here on, until they go to idle.
139 * Otherwise, idle callbacks can misfire.
144 /* Don't trace irqs off for idle */
145 stop_critical_timings();
147 /* enter_idle() needs rcu for notifiers */
/* cpuidle governor picks the idle state; non-zero return means it
 * declined and a fallback idle routine is used (fallback not visible). */
150 if (cpuidle_idle_call())
154 start_critical_timings();
156 /* In many cases the interrupt that ended idle
157 has already called exit_idle. But some idle
158 loops can be woken up without interrupt. */
162 tick_nohz_idle_exit();
163 preempt_enable_no_resched();
169 /* Prints also some state that isn't saved in the pt_regs */
/*
 * Dump the register state in @regs plus live CPU state (segment
 * selectors read directly, FS/GS bases read via MSRs) for oops/debug
 * output.  @all presumably gates the extended control/debug-register
 * dump -- the lines reading cr0..cr4 are not visible in this fragment.
 */
170 void __show_regs(struct pt_regs *regs, int all)
172 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
173 unsigned long d0, d1, d2, d3, d6, d7;
174 unsigned int fsindex, gsindex;
175 unsigned int ds, cs, es;
178 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
179 printk_address(regs->ip, 1);
180 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
181 regs->sp, regs->flags);
182 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
183 regs->ax, regs->bx, regs->cx);
184 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
185 regs->dx, regs->si, regs->di);
186 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
187 regs->bp, regs->r8, regs->r9);
188 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
189 regs->r10, regs->r11, regs->r12);
190 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
191 regs->r13, regs->r14, regs->r15);
/* Segment selectors are read from the live CPU, not from pt_regs. */
193 asm("movl %%ds,%0" : "=r" (ds));
194 asm("movl %%cs,%0" : "=r" (cs));
195 asm("movl %%es,%0" : "=r" (es));
196 asm("mov %%fs,%0" : "=r" (fsindex));
197 asm("mov %%gs,%0" : "=r" (gsindex));
/* Segment bases live in MSRs on x86-64; KERNEL_GS_BASE holds the
 * inactive (shadow) GS base swapped in by SWAPGS. */
199 rdmsrl(MSR_FS_BASE, fs);
200 rdmsrl(MSR_GS_BASE, gs);
201 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
211 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
212 fs, fsindex, gs, gsindex, shadowgs);
213 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
215 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
221 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
225 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
/*
 * Xen replacement for the native load_gs_index(): user GS selector
 * changes must go through the hypervisor on a paravirtualized guest.
 */
228 void xen_load_gs_index(unsigned gs)
230 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
232 EXPORT_SYMBOL(xen_load_gs_index);
/*
 * Final per-task teardown.  A dead task should no longer own an LDT;
 * warn (and, in the full source, presumably clean up) if one remains.
 * NOTE(review): interior lines are missing from this fragment.
 */
234 void release_thread(struct task_struct *dead_task)
237 if (dead_task->mm->context.size) {
238 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
240 dead_task->mm->context.ldt,
241 dead_task->mm->context.size);
/*
 * Install a 32-bit TLS descriptor for @t at slot @tls with base @addr.
 * NOTE(review): the user_desc initializer and the fill_ldt/descriptor
 * write lines are missing from this fragment.
 */
247 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
249 struct user_desc ud = {
256 struct desc_struct *desc = t->thread.tls_array;
/* Return the base address stored in TLS slot @tls of @t. */
261 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
263 return get_desc_base(&t->thread.tls_array[tls]);
267 * This gets called before we allocate a new thread and copy
268 * the current task into it.
/* Hook run before copy_thread(); body not visible in this fragment. */
270 void prepare_to_copy(struct task_struct *tsk)
/*
 * Set up the new task @p for fork/clone: place its pt_regs at the top
 * of the fresh kernel stack, inherit the parent's segment selectors
 * and bases, duplicate the parent's I/O permission bitmap if present,
 * and honour CLONE_SETTLS.  Returns 0 on success or a negative errno
 * (some error paths are not visible in this fragment).
 */
275 int copy_thread(unsigned long clone_flags, unsigned long sp,
276 unsigned long unused,
277 struct task_struct *p, struct pt_regs *regs)
280 struct pt_regs *childregs;
281 struct task_struct *me = current;
/* Child's user register frame sits at the very top of its stack page(s). */
283 childregs = ((struct pt_regs *)
284 (THREAD_SIZE + task_stack_page(p))) - 1;
291 childregs->sp = (unsigned long)childregs;
293 p->thread.sp = (unsigned long) childregs;
294 p->thread.sp0 = (unsigned long) (childregs+1);
296 set_tsk_thread_flag(p, TIF_FORK);
299 p->thread.io_bitmap_ptr = NULL;
/* A zero selector means the thread uses an MSR base; in that case
 * inherit the parent's base value, otherwise the selector carries it. */
301 savesegment(gs, p->thread.gsindex);
302 p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
303 savesegment(fs, p->thread.fsindex);
304 p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
305 savesegment(es, p->thread.es);
306 savesegment(ds, p->thread.ds);
309 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
/* The I/O bitmap is per-thread; give the child its own copy. */
311 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
312 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
313 IO_BITMAP_BYTES, GFP_KERNEL);
314 if (!p->thread.io_bitmap_ptr) {
315 p->thread.io_bitmap_max = 0;
318 set_tsk_thread_flag(p, TIF_IO_BITMAP);
322 * Set a new TLS for the child thread?
324 if (clone_flags & CLONE_SETTLS) {
325 #ifdef CONFIG_IA32_EMULATION
/* 32-bit clones pass a user_desc pointer in %esi; 64-bit clones pass
 * the new FS base in %r8 (clone(2) ABI). */
326 if (test_thread_flag(TIF_IA32))
327 err = do_set_thread_area(p, -1,
328 (struct user_desc __user *)childregs->si, 0);
331 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
335 p->thread.iopl = current->thread.iopl;
/* On failure, release the duplicated I/O bitmap before returning. */
339 if (err && p->thread.io_bitmap_ptr) {
340 kfree(p->thread.io_bitmap_ptr);
341 p->thread.io_bitmap_max = 0;
/*
 * Common exec-time setup shared by 64-bit and ia32 start_thread():
 * load the data segments, point pt_regs at the new entry point/stack,
 * enable interrupts in user EFLAGS, and drop the old FPU state.
 * NOTE(review): the ip/sp/cs/ss assignment lines are missing here.
 */
348 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
349 unsigned long new_sp,
350 unsigned int _cs, unsigned int _ss, unsigned int _ds)
353 loadsegment(es, _ds);
354 loadsegment(ds, _ds);
360 regs->flags = X86_EFLAGS_IF;
362 * Free the old FP and other extended state
364 free_thread_xstate(current);
/* Exec-time entry for 64-bit binaries: user CS/DS, no extra data selector. */
368 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
370 start_thread_common(regs, new_ip, new_sp,
371 __USER_CS, __USER_DS, 0);
374 #ifdef CONFIG_IA32_EMULATION
/* Exec-time entry for compat (32-bit) binaries: 32-bit code/data selectors. */
375 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
377 start_thread_common(regs, new_ip, new_sp,
378 __USER32_CS, __USER32_DS, __USER32_DS);
383 * switch_to(x,y) should switch tasks from x to y.
385 * This could still be optimized:
386 * - fold all the options into a flag word and test it with a single test.
387 * - could test fs/gs bitsliced
389 * Kprobes not supported here. Set the probe on schedule instead.
390 * Function graph tracer not supported too.
/*
 * Xen-paravirtualized context switch: the kernel stack pointer, TLS
 * descriptors, iopl and I/O bitmap updates are batched into a single
 * hypervisor multicall instead of direct TSS/GDT writes, then segment
 * registers, FPU state, per-CPU current/stack and debug registers are
 * switched.  NOTE(review): several interior lines (loop headers, else
 * branches) are missing from this fragment.
 */
392 __notrace_funcgraph struct task_struct *
393 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
395 struct thread_struct *prev = &prev_p->thread;
396 struct thread_struct *next = &next_p->thread;
397 int cpu = smp_processor_id();
398 #ifndef CONFIG_X86_NO_TSS
399 struct tss_struct *tss = &per_cpu(init_tss, cpu);
/* Pre-3.0.2 Xen lacks the dedicated physdev hypercalls; fall back to
 * the compat physdev_op structure (see the #else branches below). */
402 #if CONFIG_XEN_COMPAT > 0x030002
403 struct physdev_set_iopl iopl_op;
404 struct physdev_set_iobitmap iobmp_op;
406 struct physdev_op _pdo[2], *pdo = _pdo;
407 #define iopl_op pdo->u.set_iopl
408 #define iobmp_op pdo->u.set_iobitmap
410 multicall_entry_t _mcl[8], *mcl = _mcl;
412 fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);
416 * This is load_sp0(tss, next) with a multicall.
418 mcl->op = __HYPERVISOR_stack_switch;
419 mcl->args[0] = __KERNEL_DS;
420 mcl->args[1] = next->sp0;
424 * Load the per-thread Thread-Local Storage descriptor.
425 * This is load_TLS(next, cpu) with multicalls.
/* Only push a descriptor update when the TLS entry actually changed. */
428 if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
429 next->tls_array[i].b != prev->tls_array[i].b)) { \
430 mcl->op = __HYPERVISOR_update_descriptor; \
431 mcl->args[0] = arbitrary_virt_to_machine( \
432 &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
433 mcl->args[1] = *(u64 *)&next->tls_array[i]; \
440 if (unlikely(prev->iopl != next->iopl)) {
441 iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
442 #if CONFIG_XEN_COMPAT > 0x030002
443 mcl->op = __HYPERVISOR_physdev_op;
444 mcl->args[0] = PHYSDEVOP_set_iopl;
445 mcl->args[1] = (unsigned long)&iopl_op;
447 mcl->op = __HYPERVISOR_physdev_op_compat;
448 pdo->cmd = PHYSDEVOP_set_iopl;
449 mcl->args[0] = (unsigned long)pdo++;
454 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
455 set_xen_guest_handle(iobmp_op.bitmap,
456 (char *)next->io_bitmap_ptr);
457 iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
458 #if CONFIG_XEN_COMPAT > 0x030002
459 mcl->op = __HYPERVISOR_physdev_op;
460 mcl->args[0] = PHYSDEVOP_set_iobitmap;
461 mcl->args[1] = (unsigned long)&iobmp_op;
463 mcl->op = __HYPERVISOR_physdev_op_compat;
464 pdo->cmd = PHYSDEVOP_set_iobitmap;
465 mcl->args[0] = (unsigned long)pdo++;
/* Catch overflow of the fixed-size batch arrays before issuing. */
470 #if CONFIG_XEN_COMPAT <= 0x030002
471 BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
473 BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
474 if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
479 * This won't pick up thread selector changes, but I guess that is ok.
481 if (unlikely(next->es))
482 loadsegment(es, next->es);
484 if (unlikely(next->ds))
485 loadsegment(ds, next->ds);
488 * Leave lazy mode, flushing any hypercalls made here.
489 * This must be done before restoring TLS segments so
490 * the GDT and LDT are properly updated, and must be
491 * done before math_state_restore, so the TS bit is up
494 arch_end_context_switch(next_p);
499 * Segment register != 0 always requires a reload. Also
500 * reload when it has changed. When prev process used 64bit
501 * base always reload to avoid an information leak.
503 if (unlikely(next->fsindex))
504 loadsegment(fs, next->fsindex);
507 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));
509 if (unlikely(next->gsindex))
510 load_gs_index(next->gsindex);
513 WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
515 switch_fpu_finish(next_p, fpu);
518 * Switch the PDA context.
520 percpu_write(current_task, next_p);
522 percpu_write(kernel_stack,
523 (unsigned long)task_stack_page(next_p) +
524 THREAD_SIZE - KERNEL_STACK_OFFSET);
527 * Now maybe reload the debug registers
529 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
530 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
531 __switch_to_xtra(prev_p, next_p);
/*
 * Configure the current task as a native 64-bit process at exec time:
 * clear the ia32 thread flag and the mm's compat marker.
 */
536 void set_personality_64bit(void)
538 /* inherit personality from parent */
540 /* Make sure to be in 64bit mode */
541 clear_thread_flag(TIF_IA32);
543 /* Ensure the corresponding mm is not marked. */
545 current->mm->context.ia32_compat = 0;
547 /* TBD: overwrites user setup. Should have two bits.
548 But 64bit processes have always behaved this way,
549 so it's not too bad. The main problem is just that
550 32bit childs are affected again. */
551 current->personality &= ~READ_IMPLIES_EXEC;
/*
 * Configure the current task as a 32-bit compat process at exec time:
 * set the ia32 thread flag, mark the mm as compat, and arrange for the
 * first return to user space to take the compat path (TS_COMPAT).
 */
554 void set_personality_ia32(void)
556 /* inherit personality from parent */
558 /* Make sure to be in 32bit mode */
559 set_thread_flag(TIF_IA32);
560 current->personality |= force_personality32;
562 /* Mark the associated mm as containing 32-bit tasks. */
564 current->mm->context.ia32_compat = 1;
566 /* Prepare the first "return" to user space */
567 current_thread_info()->status |= TS_COMPAT;
/*
 * Walk a sleeping task's saved frame pointers to find the first return
 * address outside the scheduler ("where channel"), bounded to 16
 * frames.  Returns 0 for running/current/invalid tasks.
 * NOTE(review): the loop header and ip extraction lines are missing
 * from this fragment.
 */
570 unsigned long get_wchan(struct task_struct *p)
576 if (!p || p == current || p->state == TASK_RUNNING)
578 stack = (unsigned long)task_stack_page(p);
/* Sanity-check the saved stack pointer lies within the task's stack. */
579 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
581 fp = *(u64 *)(p->thread.sp);
583 if (fp < (unsigned long)stack ||
584 fp >= (unsigned long)stack+THREAD_SIZE)
587 if (!in_sched_functions(ip))
590 } while (count++ < 16);
/*
 * Backend for arch_prctl(2).  ARCH_SET_FS / ARCH_SET_GS install a new
 * segment base for @task: small (<4GiB) bases go through a GDT TLS
 * slot, large ones through HYPERVISOR_set_segment_base (Xen's
 * replacement for the base MSR writes).  ARCH_GET_FS / ARCH_GET_GS
 * copy the current base to user memory at @addr.
 * NOTE(review): the switch statement and its case labels are missing
 * from this fragment; the code below is the bodies of those cases.
 */
594 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
/* Only touch live CPU state when operating on the current task. */
597 int doit = task == current;
602 if (addr >= TASK_SIZE_OF(task))
605 /* handle small bases via the GDT because that's faster to
607 if (addr <= 0xffffffff) {
608 set_32bit_tls(task, GS_TLS, addr);
610 load_TLS(&task->thread, cpu);
611 load_gs_index(GS_TLS_SEL);
613 task->thread.gsindex = GS_TLS_SEL;
616 task->thread.gsindex = 0;
617 task->thread.gs = addr;
620 ret = HYPERVISOR_set_segment_base(
621 SEGBASE_GS_USER, addr);
627 /* Not strictly needed for fs, but do it for symmetry
629 if (addr >= TASK_SIZE_OF(task))
632 /* handle small bases via the GDT because that's faster to
634 if (addr <= 0xffffffff) {
635 set_32bit_tls(task, FS_TLS, addr);
637 load_TLS(&task->thread, cpu);
638 loadsegment(fs, FS_TLS_SEL);
640 task->thread.fsindex = FS_TLS_SEL;
643 task->thread.fsindex = 0;
644 task->thread.fs = addr;
646 /* set the selector to 0 to not confuse
649 ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
/* ARCH_GET_FS: base comes from the TLS slot, the MSR, or the saved
 * thread value, depending on how it was set. */
657 if (task->thread.fsindex == FS_TLS_SEL)
658 base = read_32bit_tls(task, FS_TLS);
660 rdmsrl(MSR_FS_BASE, base);
662 base = task->thread.fs;
663 ret = put_user(base, (unsigned long __user *)addr);
/* ARCH_GET_GS: analogous, but the live base sits in KERNEL_GS_BASE
 * while in the kernel (swapped by SWAPGS on entry). */
669 if (task->thread.gsindex == GS_TLS_SEL)
670 base = read_32bit_tls(task, GS_TLS);
672 savesegment(gs, gsindex);
674 rdmsrl(MSR_KERNEL_GS_BASE, base);
676 base = task->thread.gs;
678 base = task->thread.gs;
679 ret = put_user(base, (unsigned long __user *)addr);
/* arch_prctl(2) syscall entry: delegate to do_arch_prctl() for current. */
691 long sys_arch_prctl(int code, unsigned long addr)
693 return do_arch_prctl(current, code, addr);