tick: Document TICK_ONESHOT config option

[linux-flexiantxendom0-3.2.10.git] / kernel / fork.c
diff --git a/kernel/fork.c b/kernel/fork.c

index 1f84099..b9372a0 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
  #include <linux/swap.h>
  #include <linux/syscalls.h>
  #include <linux/jiffies.h>
-#include <linux/tracehook.h>
  #include <linux/futex.h>
  #include <linux/compat.h>
  #include <linux/kthread.h>
@@ -59,7 +58,6 @@
  #include <linux/taskstats_kern.h>
  #include <linux/random.h>
  #include <linux/tty.h>
-#include <linux/proc_fs.h>
  #include <linux/blkdev.h>
  #include <linux/fs_struct.h>
  #include <linux/magic.h>
@@ -68,6 +66,7 @@
  #include <linux/user-return-notifier.h>
  #include <linux/oom.h>
  #include <linux/khugepaged.h>
+#include <linux/signalfd.h>
  
  #include <asm/pgtable.h>
  #include <asm/pgalloc.h>
@@ -78,11 +77,14 @@
  
  #include <trace/events/sched.h>
  
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
  /*
   * Protected counters by write_lock_irq(&tasklist_lock)
   */
  unsigned long total_forks;     /* Handle normal Linux uptimes. */
-int nr_threads;                /* The idle threads do not count.. */
+int nr_threads;                        /* The idle threads do not count.. */
  
  int max_threads;               /* tunable limit on nr_threads */
  
@@ -164,7 +166,6 @@ static void account_kernel_stack(struct thread_info *ti, int account)
  
  void free_task(struct task_struct *tsk)
  {
-       prop_local_destroy_single(&tsk->dirties);
         account_kernel_stack(tsk->stack, -1);
         free_thread_info(tsk->stack);
         rt_mutex_debug_task_free(tsk);
@@ -192,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
         WARN_ON(atomic_read(&tsk->usage));
         WARN_ON(tsk == current);
  
+       security_task_free(tsk);
         exit_creds(tsk);
         delayacct_tsk_free(tsk);
         put_signal_struct(tsk->signal);
@@ -234,7 +236,7 @@ void __init fork_init(unsigned long mempages)
         /*
          * we need to allow at least 20 threads to boot a system
          */
-       if(max_threads < 20)
+       if (max_threads < 20)
                 max_threads = 20;
  
         init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
@@ -270,16 +272,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
                 return NULL;
         }
  
-       err = arch_dup_task_struct(tsk, orig);
+       err = arch_dup_task_struct(tsk, orig);
         if (err)
                 goto out;
  
         tsk->stack = ti;
  
-       err = prop_local_init_single(&tsk->dirties);
-       if (err)
-               goto out;
-
         setup_thread_stack(tsk, orig);
         clear_user_return_notifier(tsk);
         clear_tsk_need_resched(tsk);
@@ -290,9 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
         tsk->stack_canary = get_random_int();
  #endif
  
-       /* One for us, one for whoever does the "release_task()" (usually parent) */
-       atomic_set(&tsk->usage,2);
-       atomic_set(&tsk->fs_excl, 0);
+       /*
+        * One for us, one for whoever does the "release_task()" (usually
+        * parent)
+        */
+       atomic_set(&tsk->usage, 2);
  #ifdef CONFIG_BLK_DEV_IO_TRACE
         tsk->btrace_seq = 0;
  #endif
@@ -356,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                 charge = 0;
                 if (mpnt->vm_flags & VM_ACCOUNT) {
                         unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
-                       if (security_vm_enough_memory(len))
+                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                 goto fail_nomem;
                         charge = len;
                 }
@@ -440,7 +440,7 @@ fail_nomem:
         goto out;
  }
  
-static inline int mm_alloc_pgd(struct mm_struct * mm)
+static inline int mm_alloc_pgd(struct mm_struct *mm)
  {
         mm->pgd = pgd_alloc(mm);
         if (unlikely(!mm->pgd))
@@ -448,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
         return 0;
  }
  
-static inline void mm_free_pgd(struct mm_struct * mm)
+static inline void mm_free_pgd(struct mm_struct *mm)
  {
         pgd_free(mm, mm->pgd);
  }
@@ -485,21 +485,7 @@ static void mm_init_aio(struct mm_struct *mm)
  #endif
  }
  
-int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-#ifdef CONFIG_CPUMASK_OFFSTACK
-       if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
-               return -ENOMEM;
-
-       if (oldmm)
-               cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
-       else
-               memset(mm_cpumask(mm), 0, cpumask_size());
-#endif
-       return 0;
-}
-
-static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
  {
         atomic_set(&mm->mm_users, 1);
         atomic_set(&mm->mm_count, 1);
@@ -515,7 +501,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
         mm->cached_hole_size = ~0UL;
         mm_init_aio(mm);
         mm_init_owner(mm, p);
-       atomic_set(&mm->oom_disable_count, 0);
  
         if (likely(!mm_alloc_pgd(mm))) {
                 mm->def_flags = 0;
@@ -527,29 +512,37 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
         return NULL;
  }
  
+static void check_mm(struct mm_struct *mm)
+{
+       int i;  /* walks mm->rss_stat.count[] */
+
+       for (i = 0; i < NR_MM_COUNTERS; i++) {
+               long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+               if (unlikely(x))        /* report every nonzero counter */
+                       printk(KERN_ALERT "BUG: Bad rss-counter state "
+                                         "mm:%p idx:%d val:%ld\n", mm, i, x);
+       }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       VM_BUG_ON(mm->pmd_huge_pte);    /* must be unset when mm is checked */
+#endif
+}
+
  /*
   * Allocate and initialize an mm_struct.
   */
-struct mm_struct * mm_alloc(void)
+struct mm_struct *mm_alloc(void)
  {
-       struct mm_struct * mm;
+       struct mm_struct *mm;
  
         mm = allocate_mm();
         if (!mm)
                 return NULL;
  
         memset(mm, 0, sizeof(*mm));
-       mm = mm_init(mm, current);
-       if (!mm)
-               return NULL;
-
-       if (mm_init_cpumask(mm, NULL)) {
-               mm_free_pgd(mm);
-               free_mm(mm);
-               return NULL;
-       }
-
-       return mm;
+       mm_init_cpumask(mm);
+       return mm_init(mm, current);
  }
  
  /*
@@ -560,13 +553,10 @@ struct mm_struct * mm_alloc(void)
  void __mmdrop(struct mm_struct *mm)
  {
         BUG_ON(mm == &init_mm);
-       free_cpumask_var(mm->cpu_vm_mask_var);
         mm_free_pgd(mm);
         destroy_context(mm);
         mmu_notifier_mm_destroy(mm);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       VM_BUG_ON(mm->pmd_huge_pte);
-#endif
+       check_mm(mm);
         free_mm(mm);
  }
  EXPORT_SYMBOL_GPL(__mmdrop);
@@ -597,6 +587,57 @@ void mmput(struct mm_struct *mm)
  }
  EXPORT_SYMBOL_GPL(mmput);
  
+/*
+ * We added or removed a vma mapping the executable; num_exe_file_vmas tracks
+ * how many exist.  Such vmas are only mapped during exec, never by mmap().
+ * Callers must hold down_write() on the mm's mmap_sem for both helpers.
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+       mm->num_exe_file_vmas++;        /* one more vma maps the executable */
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+       mm->num_exe_file_vmas--;        /* one fewer vma maps the executable */
+       if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
+               fput(mm->exe_file);     /* last one gone: drop the file ref */
+               mm->exe_file = NULL;    /* and forget the pointer */
+       }
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+       if (new_exe_file)
+               get_file(new_exe_file); /* take the new reference first */
+       if (mm->exe_file)
+               fput(mm->exe_file);     /* then release the previous file */
+       mm->exe_file = new_exe_file;    /* may be NULL: no exe file recorded */
+       mm->num_exe_file_vmas = 0;      /* vma count restarts from zero */
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+       struct file *exe_file;
+
+       /* Hold mmap_sem for reading so a racing removal of VM_EXECUTABLE vmas
+        * cannot drop exe_file before we have taken our own reference. */
+       down_read(&mm->mmap_sem);
+       exe_file = mm->exe_file;
+       if (exe_file)
+               get_file(exe_file);
+       up_read(&mm->mmap_sem);
+       return exe_file;        /* NULL if unset, else a referenced file */
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+       /* Writing newmm->exe_file needs no lock: this is called during fork,
+        * when the task is not yet in /proc.  The reference comes from oldmm. */
+       newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
  /**
   * get_task_mm - acquire a reference to the task's mm
   *
@@ -623,6 +664,58 @@ struct mm_struct *get_task_mm(struct task_struct *task)
  }
  EXPORT_SYMBOL_GPL(get_task_mm);
  
+struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
+{
+       struct mm_struct *mm;
+       int err;
+
+       err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
+       if (err)
+               return ERR_PTR(err);    /* mutex not taken; pass error on */
+
+       mm = get_task_mm(task);
+       if (mm && mm != current->mm &&  /* caller's own mm skips the check */
+                       !ptrace_may_access(task, mode)) {
+               mmput(mm);              /* denied: drop the reference again */
+               mm = ERR_PTR(-EACCES);
+       }
+       mutex_unlock(&task->signal->cred_guard_mutex);
+
+       return mm;      /* mm from get_task_mm() (may be NULL) or ERR_PTR() */
+}
+
+static void complete_vfork_done(struct task_struct *tsk)
+{
+       struct completion *vfork;
+
+       task_lock(tsk);         /* vfork_done is read and cleared under it */
+       vfork = tsk->vfork_done;
+       if (likely(vfork)) {
+               tsk->vfork_done = NULL; /* signal the completion only once */
+               complete(vfork);        /* wake the waiter on this completion */
+       }
+       task_unlock(tsk);
+}
+
+static int wait_for_vfork_done(struct task_struct *child,
+                               struct completion *vfork)
+{
+       int killed;     /* nonzero when the killable wait did not complete */
+
+       freezer_do_not_count();
+       killed = wait_for_completion_killable(vfork);
+       freezer_count();
+
+       if (killed) {
+               task_lock(child);
+               child->vfork_done = NULL;  /* child must not complete() it */
+               task_unlock(child);
+       }
+
+       put_task_struct(child); /* pairs with get_task_struct() in do_fork() */
+       return killed;
+}
+
  /* Please note the differences between mmput and mm_release.
   * mmput is called whenever we stop holding onto a mm_struct,
   * error success whatever.
@@ -638,8 +731,6 @@ EXPORT_SYMBOL_GPL(get_task_mm);
   */
  void mm_release(struct task_struct *tsk, struct mm_struct *mm)
  {
-       struct completion *vfork_done = tsk->vfork_done;
-
         /* Get rid of any futexes when releasing the mm */
  #ifdef CONFIG_FUTEX
         if (unlikely(tsk->robust_list)) {
@@ -659,17 +750,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
         /* Get rid of any cached register state */
         deactivate_mm(tsk, mm);
  
-       /* notify parent sleeping on vfork() */
-       if (vfork_done) {
-               tsk->vfork_done = NULL;
-               complete(vfork_done);
-       }
+       if (tsk->vfork_done)
+               complete_vfork_done(tsk);
  
         /*
          * If we're exiting normally, clear a user-space tid field if
          * requested.  We leave this alone when dying by signal, to leave
          * the value intact in a core dump, and to save the unnecessary
-        * trouble otherwise.  Userland only wants this done for a sys_exit.
+        * trouble; e.g. a killed vfork parent shouldn't touch this mm.
+        * Userland only wants this done for a sys_exit.
          */
         if (tsk->clear_child_tid) {
                 if (!(tsk->flags & PF_SIGNALED) &&
@@ -703,6 +792,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
                 goto fail_nomem;
  
         memcpy(mm, oldmm, sizeof(*mm));
+       mm_init_cpumask(mm);
  
         /* Initializing for Swap token stuff */
         mm->token_priority = 0;
@@ -715,9 +805,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
         if (!mm_init(mm, tsk))
                 goto fail_nomem;
  
-       if (mm_init_cpumask(mm, oldmm))
-               goto fail_nocpumask;
-
         if (init_new_context(tsk, mm))
                 goto fail_nocontext;
  
@@ -744,9 +831,6 @@ fail_nomem:
         return NULL;
  
  fail_nocontext:
-       free_cpumask_var(mm->cpu_vm_mask_var);
-
-fail_nocpumask:
         /*
          * If init_new_context() failed, we cannot use mmput() to free the mm
          * because it calls destroy_context()
@@ -756,9 +840,9 @@ fail_nocpumask:
         return NULL;
  }
  
-static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
  {
-       struct mm_struct * mm, *oldmm;
+       struct mm_struct *mm, *oldmm;
         int retval;
  
         tsk->min_flt = tsk->maj_flt = 0;
@@ -794,8 +878,6 @@ good_mm:
         /* Initializing for Swap token stuff */
         mm->token_priority = 0;
         mm->last_interval = 0;
-       if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-               atomic_inc(&mm->oom_disable_count);
  
         tsk->mm = mm;
         tsk->active_mm = mm;
@@ -825,7 +907,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
         return 0;
  }
  
-static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
  {
         struct files_struct *oldf, *newf;
         int error = 0;
@@ -856,6 +938,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
  {
  #ifdef CONFIG_BLOCK
         struct io_context *ioc = current->io_context;
+       struct io_context *new_ioc;
  
         if (!ioc)
                 return 0;
@@ -867,11 +950,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
                 if (unlikely(!tsk->io_context))
                         return -ENOMEM;
         } else if (ioprio_valid(ioc->ioprio)) {
-               tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
-               if (unlikely(!tsk->io_context))
+               new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
+               if (unlikely(!new_ioc))
                         return -ENOMEM;
  
-               tsk->io_context->ioprio = ioc->ioprio;
+               new_ioc->ioprio = ioc->ioprio;
+               put_io_context(new_ioc);
         }
  #endif
         return 0;
@@ -896,8 +980,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
  
  void __cleanup_sighand(struct sighand_struct *sighand)
  {
-       if (atomic_dec_and_test(&sighand->count))
+       if (atomic_dec_and_test(&sighand->count)) {
+               signalfd_cleanup(sighand);
                 kmem_cache_free(sighand_cachep, sighand);
+       }
  }
  
  
@@ -958,13 +1044,16 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
         sched_autogroup_fork(sig);
  
  #ifdef CONFIG_CGROUPS
-       init_rwsem(&sig->threadgroup_fork_lock);
+       init_rwsem(&sig->group_rwsem);
  #endif
  
         sig->oom_adj = current->signal->oom_adj;
         sig->oom_score_adj = current->signal->oom_score_adj;
         sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  
+       sig->has_child_subreaper = current->signal->has_child_subreaper ||
+                                  current->signal->is_child_subreaper;
+
         mutex_init(&sig->cred_guard_mutex);
  
         return 0;
@@ -976,9 +1065,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
  
         new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
         new_flags |= PF_FORKNOEXEC;
-       new_flags |= PF_STARTING;
         p->flags = new_flags;
-       clear_freeze_flag(p);
  }
  
  SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -992,7 +1079,7 @@ static void rt_mutex_init_task(struct task_struct *p)
  {
         raw_spin_lock_init(&p->pi_lock);
  #ifdef CONFIG_RT_MUTEXES
-       plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
+       plist_head_init(&p->pi_waiters);
         p->pi_blocked_on = NULL;
  #endif
  }
@@ -1009,8 +1096,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
   */
  static void posix_cpu_timers_init(struct task_struct *tsk)
  {
-       tsk->cputime_expires.prof_exp = cputime_zero;
-       tsk->cputime_expires.virt_exp = cputime_zero;
+       tsk->cputime_expires.prof_exp = 0;
+       tsk->cputime_expires.virt_exp = 0;
         tsk->cputime_expires.sched_exp = 0;
         INIT_LIST_HEAD(&tsk->cpu_timers[0]);
         INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1089,6 +1176,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                     p->real_cred->user != INIT_USER)
                         goto bad_fork_free;
         }
+       current->flags &= ~PF_NPROC_EXCEEDED;
  
         retval = copy_creds(p, clone_flags);
         if (retval < 0)
@@ -1117,14 +1205,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  
         init_sigpending(&p->pending);
  
-       p->utime = cputime_zero;
-       p->stime = cputime_zero;
-       p->gtime = cputime_zero;
-       p->utimescaled = cputime_zero;
-       p->stimescaled = cputime_zero;
+       p->utime = p->stime = p->gtime = 0;
+       p->utimescaled = p->stimescaled = 0;
  #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-       p->prev_utime = cputime_zero;
-       p->prev_stime = cputime_zero;
+       p->prev_utime = p->prev_stime = 0;
  #endif
  #if defined(SPLIT_RSS_COUNTING)
         memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1143,17 +1227,22 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         p->io_context = NULL;
         p->audit_context = NULL;
         if (clone_flags & CLONE_THREAD)
-               threadgroup_fork_read_lock(current);
+               threadgroup_change_begin(current);
         cgroup_fork(p);
  #ifdef CONFIG_NUMA
         p->mempolicy = mpol_dup(p->mempolicy);
-       if (IS_ERR(p->mempolicy)) {
-               retval = PTR_ERR(p->mempolicy);
-               p->mempolicy = NULL;
-               goto bad_fork_cleanup_cgroup;
-       }
+       if (IS_ERR(p->mempolicy)) {
+               retval = PTR_ERR(p->mempolicy);
+               p->mempolicy = NULL;
+               goto bad_fork_cleanup_cgroup;
+       }
         mpol_fix_fork_child_flag(p);
  #endif
+#ifdef CONFIG_CPUSETS
+       p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
+       p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+       seqcount_init(&p->mems_allowed_seq);
+#endif
  #ifdef CONFIG_TRACE_IRQFLAGS
         p->irq_events = 0;
  #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -1193,25 +1282,33 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         retval = perf_event_init_task(p);
         if (retval)
                 goto bad_fork_cleanup_policy;
-
-       if ((retval = audit_alloc(p)))
+       retval = audit_alloc(p);
+       if (retval)
                 goto bad_fork_cleanup_policy;
         /* copy all the process information */
-       if ((retval = copy_semundo(clone_flags, p)))
+       retval = copy_semundo(clone_flags, p);
+       if (retval)
                 goto bad_fork_cleanup_audit;
-       if ((retval = copy_files(clone_flags, p)))
+       retval = copy_files(clone_flags, p);
+       if (retval)
                 goto bad_fork_cleanup_semundo;
-       if ((retval = copy_fs(clone_flags, p)))
+       retval = copy_fs(clone_flags, p);
+       if (retval)
                 goto bad_fork_cleanup_files;
-       if ((retval = copy_sighand(clone_flags, p)))
+       retval = copy_sighand(clone_flags, p);
+       if (retval)
                 goto bad_fork_cleanup_fs;
-       if ((retval = copy_signal(clone_flags, p)))
+       retval = copy_signal(clone_flags, p);
+       if (retval)
                 goto bad_fork_cleanup_sighand;
-       if ((retval = copy_mm(clone_flags, p)))
+       retval = copy_mm(clone_flags, p);
+       if (retval)
                 goto bad_fork_cleanup_signal;
-       if ((retval = copy_namespaces(clone_flags, p)))
+       retval = copy_namespaces(clone_flags, p);
+       if (retval)
                 goto bad_fork_cleanup_mm;
-       if ((retval = copy_io(clone_flags, p)))
+       retval = copy_io(clone_flags, p);
+       if (retval)
                 goto bad_fork_cleanup_namespaces;
         retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
         if (retval)
@@ -1233,7 +1330,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         /*
          * Clear TID on mm_release()?
          */
-       p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+       p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
  #ifdef CONFIG_BLOCK
         p->plug = NULL;
  #endif
@@ -1263,10 +1360,20 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         clear_all_latency_tracing(p);
  
         /* ok, now we should be set up.. */
-       p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
+       if (clone_flags & CLONE_THREAD)
+               p->exit_signal = -1;
+       else if (clone_flags & CLONE_PARENT)
+               p->exit_signal = current->group_leader->exit_signal;
+       else
+               p->exit_signal = (clone_flags & CSIGNAL);
+
         p->pdeath_signal = 0;
         p->exit_state = 0;
  
+       p->nr_dirtied = 0;
+       p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+       p->dirty_paused_when = 0;
+
         /*
          * Ok, make it visible to the rest of the system.
          * We dont wake it up yet.
@@ -1301,7 +1408,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
          * it's process group.
          * A fatal signal pending means that current will exit, so the new
          * thread can't slip out of an OOM kill (or normal SIGKILL).
-        */
+       */
         recalc_sigpending();
         if (signal_pending(current)) {
                 spin_unlock(&current->sighand->siglock);
@@ -1319,7 +1426,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         }
  
         if (likely(p->pid)) {
-               tracehook_finish_clone(p, clone_flags, trace);
+               ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
  
                 if (thread_group_leader(p)) {
                         if (is_child_reaper(pid))
@@ -1343,8 +1450,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         proc_fork_connector(p);
         cgroup_post_fork(p);
         if (clone_flags & CLONE_THREAD)
-               threadgroup_fork_read_unlock(current);
+               threadgroup_change_end(current);
         perf_event_fork(p);
+
+       trace_task_newtask(p, clone_flags);
+
         return p;
  
  bad_fork_free_pid:
@@ -1356,13 +1466,8 @@ bad_fork_cleanup_io:
  bad_fork_cleanup_namespaces:
         exit_task_namespaces(p);
  bad_fork_cleanup_mm:
-       if (p->mm) {
-               task_lock(p);
-               if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-                       atomic_dec(&p->mm->oom_disable_count);
-               task_unlock(p);
+       if (p->mm)
                 mmput(p->mm);
-       }
  bad_fork_cleanup_signal:
         if (!(clone_flags & CLONE_THREAD))
                 free_signal_struct(p->signal);
@@ -1383,7 +1488,7 @@ bad_fork_cleanup_policy:
  bad_fork_cleanup_cgroup:
  #endif
         if (clone_flags & CLONE_THREAD)
-               threadgroup_fork_read_unlock(current);
+               threadgroup_change_end(current);
         cgroup_exit(p, cgroup_callbacks_done);
         delayacct_tsk_free(p);
         module_put(task_thread_info(p)->exec_domain->module);
@@ -1460,10 +1565,22 @@ long do_fork(unsigned long clone_flags,
         }
  
         /*
-        * When called from kernel_thread, don't do user tracing stuff.
+        * Determine whether and which event to report to ptracer.  When
+        * called from kernel_thread or CLONE_UNTRACED is explicitly
+        * requested, no event is reported; otherwise, report if the event
+        * for the type of forking is enabled.
          */
-       if (likely(user_mode(regs)))
-               trace = tracehook_prepare_clone(clone_flags);
+       if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
+               if (clone_flags & CLONE_VFORK)
+                       trace = PTRACE_EVENT_VFORK;
+               else if ((clone_flags & CSIGNAL) != SIGCHLD)
+                       trace = PTRACE_EVENT_CLONE;
+               else
+                       trace = PTRACE_EVENT_FORK;
+
+               if (likely(!ptrace_event_enabled(current, trace)))
+                       trace = 0;
+       }
  
         p = copy_process(clone_flags, stack_start, regs, stack_size,
                          child_tidptr, NULL, trace);
@@ -1484,29 +1601,18 @@ long do_fork(unsigned long clone_flags,
                 if (clone_flags & CLONE_VFORK) {
                         p->vfork_done = &vfork;
                         init_completion(&vfork);
+                       get_task_struct(p);
                 }
  
-               audit_finish_fork(p);
-               tracehook_report_clone(regs, clone_flags, nr, p);
-
-               /*
-                * We set PF_STARTING at creation in case tracing wants to
-                * use this to distinguish a fully live task from one that
-                * hasn't gotten to tracehook_report_clone() yet.  Now we
-                * clear it and set the child going.
-                */
-               p->flags &= ~PF_STARTING;
-
                 wake_up_new_task(p);
  
-               tracehook_report_clone_complete(trace, regs,
-                                               clone_flags, nr, p);
+               /* forking complete and child started to run, tell ptracer */
+               if (unlikely(trace))
+                       ptrace_event(trace, nr);
  
                 if (clone_flags & CLONE_VFORK) {
-                       freezer_do_not_count();
-                       wait_for_completion(&vfork);
-                       freezer_count();
-                       tracehook_report_vfork_done(p, nr);
+                       if (!wait_for_vfork_done(p, &vfork))
+                               ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
                 }
         } else {
                 nr = PTR_ERR(p);
@@ -1541,11 +1647,19 @@ void __init proc_caches_init(void)
         fs_cachep = kmem_cache_create("fs_cache",
                         sizeof(struct fs_struct), 0,
                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+       /*
+        * FIXME! The "sizeof(struct mm_struct)" currently includes the
+        * whole struct cpumask for the OFFSTACK case. We could change
+        * this to *only* allocate as much of it as required by the
+        * maximum number of CPUs we can ever have.  The cpumask_allocation
+        * is at the end of the structure, exactly for that reason.
+        */
         mm_cachep = kmem_cache_create("mm_struct",
                         sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
         vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
         mmap_init();
+       nsproxy_cache_init();
  }
  
  /*
@@ -1642,12 +1756,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
          */
         if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                 do_sysvsem = 1;
-       if ((err = unshare_fs(unshare_flags, &new_fs)))
+       err = unshare_fs(unshare_flags, &new_fs);
+       if (err)
                 goto bad_unshare_out;
-       if ((err = unshare_fd(unshare_flags, &new_fd)))
+       err = unshare_fd(unshare_flags, &new_fd);
+       if (err)
                 goto bad_unshare_cleanup_fs;
-       if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
-                       new_fs)))
+       err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+       if (err)
                 goto bad_unshare_cleanup_fd;
  
         if (new_fs || new_fd || do_sysvsem || new_nsproxy) {