PM / Hibernate: fix the number of pages used for hibernate/thaw buffering
[linux-flexiantxendom0-3.2.10.git] / kernel / fork.c
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a1..b9372a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/oom.h>
 #include <linux/khugepaged.h>
+#include <linux/signalfd.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -76,6 +77,9 @@
 
 #include <trace/events/sched.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
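
Defining CREATE_TRACE_POINTS before including a trace header expands its
TRACE_EVENT() declarations into real tracepoint definitions; this must be
done in exactly one translation unit, and fork.c is that unit for the new
task events. An abridged sketch of what include/trace/events/task.h
declares for the task_newtask event fired later in this patch (the real
header also records oom_score_adj):

	TRACE_EVENT(task_newtask,
		TP_PROTO(struct task_struct *task, unsigned long clone_flags),
		TP_ARGS(task, clone_flags),
		TP_STRUCT__entry(
			__field(pid_t, pid)
			__array(char, comm, TASK_COMM_LEN)
			__field(unsigned long, clone_flags)
		),
		TP_fast_assign(
			__entry->pid = task->pid;
			memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
			__entry->clone_flags = clone_flags;
		),
		TP_printk("pid=%d comm=%s clone_flags=%lx",
			  __entry->pid, __entry->comm, __entry->clone_flags)
	);
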
@@ -189,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);
 
+       security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
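
security_task_free() gives security modules a hook at the point the last
reference to a task_struct is dropped, so per-task LSM state cannot leak.
In this era it is a thin dispatch through security_ops; a minimal sketch
of the security/security.c side (Yama, which tracks ptrace relationships
per task, was the first in-tree user):

	void security_task_free(struct task_struct *task)
	{
		security_ops->task_free(task);
	}
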
@@ -351,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                charge = 0;
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
-                       if (security_vm_enough_memory(len))
+                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        charge = len;
                }
@@ -507,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
        return NULL;
 }
 
+static void check_mm(struct mm_struct *mm)
+{
+       int i;
+
+       for (i = 0; i < NR_MM_COUNTERS; i++) {
+               long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+               if (unlikely(x))
+                       printk(KERN_ALERT "BUG: Bad rss-counter state "
+                                         "mm:%p idx:%d val:%ld\n", mm, i, x);
+       }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       VM_BUG_ON(mm->pmd_huge_pte);
+#endif
+}
+
 /*
  * Allocate and initialize an mm_struct.
  */
@@ -534,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_mm_destroy(mm);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       VM_BUG_ON(mm->pmd_huge_pte);
-#endif
+       check_mm(mm);
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
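
check_mm() folds the old transparent-hugepage assertion into a broader
teardown audit: by the time the last mm reference is dropped, every
per-mm RSS counter (MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS) must be back
to zero, so a nonzero value flags an unbalanced update somewhere in mm/.
For reference, a sketch of the counter helper the fault paths use (the
plain variant from this era's include/linux/mm.h; with SPLIT_RSS_COUNTING
the updates are cached per task and folded back in lazily, which is why
the counters are only reliably exact at teardown):

	static inline void add_mm_counter(struct mm_struct *mm, int member,
					  long value)
	{
		atomic_long_add(value, &mm->rss_stat.count[member]);
	}

A fault that maps an anonymous page does add_mm_counter(mm, MM_ANONPAGES, 1);
the unmap path must undo it, or check_mm() complains at exit.
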
@@ -644,6 +664,58 @@ struct mm_struct *get_task_mm(struct task_struct *task)
 }
 EXPORT_SYMBOL_GPL(get_task_mm);
 
+struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
+{
+       struct mm_struct *mm;
+       int err;
+
+       err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
+       if (err)
+               return ERR_PTR(err);
+
+       mm = get_task_mm(task);
+       if (mm && mm != current->mm &&
+                       !ptrace_may_access(task, mode)) {
+               mmput(mm);
+               mm = ERR_PTR(-EACCES);
+       }
+       mutex_unlock(&task->signal->cred_guard_mutex);
+
+       return mm;
+}
+
+static void complete_vfork_done(struct task_struct *tsk)
+{
+       struct completion *vfork;
+
+       task_lock(tsk);
+       vfork = tsk->vfork_done;
+       if (likely(vfork)) {
+               tsk->vfork_done = NULL;
+               complete(vfork);
+       }
+       task_unlock(tsk);
+}
+
+static int wait_for_vfork_done(struct task_struct *child,
+                               struct completion *vfork)
+{
+       int killed;
+
+       freezer_do_not_count();
+       killed = wait_for_completion_killable(vfork);
+       freezer_count();
+
+       if (killed) {
+               task_lock(child);
+               child->vfork_done = NULL;
+               task_unlock(child);
+       }
+
+       put_task_struct(child);
+       return killed;
+}
+
 /* Please note the differences between mmput and mm_release.
  * mmput is called whenever we stop holding onto a mm_struct,
  * error success whatever.
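
mm_access() centralizes the "take another task's mm, but only if we could
ptrace it" idiom under cred_guard_mutex, so callers such as
/proc/<pid>/mem and process_vm_readv() cannot race the permission check
against a concurrent exec. A sketch of the calling convention (the caller
is illustrative, not the exact fs/proc code):

	struct mm_struct *mm = mm_access(task, PTRACE_MODE_ATTACH);

	if (IS_ERR(mm))
		return PTR_ERR(mm);	/* -EACCES, or -EINTR if the
					 * killable lock was interrupted */
	if (!mm)
		return -ESRCH;		/* kernel thread, or task has exited */
	/* ... access the address space ... */
	mmput(mm);			/* drop the get_task_mm() reference */
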
@@ -659,8 +731,6 @@ EXPORT_SYMBOL_GPL(get_task_mm);
  */
 void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 {
-       struct completion *vfork_done = tsk->vfork_done;
-
        /* Get rid of any futexes when releasing the mm */
 #ifdef CONFIG_FUTEX
        if (unlikely(tsk->robust_list)) {
@@ -680,17 +750,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);
 
-       /* notify parent sleeping on vfork() */
-       if (vfork_done) {
-               tsk->vfork_done = NULL;
-               complete(vfork_done);
-       }
+       if (tsk->vfork_done)
+               complete_vfork_done(tsk);
 
        /*
         * If we're exiting normally, clear a user-space tid field if
         * requested.  We leave this alone when dying by signal, to leave
         * the value intact in a core dump, and to save the unnecessary
-        * trouble otherwise.  Userland only wants this done for a sys_exit.
+        * trouble, say, a killed vfork parent shouldn't touch this mm.
+        * Userland only wants this done for a sys_exit.
         */
        if (tsk->clear_child_tid) {
                if (!(tsk->flags & PF_SIGNALED) &&
@@ -870,6 +938,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 {
 #ifdef CONFIG_BLOCK
        struct io_context *ioc = current->io_context;
+       struct io_context *new_ioc;
 
        if (!ioc)
                return 0;
@@ -881,11 +950,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
                if (unlikely(!tsk->io_context))
                        return -ENOMEM;
        } else if (ioprio_valid(ioc->ioprio)) {
-               tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
-               if (unlikely(!tsk->io_context))
+               new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
+               if (unlikely(!new_ioc))
                        return -ENOMEM;
 
-               tsk->io_context->ioprio = ioc->ioprio;
+               new_ioc->ioprio = ioc->ioprio;
+               put_io_context(new_ioc);
        }
 #endif
        return 0;
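
The old code hand-rolled a detached io_context and assigned it to
tsk->io_context directly; the replacement goes through the block layer's
own lifetime API, which attaches the context to the task and returns it
with an extra reference that the caller must drop. The contract assumed
here, per this era's block/blk-ioc.c:

	struct io_context *get_task_io_context(struct task_struct *task,
					       gfp_t gfp_flags, int node);
	void put_io_context(struct io_context *ioc);

Copying ioc->ioprio while holding that temporary reference is what makes
the immediate put_io_context() safe.
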
@@ -910,8 +980,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 
 void __cleanup_sighand(struct sighand_struct *sighand)
 {
-       if (atomic_dec_and_test(&sighand->count))
+       if (atomic_dec_and_test(&sighand->count)) {
+               signalfd_cleanup(sighand);
                kmem_cache_free(sighand_cachep, sighand);
+       }
 }
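
Without the signalfd_cleanup() call, a thread blocked in signalfd read()
or poll() can be left sleeping on the waitqueue embedded in a
sighand_struct that is about to be freed. An abridged sketch of the
fs/signalfd.c counterpart introduced by the same fix:

	void signalfd_cleanup(struct sighand_struct *sighand)
	{
		wait_queue_head_t *wqh = &sighand->signalfd_wqh;

		if (likely(!waitqueue_active(wqh)))
			return;

		/* wait_queue_t->func(POLLFREE) should detach the waiters */
		wake_up_poll(wqh, POLLHUP | POLLFREE);
	}
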
 
 
@@ -972,13 +1044,16 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        sched_autogroup_fork(sig);
 
 #ifdef CONFIG_CGROUPS
-       init_rwsem(&sig->threadgroup_fork_lock);
+       init_rwsem(&sig->group_rwsem);
 #endif
 
        sig->oom_adj = current->signal->oom_adj;
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
+       sig->has_child_subreaper = current->signal->has_child_subreaper ||
+                                  current->signal->is_child_subreaper;
+
        mutex_init(&sig->cred_guard_mutex);
 
        return 0;
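
The has_child_subreaper bit propagated here lets the exit path find a
"child subreaper" ancestor cheaply instead of walking the whole parent
chain for every orphan. The flag it derives from is set by the new prctl;
from userspace (PR_SET_CHILD_SUBREAPER is prctl number 36, introduced
alongside this change):

	#include <sys/prctl.h>

	/* Orphaned descendants now reparent to this process instead of
	 * init, so a service manager can wait() on double-forked daemons. */
	prctl(PR_SET_CHILD_SUBREAPER, 1);
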
@@ -990,9 +1065,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 
        new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
        new_flags |= PF_FORKNOEXEC;
-       new_flags |= PF_STARTING;
        p->flags = new_flags;
-       clear_freeze_flag(p);
 }
 
 SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1023,8 +1096,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
  */
 static void posix_cpu_timers_init(struct task_struct *tsk)
 {
-       tsk->cputime_expires.prof_exp = cputime_zero;
-       tsk->cputime_expires.virt_exp = cputime_zero;
+       tsk->cputime_expires.prof_exp = 0;
+       tsk->cputime_expires.virt_exp = 0;
        tsk->cputime_expires.sched_exp = 0;
        INIT_LIST_HEAD(&tsk->cpu_timers[0]);
        INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1132,14 +1205,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
        init_sigpending(&p->pending);
 
-       p->utime = cputime_zero;
-       p->stime = cputime_zero;
-       p->gtime = cputime_zero;
-       p->utimescaled = cputime_zero;
-       p->stimescaled = cputime_zero;
+       p->utime = p->stime = p->gtime = 0;
+       p->utimescaled = p->stimescaled = 0;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-       p->prev_utime = cputime_zero;
-       p->prev_stime = cputime_zero;
+       p->prev_utime = p->prev_stime = 0;
 #endif
 #if defined(SPLIT_RSS_COUNTING)
        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1158,7 +1227,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->io_context = NULL;
        p->audit_context = NULL;
        if (clone_flags & CLONE_THREAD)
-               threadgroup_fork_read_lock(current);
+               threadgroup_change_begin(current);
        cgroup_fork(p);
 #ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
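
threadgroup_fork_read_lock() becomes threadgroup_change_begin() because
cgroup now wants to exclude any change to the thread group (fork, exec,
exit), not just fork, while migrating a whole group between cgroups. A
sketch of the renamed helpers as defined in this era's
include/linux/sched.h (under CONFIG_CGROUPS; without cgroups they
compile away):

	static inline void threadgroup_change_begin(struct task_struct *tsk)
	{
		down_read(&tsk->signal->group_rwsem);
	}

	static inline void threadgroup_change_end(struct task_struct *tsk)
	{
		up_read(&tsk->signal->group_rwsem);
	}
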
@@ -1172,6 +1241,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+       seqcount_init(&p->mems_allowed_seq);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
        p->irq_events = 0;
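
mems_allowed_seq is a seqcount that lets the page allocator read
current->mems_allowed without locking while a cpuset rebind rewrites it.
Read-side sketch of the pattern this enables (modeled on the
get_mems_allowed()/put_mems_allowed() pair from the matching cpuset
change):

	unsigned int seq;

	do {
		seq = read_seqcount_begin(&current->mems_allowed_seq);
		/* ... pick a node from current->mems_allowed ... */
	} while (read_seqcount_retry(&current->mems_allowed_seq, seq));
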
@@ -1290,12 +1360,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        clear_all_latency_tracing(p);
 
        /* ok, now we should be set up.. */
-       p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
+       if (clone_flags & CLONE_THREAD)
+               p->exit_signal = -1;
+       else if (clone_flags & CLONE_PARENT)
+               p->exit_signal = current->group_leader->exit_signal;
+       else
+               p->exit_signal = (clone_flags & CSIGNAL);
+
        p->pdeath_signal = 0;
        p->exit_state = 0;
 
        p->nr_dirtied = 0;
        p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+       p->dirty_paused_when = 0;
 
        /*
         * Ok, make it visible to the rest of the system.
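
The replaced one-liner let a CLONE_PARENT caller choose an arbitrary
exit_signal for its new sibling, which meant an unprivileged process
could have that sibling deliver any signal to the shared parent on exit.
Since the child reports to the caller's parent, it must inherit the
group leader's exit_signal instead. A hedged userspace illustration
(the helper names are invented for the example):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>

	static int child_fn(void *arg)
	{
		return 0;
	}

	/* stack_top: caller-allocated child stack (grows down on x86). */
	static int spawn_sibling(void *stack_top)
	{
		/* Before the fix, SIGUSR1 chose the signal the sibling sent
		 * to the shared parent on exit; now those bits are ignored
		 * and the group leader's exit_signal is used. */
		return clone(child_fn, stack_top, CLONE_PARENT | SIGUSR1, NULL);
	}
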
@@ -1373,8 +1450,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        proc_fork_connector(p);
        cgroup_post_fork(p);
        if (clone_flags & CLONE_THREAD)
-               threadgroup_fork_read_unlock(current);
+               threadgroup_change_end(current);
        perf_event_fork(p);
+
+       trace_task_newtask(p, clone_flags);
+
        return p;
 
 bad_fork_free_pid:
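
With the trace header instantiated at the top of the file, this one call
makes every successful fork visible to ftrace and perf; the event shows
up as events/task/task_newtask in tracefs and can be enabled with, e.g.,
echo 1 > /sys/kernel/debug/tracing/events/task/task_newtask/enable.
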
@@ -1408,7 +1488,7 @@ bad_fork_cleanup_policy:
 bad_fork_cleanup_cgroup:
 #endif
        if (clone_flags & CLONE_THREAD)
-               threadgroup_fork_read_unlock(current);
+               threadgroup_change_end(current);
        cgroup_exit(p, cgroup_callbacks_done);
        delayacct_tsk_free(p);
        module_put(task_thread_info(p)->exec_domain->module);
@@ -1521,18 +1601,9 @@ long do_fork(unsigned long clone_flags,
                if (clone_flags & CLONE_VFORK) {
                        p->vfork_done = &vfork;
                        init_completion(&vfork);
+                       get_task_struct(p);
                }
 
-               audit_finish_fork(p);
-
-               /*
-                * We set PF_STARTING at creation in case tracing wants to
-                * use this to distinguish a fully live task from one that
-                * hasn't finished SIGSTOP raising yet.  Now we clear it
-                * and set the child going.
-                */
-               p->flags &= ~PF_STARTING;
-
                wake_up_new_task(p);
 
                /* forking complete and child started to run, tell ptracer */
@@ -1540,10 +1611,8 @@ long do_fork(unsigned long clone_flags,
                        ptrace_event(trace, nr);
 
                if (clone_flags & CLONE_VFORK) {
-                       freezer_do_not_count();
-                       wait_for_completion(&vfork);
-                       freezer_count();
-                       ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
+                       if (!wait_for_vfork_done(p, &vfork))
+                               ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
                }
        } else {
                nr = PTR_ERR(p);
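
The restructured vfork path closes two holes. First, the parent now
sleeps in wait_for_completion_killable(), so a fatal signal can end the
wait instead of leaving an unkillable, unfreezable parent blocked until
the child execs. Second, the added get_task_struct() keeps the child's
task_struct alive so a killed parent can safely clear ->vfork_done,
which points into the parent's soon-to-be-gone stack frame. The
userspace semantics being preserved, as a minimal sketch:

	#include <unistd.h>
	#include <stdlib.h>

	static void spawn(void)
	{
		pid_t pid = vfork();	/* child borrows the parent's mm */

		if (pid == 0) {
			execl("/bin/true", "true", (char *)NULL);
			_exit(127);	/* either path releases the mm */
		}
		/* The parent resumes here once complete_vfork_done() fires;
		 * after this change it can also be killed while waiting. */
	}
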