Update to 3.4-final.
[linux-flexiantxendom0-3.2.10.git] / fs / exec.c
index 5cb53f0..b1fd202 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -42,7 +42,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/module.h>
 #include <linux/namei.h>
-#include <linux/proc_fs.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
+#include <asm/exec.h>
+
+#include <trace/events/task.h>
 #include "internal.h"
 
+#include <trace/events/sched.h>
+
 int core_uses_pid;
 char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
@@ -78,15 +82,13 @@ static atomic_t call_count = ATOMIC_INIT(1);
 static LIST_HEAD(formats);
 static DEFINE_RWLOCK(binfmt_lock);
 
-int __register_binfmt(struct linux_binfmt * fmt, int insert)
+void __register_binfmt(struct linux_binfmt * fmt, int insert)
 {
-       if (!fmt)
-               return -EINVAL;
+       BUG_ON(!fmt);
        write_lock(&binfmt_lock);
        insert ? list_add(&fmt->lh, &formats) :
                 list_add_tail(&fmt->lh, &formats);
        write_unlock(&binfmt_lock);
-       return 0;       
 }
 
 EXPORT_SYMBOL(__register_binfmt);
@@ -182,14 +184,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
                return;
 
        bprm->vma_pages = pages;
-
-#ifdef SPLIT_RSS_COUNTING
        add_mm_counter(mm, MM_ANONPAGES, diff);
-#else
-       spin_lock(&mm->page_table_lock);
-       add_mm_counter(mm, MM_ANONPAGES, diff);
-       spin_unlock(&mm->page_table_lock);
-#endif
 }
 
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
@@ -200,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 
 #ifdef CONFIG_STACK_GROWSUP
        if (write) {
-               ret = expand_stack_downwards(bprm->vma, pos);
+               ret = expand_downwards(bprm->vma, pos);
                if (ret < 0)
                        return NULL;
        }
@@ -278,7 +273,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
         * use STACK_TOP because that can depend on attributes which aren't
         * configured yet.
         */
-       BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+       BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
@@ -600,7 +595,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
-       struct mmu_gather *tlb;
+       struct mmu_gather tlb;
 
        BUG_ON(new_start > new_end);
 
@@ -626,12 +621,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                return -ENOMEM;
 
        lru_add_drain();
-       tlb = tlb_gather_mmu(mm, 0);
+       tlb_gather_mmu(&tlb, mm, 0);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
-               free_pgd_range(tlb, new_end, old_end, new_end,
+               free_pgd_range(&tlb, new_end, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        } else {
                /*
@@ -640,10 +635,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others its just a little faster.
                 */
-               free_pgd_range(tlb, old_start, old_end, new_end,
+               free_pgd_range(&tlb, old_start, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        }
-       tlb_finish_mmu(tlb, new_end, old_end);
+       tlb_finish_mmu(&tlb, new_end, old_end);
 
        /*
         * Shrink the vma to just the new range.  Always succeeds.
@@ -828,7 +823,7 @@ static int exec_mmap(struct mm_struct *mm)
        /* Notify parent that we're no longer interested in the old VM */
        tsk = current;
        old_mm = current->mm;
-       sync_mm_rss(tsk, old_mm);
+       sync_mm_rss(old_mm);
        mm_release(tsk, old_mm);
 
        if (old_mm) {
@@ -849,15 +844,12 @@ static int exec_mmap(struct mm_struct *mm)
        tsk->mm = mm;
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
-       if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-               atomic_dec(&old_mm->oom_disable_count);
-               atomic_inc(&tsk->mm->oom_disable_count);
-       }
        task_unlock(tsk);
        arch_pick_mmap_layout(mm);
        if (old_mm) {
                up_read(&old_mm->mmap_sem);
                BUG_ON(active_mm != old_mm);
+               setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
                mm_update_next_owner(old_mm);
                mmput(old_mm);
                return 0;
@@ -964,9 +956,18 @@ static int de_thread(struct task_struct *tsk)
                leader->group_leader = tsk;
 
                tsk->exit_signal = SIGCHLD;
+               leader->exit_signal = -1;
 
                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                leader->exit_state = EXIT_DEAD;
+
+               /*
+                * We are going to release_task()->ptrace_unlink() silently,
+                * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
+                * the tracer wont't block again waiting for this thread.
+                */
+               if (unlikely(leader->ptrace))
+                       __wake_up_parent(leader, leader->parent);
                write_unlock_irq(&tasklist_lock);
 
                release_task(leader);
@@ -976,8 +977,8 @@ static int de_thread(struct task_struct *tsk)
        sig->notify_count = 0;
 
 no_thread_group:
-       if (current->mm)
-               setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
+       /* we have changed execution domain */
+       tsk->exit_signal = SIGCHLD;
 
        exit_itimers(sig);
        flush_itimer_signals();
@@ -1027,10 +1028,10 @@ static void flush_old_files(struct files_struct * files)
                fdt = files_fdtable(files);
                if (i >= fdt->max_fds)
                        break;
-               set = fdt->close_on_exec->fds_bits[j];
+               set = fdt->close_on_exec[j];
                if (!set)
                        continue;
-               fdt->close_on_exec->fds_bits[j] = 0;
+               fdt->close_on_exec[j] = 0;
                spin_unlock(&files->file_lock);
                for ( ; set ; i++,set >>= 1) {
                        if (set & 1) {
@@ -1051,11 +1052,14 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
        task_unlock(tsk);
        return buf;
 }
+EXPORT_SYMBOL_GPL(get_task_comm);
 
 void set_task_comm(struct task_struct *tsk, char *buf)
 {
        task_lock(tsk);
 
+       trace_task_rename(tsk, buf);
+
        /*
         * Threads may access current->comm without holding
         * the task lock, so write the string carefully.
@@ -1069,6 +1073,21 @@ void set_task_comm(struct task_struct *tsk, char *buf)
        perf_event_comm(tsk);
 }
 
+static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
+{
+       int i, ch;
+
+       /* Copies the binary name from after last slash */
+       for (i = 0; (ch = *(fn++)) != '\0';) {
+               if (ch == '/')
+                       i = 0; /* overwrite what we wrote */
+               else
+                       if (i < len - 1)
+                               tcomm[i++] = ch;
+       }
+       tcomm[i] = '\0';
+}
+
 int flush_old_exec(struct linux_binprm * bprm)
 {
        int retval;
@@ -1083,6 +1102,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 
        set_mm_exe_file(bprm->mm, bprm->file);
 
+       filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
        /*
         * Release all of the old mmap stuff
         */
@@ -1093,7 +1113,8 @@ int flush_old_exec(struct linux_binprm * bprm)
 
        bprm->mm = NULL;                /* We're using it now */
 
-       current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
+       set_fs(USER_DS);
+       current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD);
        flush_thread();
        current->personality &= ~bprm->per_clear;
 
@@ -1104,12 +1125,15 @@ out:
 }
 EXPORT_SYMBOL(flush_old_exec);
 
-void setup_new_exec(struct linux_binprm * bprm)
+void would_dump(struct linux_binprm *bprm, struct file *file)
 {
-       int i, ch;
-       const char *name;
-       char tcomm[sizeof(current->comm)];
+       if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
+               bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+}
+EXPORT_SYMBOL(would_dump);
 
+void setup_new_exec(struct linux_binprm * bprm)
+{
        arch_pick_mmap_layout(current->mm);
 
        /* This is the point of no return */
@@ -1120,18 +1144,7 @@ void setup_new_exec(struct linux_binprm * bprm)
        else
                set_dumpable(current->mm, suid_dumpable);
 
-       name = bprm->filename;
-
-       /* Copies the binary name from after last slash */
-       for (i=0; (ch = *(name++)) != '\0';) {
-               if (ch == '/')
-                       i = 0; /* overwrite what we wrote */
-               else
-                       if (i < (sizeof(tcomm) - 1))
-                               tcomm[i++] = ch;
-       }
-       tcomm[i] = '\0';
-       set_task_comm(current, tcomm);
+       set_task_comm(current, bprm->tcomm);
 
        /* Set the new mm task size. We have to do that late because it may
         * depend on TIF_32BIT which is only updated in flush_thread() on
@@ -1143,9 +1156,10 @@ void setup_new_exec(struct linux_binprm * bprm)
        if (bprm->cred->uid != current_euid() ||
            bprm->cred->gid != current_egid()) {
                current->pdeath_signal = 0;
-       } else if (file_permission(bprm->file, MAY_READ) ||
-                  bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
-               set_dumpable(current->mm, suid_dumpable);
+       } else {
+               would_dump(bprm, bprm->file);
+               if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
+                       set_dumpable(current->mm, suid_dumpable);
        }
 
        /*
@@ -1218,13 +1232,18 @@ EXPORT_SYMBOL(install_exec_creds);
  * - the caller must hold ->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
-int check_unsafe_exec(struct linux_binprm *bprm)
+static int check_unsafe_exec(struct linux_binprm *bprm)
 {
        struct task_struct *p = current, *t;
        unsigned n_fs;
        int res = 0;
 
-       bprm->unsafe = tracehook_unsafe_exec(p);
+       if (p->ptrace) {
+               if (p->ptrace & PT_PTRACE_CAP)
+                       bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
+               else
+                       bprm->unsafe |= LSM_UNSAFE_PTRACE;
+       }
 
        n_fs = 1;
        spin_lock(&p->fs->lock);
@@ -1322,13 +1341,13 @@ int remove_arg_zero(struct linux_binprm *bprm)
                        ret = -EFAULT;
                        goto out;
                }
-               kaddr = kmap_atomic(page, KM_USER0);
+               kaddr = kmap_atomic(page);
 
                for (; offset < PAGE_SIZE && kaddr[offset];
                                offset++, bprm->p++)
                        ;
 
-               kunmap_atomic(kaddr, KM_USER0);
+               kunmap_atomic(kaddr);
                put_arg_page(page);
 
                if (offset == PAGE_SIZE)
@@ -1352,19 +1371,22 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
        unsigned int depth = bprm->recursion_depth;
        int try,retval;
        struct linux_binfmt *fmt;
+       pid_t old_pid, old_vpid;
 
        retval = security_bprm_check(bprm);
        if (retval)
                return retval;
 
-       /* kernel module loader fixup */
-       /* so we don't try to load run modprobe in kernel space. */
-       set_fs(USER_DS);
-
        retval = audit_bprm(bprm);
        if (retval)
                return retval;
 
+       /* Need to fetch pid before load_binary changes it */
+       old_pid = current->pid;
+       rcu_read_lock();
+       old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
+       rcu_read_unlock();
+
        retval = -ENOENT;
        for (try=0; try<2; try++) {
                read_lock(&binfmt_lock);
@@ -1383,8 +1405,10 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                         */
                        bprm->recursion_depth = depth;
                        if (retval >= 0) {
-                               if (depth == 0)
-                                       tracehook_report_exec(fmt, bprm, regs);
+                               if (depth == 0) {
+                                       trace_sched_process_exec(current, old_pid, bprm);
+                                       ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
+                               }
                                put_binfmt(fmt);
                                allow_write_access(bprm->file);
                                if (bprm->file)
@@ -1404,9 +1428,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                        }
                }
                read_unlock(&binfmt_lock);
+#ifdef CONFIG_MODULES
                if (retval != -ENOEXEC || bprm->mm == NULL) {
                        break;
-#ifdef CONFIG_MODULES
                } else {
 #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
                        if (printable(bprm->buf[0]) &&
@@ -1414,9 +1438,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                            printable(bprm->buf[2]) &&
                            printable(bprm->buf[3]))
                                break; /* -ENOEXEC */
+                       if (try)
+                               break; /* -ENOEXEC */
                        request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
-#endif
                }
+#else
+               break;
+#endif
        }
        return retval;
 }
@@ -1436,6 +1464,23 @@ static int do_execve_common(const char *filename,
        struct files_struct *displaced;
        bool clear_in_exec;
        int retval;
+       const struct cred *cred = current_cred();
+
+       /*
+        * We move the actual failure in case of RLIMIT_NPROC excess from
+        * set*uid() to execve() because too many poorly written programs
+        * don't check setuid() return code.  Here we additionally recheck
+        * whether NPROC limit is still exceeded.
+        */
+       if ((current->flags & PF_NPROC_EXCEEDED) &&
+           atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+               retval = -EAGAIN;
+               goto out_ret;
+       }
+
+       /* We're below the limit (still or again), so we don't want to make
+        * further execve() calls fail. */
+       current->flags &= ~PF_NPROC_EXCEEDED;
 
        retval = unshare_files(&displaced);
        if (retval)
@@ -1623,6 +1668,50 @@ expand_fail:
        return ret;
 }
 
+static void cn_escape(char *str)
+{
+       for (; *str; str++)
+               if (*str == '/')
+                       *str = '!';
+}
+
+static int cn_print_exe_file(struct core_name *cn)
+{
+       struct file *exe_file;
+       char *pathbuf, *path;
+       int ret;
+
+       exe_file = get_mm_exe_file(current->mm);
+       if (!exe_file) {
+               char *commstart = cn->corename + cn->used;
+               ret = cn_printf(cn, "%s (path unknown)", current->comm);
+               cn_escape(commstart);
+               return ret;
+       }
+
+       pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+       if (!pathbuf) {
+               ret = -ENOMEM;
+               goto put_exe_file;
+       }
+
+       path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+       if (IS_ERR(path)) {
+               ret = PTR_ERR(path);
+               goto free_buf;
+       }
+
+       cn_escape(path);
+
+       ret = cn_printf(cn, "%s", path);
+
+free_buf:
+       kfree(pathbuf);
+put_exe_file:
+       fput(exe_file);
+       return ret;
+}
+
 /* format_corename will inspect the pattern parameter, and output a
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
@@ -1684,15 +1773,24 @@ static int format_corename(struct core_name *cn, long signr)
                                break;
                        }
                        /* hostname */
-                       case 'h':
+                       case 'h': {
+                               char *namestart = cn->corename + cn->used;
                                down_read(&uts_sem);
                                err = cn_printf(cn, "%s",
                                              utsname()->nodename);
                                up_read(&uts_sem);
+                               cn_escape(namestart);
                                break;
+                       }
                        /* executable */
-                       case 'e':
+                       case 'e': {
+                               char *commstart = cn->corename + cn->used;
                                err = cn_printf(cn, "%s", current->comm);
+                               cn_escape(commstart);
+                               break;
+                       }
+                       case 'E':
+                               err = cn_print_exe_file(cn);
                                break;
                        /* core limit size */
                        case 'c':
@@ -1734,6 +1832,7 @@ static int zap_process(struct task_struct *start, int exit_code)
 
        t = start;
        do {
+               task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                if (t != current && t->mm) {
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
@@ -1820,7 +1919,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
 {
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;
-       struct completion *vfork_done;
        int core_waiters = -EBUSY;
 
        init_completion(&core_state->startup);
@@ -1832,22 +1930,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
                core_waiters = zap_threads(tsk, mm, core_state, exit_code);
        up_write(&mm->mmap_sem);
 
-       if (unlikely(core_waiters < 0))
-               goto fail;
-
-       /*
-        * Make sure nobody is waiting for us to release the VM,
-        * otherwise we can deadlock when we wait on each other
-        */
-       vfork_done = tsk->vfork_done;
-       if (vfork_done) {
-               tsk->vfork_done = NULL;
-               complete(vfork_done);
-       }
-
-       if (core_waiters)
+       if (core_waiters > 0)
                wait_for_completion(&core_state->startup);
-fail:
+
        return core_waiters;
 }
 
@@ -1960,7 +2045,7 @@ static void wait_for_dump_helpers(struct file *file)
  * is a special value that we use to trap recursive
  * core dumps
  */
-static int umh_pipe_setup(struct subprocess_info *info)
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 {
        struct file *rp, *wp;
        struct fdtable *fdt;
@@ -1983,8 +2068,8 @@ static int umh_pipe_setup(struct subprocess_info *info)
        fd_install(0, rp);
        spin_lock(&cf->file_lock);
        fdt = files_fdtable(cf);
-       FD_SET(0, fdt->open_fds);
-       FD_CLR(0, fdt->close_on_exec);
+       __set_open_fd(0, fdt);
+       __clear_close_on_exec(0, fdt);
        spin_unlock(&cf->file_lock);
 
        /* and disallow core files too */
@@ -2053,16 +2138,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 
        ispipe = format_corename(&cn, signr);
 
-       if (ispipe == -ENOMEM) {
-               printk(KERN_WARNING "format_corename failed\n");
-               printk(KERN_WARNING "Aborting core\n");
-               goto fail_corename;
-       }
-
        if (ispipe) {
                int dump_count;
                char **helper_argv;
 
+               if (ispipe < 0) {
+                       printk(KERN_WARNING "format_corename failed\n");
+                       printk(KERN_WARNING "Aborting core\n");
+                       goto fail_corename;
+               }
+
                if (cprm.limit == 1) {
                        /*
                         * Normally core limits are irrelevant to pipes, since