Update to 3.4-final.

[linux-flexiantxendom0-3.2.10.git] / fs / exec.c
diff --git a/fs/exec.c b/fs/exec.c

index 5cb53f0..b1fd202 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -42,7 +42,6 @@
  #include <linux/pid_namespace.h>
  #include <linux/module.h>
  #include <linux/namei.h>
-#include <linux/proc_fs.h>
  #include <linux/mount.h>
  #include <linux/security.h>
  #include <linux/syscalls.h>
@@ -60,8 +59,13 @@
  #include <asm/uaccess.h>
  #include <asm/mmu_context.h>
  #include <asm/tlb.h>
+#include <asm/exec.h>
+
+#include <trace/events/task.h>
  #include "internal.h"
  
+#include <trace/events/sched.h>
+
  int core_uses_pid;
  char core_pattern[CORENAME_MAX_SIZE] = "core";
  unsigned int core_pipe_limit;
@@ -78,15 +82,13 @@ static atomic_t call_count = ATOMIC_INIT(1);
  static LIST_HEAD(formats);
  static DEFINE_RWLOCK(binfmt_lock);
  
-int __register_binfmt(struct linux_binfmt * fmt, int insert)
+void __register_binfmt(struct linux_binfmt * fmt, int insert)
  {
-       if (!fmt)
-               return -EINVAL;
+       BUG_ON(!fmt);
         write_lock(&binfmt_lock);
         insert ? list_add(&fmt->lh, &formats) :
                  list_add_tail(&fmt->lh, &formats);
         write_unlock(&binfmt_lock);
-       return 0;       
  }
  
  EXPORT_SYMBOL(__register_binfmt);
@@ -182,14 +184,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
                 return;
  
         bprm->vma_pages = pages;
-
-#ifdef SPLIT_RSS_COUNTING
         add_mm_counter(mm, MM_ANONPAGES, diff);
-#else
-       spin_lock(&mm->page_table_lock);
-       add_mm_counter(mm, MM_ANONPAGES, diff);
-       spin_unlock(&mm->page_table_lock);
-#endif
  }
  
  static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
@@ -200,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
  
  #ifdef CONFIG_STACK_GROWSUP
         if (write) {
-               ret = expand_stack_downwards(bprm->vma, pos);
+               ret = expand_downwards(bprm->vma, pos);
                 if (ret < 0)
                         return NULL;
         }
@@ -278,7 +273,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
          * use STACK_TOP because that can depend on attributes which aren't
          * configured yet.
          */
-       BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+       BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
         vma->vm_end = STACK_TOP_MAX;
         vma->vm_start = vma->vm_end - PAGE_SIZE;
         vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
@@ -600,7 +595,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
         unsigned long length = old_end - old_start;
         unsigned long new_start = old_start - shift;
         unsigned long new_end = old_end - shift;
-       struct mmu_gather *tlb;
+       struct mmu_gather tlb;
  
         BUG_ON(new_start > new_end);
  
@@ -626,12 +621,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                 return -ENOMEM;
  
         lru_add_drain();
-       tlb = tlb_gather_mmu(mm, 0);
+       tlb_gather_mmu(&tlb, mm, 0);
         if (new_end > old_start) {
                 /*
                  * when the old and new regions overlap clear from new_end.
                  */
-               free_pgd_range(tlb, new_end, old_end, new_end,
+               free_pgd_range(&tlb, new_end, old_end, new_end,
                         vma->vm_next ? vma->vm_next->vm_start : 0);
         } else {
                 /*
@@ -640,10 +635,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                  * have constraints on va-space that make this illegal (IA64) -
                  * for the others its just a little faster.
                  */
-               free_pgd_range(tlb, old_start, old_end, new_end,
+               free_pgd_range(&tlb, old_start, old_end, new_end,
                         vma->vm_next ? vma->vm_next->vm_start : 0);
         }
-       tlb_finish_mmu(tlb, new_end, old_end);
+       tlb_finish_mmu(&tlb, new_end, old_end);
  
         /*
          * Shrink the vma to just the new range.  Always succeeds.
@@ -828,7 +823,7 @@ static int exec_mmap(struct mm_struct *mm)
         /* Notify parent that we're no longer interested in the old VM */
         tsk = current;
         old_mm = current->mm;
-       sync_mm_rss(tsk, old_mm);
+       sync_mm_rss(old_mm);
         mm_release(tsk, old_mm);
  
         if (old_mm) {
@@ -849,15 +844,12 @@ static int exec_mmap(struct mm_struct *mm)
         tsk->mm = mm;
         tsk->active_mm = mm;
         activate_mm(active_mm, mm);
-       if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-               atomic_dec(&old_mm->oom_disable_count);
-               atomic_inc(&tsk->mm->oom_disable_count);
-       }
         task_unlock(tsk);
         arch_pick_mmap_layout(mm);
         if (old_mm) {
                 up_read(&old_mm->mmap_sem);
                 BUG_ON(active_mm != old_mm);
+               setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
                 mm_update_next_owner(old_mm);
                 mmput(old_mm);
                 return 0;
@@ -964,9 +956,18 @@ static int de_thread(struct task_struct *tsk)
                 leader->group_leader = tsk;
  
                 tsk->exit_signal = SIGCHLD;
+               leader->exit_signal = -1;
  
                 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                 leader->exit_state = EXIT_DEAD;
+
+               /*
+                * We are going to release_task()->ptrace_unlink() silently,
+                * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
+                * the tracer won't block again waiting for this thread.
+                */
+               if (unlikely(leader->ptrace))
+                       __wake_up_parent(leader, leader->parent);
                 write_unlock_irq(&tasklist_lock);
  
                 release_task(leader);
@@ -976,8 +977,8 @@ static int de_thread(struct task_struct *tsk)
         sig->notify_count = 0;
  
  no_thread_group:
-       if (current->mm)
-               setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
+       /* we have changed execution domain */
+       tsk->exit_signal = SIGCHLD;
  
         exit_itimers(sig);
         flush_itimer_signals();
@@ -1027,10 +1028,10 @@ static void flush_old_files(struct files_struct * files)
                 fdt = files_fdtable(files);
                 if (i >= fdt->max_fds)
                         break;
-               set = fdt->close_on_exec->fds_bits[j];
+               set = fdt->close_on_exec[j];
                 if (!set)
                         continue;
-               fdt->close_on_exec->fds_bits[j] = 0;
+               fdt->close_on_exec[j] = 0;
                 spin_unlock(&files->file_lock);
                 for ( ; set ; i++,set >>= 1) {
                         if (set & 1) {
@@ -1051,11 +1052,14 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
         task_unlock(tsk);
         return buf;
  }
+EXPORT_SYMBOL_GPL(get_task_comm);
  
  void set_task_comm(struct task_struct *tsk, char *buf)
  {
         task_lock(tsk);
  
+       trace_task_rename(tsk, buf);
+
         /*
          * Threads may access current->comm without holding
          * the task lock, so write the string carefully.
@@ -1069,6 +1073,21 @@ void set_task_comm(struct task_struct *tsk, char *buf)
         perf_event_comm(tsk);
  }
  
+/*
+ * Copy the last path component of @fn (the text following the final
+ * '/') into @tcomm, truncated so that it fits in @len bytes including
+ * the terminating NUL.  A name ending in '/' yields an empty string.
+ * Nothing is written when @len is 0.
+ */
+static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
+{
+       unsigned int i = 0;
+       char ch;
+
+       /* No room even for the terminator; "len - 1" below would wrap. */
+       if (!len)
+               return;
+
+       while ((ch = *fn++) != '\0') {
+               if (ch == '/')
+                       i = 0; /* new component: overwrite what we wrote */
+               else if (i < len - 1)
+                       tcomm[i++] = ch;
+       }
+       tcomm[i] = '\0';
+}
+
  int flush_old_exec(struct linux_binprm * bprm)
  {
         int retval;
@@ -1083,6 +1102,7 @@ int flush_old_exec(struct linux_binprm * bprm)
  
         set_mm_exe_file(bprm->mm, bprm->file);
  
+       filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
         /*
          * Release all of the old mmap stuff
          */
@@ -1093,7 +1113,8 @@ int flush_old_exec(struct linux_binprm * bprm)
  
         bprm->mm = NULL;                /* We're using it now */
  
-       current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
+       set_fs(USER_DS);
+       current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD);
         flush_thread();
         current->personality &= ~bprm->per_clear;
  
@@ -1104,12 +1125,15 @@ out:
  }
  EXPORT_SYMBOL(flush_old_exec);
  
-void setup_new_exec(struct linux_binprm * bprm)
+void would_dump(struct linux_binprm *bprm, struct file *file)
  {
-       int i, ch;
-       const char *name;
-       char tcomm[sizeof(current->comm)];
+       if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
+               bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+}
+EXPORT_SYMBOL(would_dump);
  
+void setup_new_exec(struct linux_binprm * bprm)
+{
         arch_pick_mmap_layout(current->mm);
  
         /* This is the point of no return */
@@ -1120,18 +1144,7 @@ void setup_new_exec(struct linux_binprm * bprm)
         else
                 set_dumpable(current->mm, suid_dumpable);
  
-       name = bprm->filename;
-
-       /* Copies the binary name from after last slash */
-       for (i=0; (ch = *(name++)) != '\0';) {
-               if (ch == '/')
-                       i = 0; /* overwrite what we wrote */
-               else
-                       if (i < (sizeof(tcomm) - 1))
-                               tcomm[i++] = ch;
-       }
-       tcomm[i] = '\0';
-       set_task_comm(current, tcomm);
+       set_task_comm(current, bprm->tcomm);
  
         /* Set the new mm task size. We have to do that late because it may
          * depend on TIF_32BIT which is only updated in flush_thread() on
@@ -1143,9 +1156,10 @@ void setup_new_exec(struct linux_binprm * bprm)
         if (bprm->cred->uid != current_euid() ||
             bprm->cred->gid != current_egid()) {
                 current->pdeath_signal = 0;
-       } else if (file_permission(bprm->file, MAY_READ) ||
-                  bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
-               set_dumpable(current->mm, suid_dumpable);
+       } else {
+               would_dump(bprm, bprm->file);
+               if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
+                       set_dumpable(current->mm, suid_dumpable);
         }
  
         /*
@@ -1218,13 +1232,18 @@ EXPORT_SYMBOL(install_exec_creds);
   * - the caller must hold ->cred_guard_mutex to protect against
   *   PTRACE_ATTACH
   */
-int check_unsafe_exec(struct linux_binprm *bprm)
+static int check_unsafe_exec(struct linux_binprm *bprm)
  {
         struct task_struct *p = current, *t;
         unsigned n_fs;
         int res = 0;
  
-       bprm->unsafe = tracehook_unsafe_exec(p);
+       if (p->ptrace) {
+               if (p->ptrace & PT_PTRACE_CAP)
+                       bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
+               else
+                       bprm->unsafe |= LSM_UNSAFE_PTRACE;
+       }
  
         n_fs = 1;
         spin_lock(&p->fs->lock);
@@ -1322,13 +1341,13 @@ int remove_arg_zero(struct linux_binprm *bprm)
                         ret = -EFAULT;
                         goto out;
                 }
-               kaddr = kmap_atomic(page, KM_USER0);
+               kaddr = kmap_atomic(page);
  
                 for (; offset < PAGE_SIZE && kaddr[offset];
                                 offset++, bprm->p++)
                         ;
  
-               kunmap_atomic(kaddr, KM_USER0);
+               kunmap_atomic(kaddr);
                 put_arg_page(page);
  
                 if (offset == PAGE_SIZE)
@@ -1352,19 +1371,22 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
         unsigned int depth = bprm->recursion_depth;
         int try,retval;
         struct linux_binfmt *fmt;
+       pid_t old_pid, old_vpid;
  
         retval = security_bprm_check(bprm);
         if (retval)
                 return retval;
  
-       /* kernel module loader fixup */
-       /* so we don't try to load run modprobe in kernel space. */
-       set_fs(USER_DS);
-
         retval = audit_bprm(bprm);
         if (retval)
                 return retval;
  
+       /* Need to fetch pid before load_binary changes it */
+       old_pid = current->pid;
+       rcu_read_lock();
+       old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
+       rcu_read_unlock();
+
         retval = -ENOENT;
         for (try=0; try<2; try++) {
                 read_lock(&binfmt_lock);
@@ -1383,8 +1405,10 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                          */
                         bprm->recursion_depth = depth;
                         if (retval >= 0) {
-                               if (depth == 0)
-                                       tracehook_report_exec(fmt, bprm, regs);
+                               if (depth == 0) {
+                                       trace_sched_process_exec(current, old_pid, bprm);
+                                       ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
+                               }
                                 put_binfmt(fmt);
                                 allow_write_access(bprm->file);
                                 if (bprm->file)
@@ -1404,9 +1428,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                         }
                 }
                 read_unlock(&binfmt_lock);
+#ifdef CONFIG_MODULES
                 if (retval != -ENOEXEC || bprm->mm == NULL) {
                         break;
-#ifdef CONFIG_MODULES
                 } else {
  #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
                         if (printable(bprm->buf[0]) &&
@@ -1414,9 +1438,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                             printable(bprm->buf[2]) &&
                             printable(bprm->buf[3]))
                                 break; /* -ENOEXEC */
+                       if (try)
+                               break; /* -ENOEXEC */
                         request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
-#endif
                 }
+#else
+               break;
+#endif
         }
         return retval;
  }
@@ -1436,6 +1464,23 @@ static int do_execve_common(const char *filename,
         struct files_struct *displaced;
         bool clear_in_exec;
         int retval;
+       const struct cred *cred = current_cred();
+
+       /*
+        * We move the actual failure in case of RLIMIT_NPROC excess from
+        * set*uid() to execve() because too many poorly written programs
+        * don't check setuid() return code.  Here we additionally recheck
+        * whether NPROC limit is still exceeded.
+        */
+       if ((current->flags & PF_NPROC_EXCEEDED) &&
+           atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+               retval = -EAGAIN;
+               goto out_ret;
+       }
+
+       /* We're below the limit (still or again), so we don't want to make
+        * further execve() calls fail. */
+       current->flags &= ~PF_NPROC_EXCEEDED;
  
         retval = unshare_files(&displaced);
         if (retval)
@@ -1623,6 +1668,50 @@ expand_fail:
         return ret;
  }
  
+/*
+ * Rewrite the NUL-terminated string @str in place, turning every '/'
+ * into '!'.
+ */
+static void cn_escape(char *str)
+{
+       char *p = str;
+
+       while (*p != '\0') {
+               if (*p == '/')
+                       *p = '!';
+               p++;
+       }
+}
+
+/*
+ * Append the path of the current task's executable to the core name,
+ * with every '/' replaced by '!' (see cn_escape()).  When the mm has
+ * no exe file, "<comm> (path unknown)" is appended instead, escaped
+ * the same way.
+ *
+ * Returns the result of cn_printf(), -ENOMEM if the temporary path
+ * buffer cannot be allocated, or the error reported by d_path().
+ */
+static int cn_print_exe_file(struct core_name *cn)
+{
+       struct file *exe_file;
+       char *pathbuf, *path;
+       int ret;
+
+       exe_file = get_mm_exe_file(current->mm);
+       if (!exe_file) {
+               /*
+                * Remember an offset rather than a pointer: cn_printf()
+                * is defined outside this hunk and presumably may
+                * reallocate cn->corename while growing it, which would
+                * leave a pointer taken beforehand dangling.
+                */
+               size_t commstart = cn->used;
+
+               ret = cn_printf(cn, "%s (path unknown)", current->comm);
+               /* On failure the buffer contents are not trustworthy. */
+               if (ret >= 0)
+                       cn_escape(cn->corename + commstart);
+               return ret;
+       }
+
+       pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+       if (!pathbuf) {
+               ret = -ENOMEM;
+               goto put_exe_file;
+       }
+
+       path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+       if (IS_ERR(path)) {
+               ret = PTR_ERR(path);
+               goto free_buf;
+       }
+
+       /* Escape in the private buffer before it is copied into cn. */
+       cn_escape(path);
+
+       ret = cn_printf(cn, "%s", path);
+
+free_buf:
+       kfree(pathbuf);
+put_exe_file:
+       fput(exe_file);
+       return ret;
+}
+
  /* format_corename will inspect the pattern parameter, and output a
   * name into corename, which must have space for at least
   * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
@@ -1684,15 +1773,24 @@ static int format_corename(struct core_name *cn, long signr)
                                 break;
                         }
                         /* hostname */
-                       case 'h':
+                       case 'h': {
+                               char *namestart = cn->corename + cn->used;
                                 down_read(&uts_sem);
                                 err = cn_printf(cn, "%s",
                                               utsname()->nodename);
                                 up_read(&uts_sem);
+                               cn_escape(namestart);
                                 break;
+                       }
                         /* executable */
-                       case 'e':
+                       case 'e': {
+                               char *commstart = cn->corename + cn->used;
                                 err = cn_printf(cn, "%s", current->comm);
+                               cn_escape(commstart);
+                               break;
+                       }
+                       case 'E':
+                               err = cn_print_exe_file(cn);
                                 break;
                         /* core limit size */
                         case 'c':
@@ -1734,6 +1832,7 @@ static int zap_process(struct task_struct *start, int exit_code)
  
         t = start;
         do {
+               task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                 if (t != current && t->mm) {
                         sigaddset(&t->pending.signal, SIGKILL);
                         signal_wake_up(t, 1);
@@ -1820,7 +1919,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
  {
         struct task_struct *tsk = current;
         struct mm_struct *mm = tsk->mm;
-       struct completion *vfork_done;
         int core_waiters = -EBUSY;
  
         init_completion(&core_state->startup);
@@ -1832,22 +1930,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
                 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
         up_write(&mm->mmap_sem);
  
-       if (unlikely(core_waiters < 0))
-               goto fail;
-
-       /*
-        * Make sure nobody is waiting for us to release the VM,
-        * otherwise we can deadlock when we wait on each other
-        */
-       vfork_done = tsk->vfork_done;
-       if (vfork_done) {
-               tsk->vfork_done = NULL;
-               complete(vfork_done);
-       }
-
-       if (core_waiters)
+       if (core_waiters > 0)
                 wait_for_completion(&core_state->startup);
-fail:
+
         return core_waiters;
  }
  
@@ -1960,7 +2045,7 @@ static void wait_for_dump_helpers(struct file *file)
   * is a special value that we use to trap recursive
   * core dumps
   */
-static int umh_pipe_setup(struct subprocess_info *info)
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
  {
         struct file *rp, *wp;
         struct fdtable *fdt;
@@ -1983,8 +2068,8 @@ static int umh_pipe_setup(struct subprocess_info *info)
         fd_install(0, rp);
         spin_lock(&cf->file_lock);
         fdt = files_fdtable(cf);
-       FD_SET(0, fdt->open_fds);
-       FD_CLR(0, fdt->close_on_exec);
+       __set_open_fd(0, fdt);
+       __clear_close_on_exec(0, fdt);
         spin_unlock(&cf->file_lock);
  
         /* and disallow core files too */
@@ -2053,16 +2138,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
  
         ispipe = format_corename(&cn, signr);
  
-       if (ispipe == -ENOMEM) {
-               printk(KERN_WARNING "format_corename failed\n");
-               printk(KERN_WARNING "Aborting core\n");
-               goto fail_corename;
-       }
-
         if (ispipe) {
                 int dump_count;
                 char **helper_argv;
  
+               if (ispipe < 0) {
+                       printk(KERN_WARNING "format_corename failed\n");
+                       printk(KERN_WARNING "Aborting core\n");
+                       goto fail_corename;
+               }
+
                 if (cprm.limit == 1) {
                         /*
                          * Normally core limits are irrelevant to pipes, since