#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
-#include <linux/smp_lock.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
-#include <linux/proc_fs.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
+#include <linux/compat.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
+#include <asm/exec.h>
+
+#include <trace/events/task.h>
#include "internal.h"
+#include <trace/events/sched.h>
+
int core_uses_pid;
char core_pattern[CORENAME_MAX_SIZE] = "core";
unsigned int core_pipe_limit;
int suid_dumpable = 0;
+struct core_name {
+ char *corename;
+ int used, size;
+};
+static atomic_t call_count = ATOMIC_INIT(1);
+
/* The maximal length of core_pattern is also specified in sysctl.c */
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);
-int __register_binfmt(struct linux_binfmt * fmt, int insert)
+void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
- if (!fmt)
- return -EINVAL;
+ BUG_ON(!fmt);
write_lock(&binfmt_lock);
insert ? list_add(&fmt->lh, &formats) :
list_add_tail(&fmt->lh, &formats);
write_unlock(&binfmt_lock);
- return 0;
}
EXPORT_SYMBOL(__register_binfmt);
struct file *file;
char *tmp = getname(library);
int error = PTR_ERR(tmp);
+ static const struct open_flags uselib_flags = {
+ .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+ .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+ .intent = LOOKUP_OPEN
+ };
if (IS_ERR(tmp))
goto out;
- file = do_filp_open(AT_FDCWD, tmp,
- O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
- MAY_READ | MAY_EXEC | MAY_OPEN);
+ file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
putname(tmp);
error = PTR_ERR(file);
if (IS_ERR(file))
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
goto exit;
- fsnotify_open(file->f_path.dentry);
+ fsnotify_open(file);
error = -ENOEXEC;
if(file->f_op) {
}
#ifdef CONFIG_MMU
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, account these pages in current->mm temporary
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+ struct mm_struct *mm = current->mm;
+ long diff = (long)(pages - bprm->vma_pages);
+
+ if (!mm || !diff)
+ return;
+
+ bprm->vma_pages = pages;
+ add_mm_counter(mm, MM_ANONPAGES, diff);
+}
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
int write)
#ifdef CONFIG_STACK_GROWSUP
if (write) {
- ret = expand_stack_downwards(bprm->vma, pos);
+ ret = expand_downwards(bprm->vma, pos);
if (ret < 0)
return NULL;
}
unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
struct rlimit *rlim;
+ acct_arg_size(bprm, size / PAGE_SIZE);
+
/*
* We've historically supported up to 32 pages (ARG_MAX)
* of argument strings even with small stacks
* use STACK_TOP because that can depend on attributes which aren't
* configured yet.
*/
+ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
vma->vm_end = STACK_TOP_MAX;
vma->vm_start = vma->vm_end - PAGE_SIZE;
- vma->vm_flags = VM_STACK_FLAGS;
+ vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+ err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+ if (err)
+ goto err;
+
err = insert_vm_struct(mm, vma);
if (err)
goto err;
#else
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+{
+}
+
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
int write)
{
return err;
}
+struct user_arg_ptr {
+#ifdef CONFIG_COMPAT
+ bool is_compat;
+#endif
+ union {
+ const char __user *const __user *native;
+#ifdef CONFIG_COMPAT
+ compat_uptr_t __user *compat;
+#endif
+ } ptr;
+};
+
+static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
+{
+ const char __user *native;
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(argv.is_compat)) {
+ compat_uptr_t compat;
+
+ if (get_user(compat, argv.ptr.compat + nr))
+ return ERR_PTR(-EFAULT);
+
+ return compat_ptr(compat);
+ }
+#endif
+
+ if (get_user(native, argv.ptr.native + nr))
+ return ERR_PTR(-EFAULT);
+
+ return native;
+}
+
/*
* count() counts the number of strings in array ARGV.
*/
-static int count(char __user * __user * argv, int max)
+static int count(struct user_arg_ptr argv, int max)
{
int i = 0;
- if (argv != NULL) {
+ if (argv.ptr.native != NULL) {
for (;;) {
- char __user * p;
+ const char __user *p = get_user_arg_ptr(argv, i);
- if (get_user(p, argv))
- return -EFAULT;
if (!p)
break;
- argv++;
+
+ if (IS_ERR(p))
+ return -EFAULT;
+
if (i++ >= max)
return -E2BIG;
+
+ if (fatal_signal_pending(current))
+ return -ERESTARTNOHAND;
cond_resched();
}
}
* processes's memory to the new process's stack. The call to get_user_pages()
* ensures the destination page is created and not swapped out.
*/
-static int copy_strings(int argc, char __user * __user * argv,
+static int copy_strings(int argc, struct user_arg_ptr argv,
struct linux_binprm *bprm)
{
struct page *kmapped_page = NULL;
int ret;
while (argc-- > 0) {
- char __user *str;
+ const char __user *str;
int len;
unsigned long pos;
- if (get_user(str, argv+argc) ||
- !(len = strnlen_user(str, MAX_ARG_STRLEN))) {
- ret = -EFAULT;
+ ret = -EFAULT;
+ str = get_user_arg_ptr(argv, argc);
+ if (IS_ERR(str))
goto out;
- }
- if (!valid_arg_len(bprm, len)) {
- ret = -E2BIG;
+ len = strnlen_user(str, MAX_ARG_STRLEN);
+ if (!len)
+ goto out;
+
+ ret = -E2BIG;
+ if (!valid_arg_len(bprm, len))
goto out;
- }
/* We're going to work our way backwords. */
pos = bprm->p;
while (len > 0) {
int offset, bytes_to_copy;
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTNOHAND;
+ goto out;
+ }
+ cond_resched();
+
offset = pos % PAGE_SIZE;
if (offset == 0)
offset = PAGE_SIZE;
/*
* Like copy_strings, but get argv and its values from kernel memory.
*/
-int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
+int copy_strings_kernel(int argc, const char *const *__argv,
+ struct linux_binprm *bprm)
{
int r;
mm_segment_t oldfs = get_fs();
+ struct user_arg_ptr argv = {
+ .ptr.native = (const char __user *const __user *)__argv,
+ };
+
set_fs(KERNEL_DS);
- r = copy_strings(argc, (char __user * __user *)argv, bprm);
+ r = copy_strings(argc, argv, bprm);
set_fs(oldfs);
+
return r;
}
EXPORT_SYMBOL(copy_strings_kernel);
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
BUG_ON(new_start > new_end);
return -ENOMEM;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
- free_pgd_range(tlb, new_end, old_end, new_end,
+ free_pgd_range(&tlb, new_end, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
} else {
/*
* have constraints on va-space that make this illegal (IA64) -
* for the others its just a little faster.
*/
- free_pgd_range(tlb, old_start, old_end, new_end,
+ free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
}
- tlb_finish_mmu(tlb, new_end, old_end);
+ tlb_finish_mmu(&tlb, new_end, old_end);
/*
* Shrink the vma to just the new range. Always succeeds.
#else
stack_top = arch_align_stack(stack_top);
stack_top = PAGE_ALIGN(stack_top);
+
+ if (unlikely(stack_top < mmap_min_addr) ||
+ unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
+ return -ENOMEM;
+
stack_shift = vma->vm_end - stack_top;
bprm->p -= stack_shift;
else if (executable_stack == EXSTACK_DISABLE_X)
vm_flags &= ~VM_EXEC;
vm_flags |= mm->def_flags;
+ vm_flags |= VM_STACK_INCOMPLETE_SETUP;
ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
vm_flags);
goto out_unlock;
}
+ /* mprotect_fixup is overkill to remove the temporary stack flags */
+ vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
+
stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
stack_size = vma->vm_end - vma->vm_start;
/*
else
stack_base = vma->vm_start - stack_expand;
#endif
+ current->mm->start_stack = bprm->p;
ret = expand_stack(vma, stack_base);
if (ret)
ret = -EFAULT;
{
struct file *file;
int err;
+ static const struct open_flags open_exec_flags = {
+ .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+ .acc_mode = MAY_EXEC | MAY_OPEN,
+ .intent = LOOKUP_OPEN
+ };
- file = do_filp_open(AT_FDCWD, name,
- O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
- MAY_EXEC | MAY_OPEN);
+ file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
if (IS_ERR(file))
goto out;
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
goto exit;
- fsnotify_open(file->f_path.dentry);
+ fsnotify_open(file);
err = deny_write_access(file);
if (err)
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
- sync_mm_rss(tsk, old_mm);
+ sync_mm_rss(old_mm);
mm_release(tsk, old_mm);
if (old_mm) {
if (old_mm) {
up_read(&old_mm->mmap_sem);
BUG_ON(active_mm != old_mm);
+ setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
return 0;
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
- int count;
if (thread_group_empty(tsk))
goto no_thread_group;
spin_unlock_irq(lock);
return -EAGAIN;
}
+
sig->group_exit_task = tsk;
- zap_other_threads(tsk);
+ sig->notify_count = zap_other_threads(tsk);
+ if (!thread_group_leader(tsk))
+ sig->notify_count--;
- /* Account for the thread group leader hanging around: */
- count = thread_group_leader(tsk) ? 1 : 2;
- sig->notify_count = count;
- while (atomic_read(&sig->count) > count) {
+ while (sig->notify_count) {
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock_irq(lock);
schedule();
leader->group_leader = tsk;
tsk->exit_signal = SIGCHLD;
+ leader->exit_signal = -1;
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
leader->exit_state = EXIT_DEAD;
+
+ /*
+ * We are going to release_task()->ptrace_unlink() silently,
+ * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
+ * the tracer wont't block again waiting for this thread.
+ */
+ if (unlikely(leader->ptrace))
+ __wake_up_parent(leader, leader->parent);
write_unlock_irq(&tasklist_lock);
release_task(leader);
sig->notify_count = 0;
no_thread_group:
- if (current->mm)
- setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
+ /* we have changed execution domain */
+ tsk->exit_signal = SIGCHLD;
exit_itimers(sig);
flush_itimer_signals();
fdt = files_fdtable(files);
if (i >= fdt->max_fds)
break;
- set = fdt->close_on_exec->fds_bits[j];
+ set = fdt->close_on_exec[j];
if (!set)
continue;
- fdt->close_on_exec->fds_bits[j] = 0;
+ fdt->close_on_exec[j] = 0;
spin_unlock(&files->file_lock);
for ( ; set ; i++,set >>= 1) {
if (set & 1) {
task_unlock(tsk);
return buf;
}
+EXPORT_SYMBOL_GPL(get_task_comm);
void set_task_comm(struct task_struct *tsk, char *buf)
{
task_lock(tsk);
+ trace_task_rename(tsk, buf);
+
/*
* Threads may access current->comm without holding
* the task lock, so write the string carefully.
perf_event_comm(tsk);
}
+static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
+{
+ int i, ch;
+
+ /* Copies the binary name from after last slash */
+ for (i = 0; (ch = *(fn++)) != '\0';) {
+ if (ch == '/')
+ i = 0; /* overwrite what we wrote */
+ else
+ if (i < len - 1)
+ tcomm[i++] = ch;
+ }
+ tcomm[i] = '\0';
+}
+
int flush_old_exec(struct linux_binprm * bprm)
{
int retval;
set_mm_exe_file(bprm->mm, bprm->file);
+ filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
/*
* Release all of the old mmap stuff
*/
+ acct_arg_size(bprm, 0);
retval = exec_mmap(bprm->mm);
if (retval)
goto out;
bprm->mm = NULL; /* We're using it now */
- current->flags &= ~PF_RANDOMIZE;
+ set_fs(USER_DS);
+ current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD);
flush_thread();
current->personality &= ~bprm->per_clear;
}
EXPORT_SYMBOL(flush_old_exec);
-void setup_new_exec(struct linux_binprm * bprm)
+void would_dump(struct linux_binprm *bprm, struct file *file)
{
- int i, ch;
- char * name;
- char tcomm[sizeof(current->comm)];
+ if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
+ bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+}
+EXPORT_SYMBOL(would_dump);
+void setup_new_exec(struct linux_binprm * bprm)
+{
arch_pick_mmap_layout(current->mm);
/* This is the point of no return */
else
set_dumpable(current->mm, suid_dumpable);
- name = bprm->filename;
-
- /* Copies the binary name from after last slash */
- for (i=0; (ch = *(name++)) != '\0';) {
- if (ch == '/')
- i = 0; /* overwrite what we wrote */
- else
- if (i < (sizeof(tcomm) - 1))
- tcomm[i++] = ch;
- }
- tcomm[i] = '\0';
- set_task_comm(current, tcomm);
+ set_task_comm(current, bprm->tcomm);
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread() on
if (bprm->cred->uid != current_euid() ||
bprm->cred->gid != current_egid()) {
current->pdeath_signal = 0;
- } else if (file_permission(bprm->file, MAY_READ) ||
- bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
- set_dumpable(current->mm, suid_dumpable);
+ } else {
+ would_dump(bprm, bprm->file);
+ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
+ set_dumpable(current->mm, suid_dumpable);
}
/*
*/
int prepare_bprm_creds(struct linux_binprm *bprm)
{
- if (mutex_lock_interruptible(¤t->cred_guard_mutex))
+ if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
bprm->cred = prepare_exec_creds();
if (likely(bprm->cred))
return 0;
- mutex_unlock(¤t->cred_guard_mutex);
+ mutex_unlock(¤t->signal->cred_guard_mutex);
return -ENOMEM;
}
{
free_arg_pages(bprm);
if (bprm->cred) {
- mutex_unlock(¤t->cred_guard_mutex);
+ mutex_unlock(¤t->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
kfree(bprm);
* credentials; any time after this it may be unlocked.
*/
security_bprm_committed_creds(bprm);
- mutex_unlock(¤t->cred_guard_mutex);
+ mutex_unlock(¤t->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);
/*
* determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_guard_mutex to protect against
+ * - the caller must hold ->cred_guard_mutex to protect against
* PTRACE_ATTACH
*/
-int check_unsafe_exec(struct linux_binprm *bprm)
+static int check_unsafe_exec(struct linux_binprm *bprm)
{
struct task_struct *p = current, *t;
unsigned n_fs;
int res = 0;
- bprm->unsafe = tracehook_unsafe_exec(p);
+ if (p->ptrace) {
+ if (p->ptrace & PT_PTRACE_CAP)
+ bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
+ else
+ bprm->unsafe |= LSM_UNSAFE_PTRACE;
+ }
n_fs = 1;
- write_lock(&p->fs->lock);
+ spin_lock(&p->fs->lock);
rcu_read_lock();
for (t = next_thread(p); t != p; t = next_thread(t)) {
if (t->fs == p->fs)
res = 1;
}
}
- write_unlock(&p->fs->lock);
+ spin_unlock(&p->fs->lock);
return res;
}
ret = -EFAULT;
goto out;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
for (; offset < PAGE_SIZE && kaddr[offset];
offset++, bprm->p++)
;
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
put_arg_page(page);
if (offset == PAGE_SIZE)
unsigned int depth = bprm->recursion_depth;
int try,retval;
struct linux_binfmt *fmt;
+ pid_t old_pid, old_vpid;
retval = security_bprm_check(bprm);
if (retval)
return retval;
- /* kernel module loader fixup */
- /* so we don't try to load run modprobe in kernel space. */
- set_fs(USER_DS);
-
retval = audit_bprm(bprm);
if (retval)
return retval;
+ /* Need to fetch pid before load_binary changes it */
+ old_pid = current->pid;
+ rcu_read_lock();
+ old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
+ rcu_read_unlock();
+
retval = -ENOENT;
for (try=0; try<2; try++) {
read_lock(&binfmt_lock);
*/
bprm->recursion_depth = depth;
if (retval >= 0) {
- if (depth == 0)
- tracehook_report_exec(fmt, bprm, regs);
+ if (depth == 0) {
+ trace_sched_process_exec(current, old_pid, bprm);
+ ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
+ }
put_binfmt(fmt);
allow_write_access(bprm->file);
if (bprm->file)
}
}
read_unlock(&binfmt_lock);
+#ifdef CONFIG_MODULES
if (retval != -ENOEXEC || bprm->mm == NULL) {
break;
-#ifdef CONFIG_MODULES
} else {
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
if (printable(bprm->buf[0]) &&
printable(bprm->buf[2]) &&
printable(bprm->buf[3]))
break; /* -ENOEXEC */
+ if (try)
+ break; /* -ENOEXEC */
request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
-#endif
}
+#else
+ break;
+#endif
}
return retval;
}
/*
* sys_execve() executes a new program.
*/
-int do_execve(char * filename,
- char __user *__user *argv,
- char __user *__user *envp,
- struct pt_regs * regs)
+static int do_execve_common(const char *filename,
+ struct user_arg_ptr argv,
+ struct user_arg_ptr envp,
+ struct pt_regs *regs)
{
struct linux_binprm *bprm;
struct file *file;
struct files_struct *displaced;
bool clear_in_exec;
int retval;
+ const struct cred *cred = current_cred();
+
+ /*
+ * We move the actual failure in case of RLIMIT_NPROC excess from
+ * set*uid() to execve() because too many poorly written programs
+ * don't check setuid() return code. Here we additionally recheck
+ * whether NPROC limit is still exceeded.
+ */
+ if ((current->flags & PF_NPROC_EXCEEDED) &&
+ atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+ retval = -EAGAIN;
+ goto out_ret;
+ }
+
+ /* We're below the limit (still or again), so we don't want to make
+ * further execve() calls fail. */
+ current->flags &= ~PF_NPROC_EXCEEDED;
retval = unshare_files(&displaced);
if (retval)
if (retval < 0)
goto out;
- current->flags &= ~PF_KTHREAD;
retval = search_binary_handler(bprm,regs);
if (retval < 0)
goto out;
- current->stack_start = current->mm->start_stack;
-
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
return retval;
out:
- if (bprm->mm)
- mmput (bprm->mm);
+ if (bprm->mm) {
+ acct_arg_size(bprm, 0);
+ mmput(bprm->mm);
+ }
out_file:
if (bprm->file) {
return retval;
}
+int do_execve(const char *filename,
+ const char __user *const __user *__argv,
+ const char __user *const __user *__envp,
+ struct pt_regs *regs)
+{
+ struct user_arg_ptr argv = { .ptr.native = __argv };
+ struct user_arg_ptr envp = { .ptr.native = __envp };
+ return do_execve_common(filename, argv, envp, regs);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+ compat_uptr_t __user *__argv,
+ compat_uptr_t __user *__envp,
+ struct pt_regs *regs)
+{
+ struct user_arg_ptr argv = {
+ .is_compat = true,
+ .ptr.compat = __argv,
+ };
+ struct user_arg_ptr envp = {
+ .is_compat = true,
+ .ptr.compat = __envp,
+ };
+ return do_execve_common(filename, argv, envp, regs);
+}
+#endif
+
void set_binfmt(struct linux_binfmt *new)
{
struct mm_struct *mm = current->mm;
EXPORT_SYMBOL(set_binfmt);
+static int expand_corename(struct core_name *cn)
+{
+ char *old_corename = cn->corename;
+
+ cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
+ cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
+
+ if (!cn->corename) {
+ kfree(old_corename);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+ char *cur;
+ int need;
+ int ret;
+ va_list arg;
+
+ va_start(arg, fmt);
+ need = vsnprintf(NULL, 0, fmt, arg);
+ va_end(arg);
+
+ if (likely(need < cn->size - cn->used - 1))
+ goto out_printf;
+
+ ret = expand_corename(cn);
+ if (ret)
+ goto expand_fail;
+
+out_printf:
+ cur = cn->corename + cn->used;
+ va_start(arg, fmt);
+ vsnprintf(cur, need + 1, fmt, arg);
+ va_end(arg);
+ cn->used += need;
+ return 0;
+
+expand_fail:
+ return ret;
+}
+
+static void cn_escape(char *str)
+{
+ for (; *str; str++)
+ if (*str == '/')
+ *str = '!';
+}
+
+static int cn_print_exe_file(struct core_name *cn)
+{
+ struct file *exe_file;
+ char *pathbuf, *path;
+ int ret;
+
+ exe_file = get_mm_exe_file(current->mm);
+ if (!exe_file) {
+ char *commstart = cn->corename + cn->used;
+ ret = cn_printf(cn, "%s (path unknown)", current->comm);
+ cn_escape(commstart);
+ return ret;
+ }
+
+ pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+ if (!pathbuf) {
+ ret = -ENOMEM;
+ goto put_exe_file;
+ }
+
+ path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
+ goto free_buf;
+ }
+
+ cn_escape(path);
+
+ ret = cn_printf(cn, "%s", path);
+
+free_buf:
+ kfree(pathbuf);
+put_exe_file:
+ fput(exe_file);
+ return ret;
+}
+
/* format_corename will inspect the pattern parameter, and output a
* name into corename, which must have space for at least
* CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
*/
-static int format_corename(char *corename, long signr)
+static int format_corename(struct core_name *cn, long signr)
{
const struct cred *cred = current_cred();
const char *pat_ptr = core_pattern;
int ispipe = (*pat_ptr == '|');
- char *out_ptr = corename;
- char *const out_end = corename + CORENAME_MAX_SIZE;
- int rc;
int pid_in_pattern = 0;
+ int err = 0;
+
+ cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
+ cn->corename = kmalloc(cn->size, GFP_KERNEL);
+ cn->used = 0;
+
+ if (!cn->corename)
+ return -ENOMEM;
/* Repeat as long as we have more pattern to process and more output
space */
while (*pat_ptr) {
if (*pat_ptr != '%') {
- if (out_ptr == out_end)
+ if (*pat_ptr == 0)
goto out;
- *out_ptr++ = *pat_ptr++;
+ err = cn_printf(cn, "%c", *pat_ptr++);
} else {
switch (*++pat_ptr) {
+ /* single % at the end, drop that */
case 0:
goto out;
/* Double percent, output one percent */
case '%':
- if (out_ptr == out_end)
- goto out;
- *out_ptr++ = '%';
+ err = cn_printf(cn, "%c", '%');
break;
/* pid */
case 'p':
pid_in_pattern = 1;
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%d", task_tgid_vnr(current));
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
+ err = cn_printf(cn, "%d",
+ task_tgid_vnr(current));
break;
/* uid */
case 'u':
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%d", cred->uid);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
+ err = cn_printf(cn, "%d", cred->uid);
break;
/* gid */
case 'g':
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%d", cred->gid);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
+ err = cn_printf(cn, "%d", cred->gid);
break;
/* signal that caused the coredump */
case 's':
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%ld", signr);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
+ err = cn_printf(cn, "%ld", signr);
break;
/* UNIX time of coredump */
case 't': {
struct timeval tv;
do_gettimeofday(&tv);
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%lu", tv.tv_sec);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
+ err = cn_printf(cn, "%lu", tv.tv_sec);
break;
}
/* hostname */
- case 'h':
+ case 'h': {
+ char *namestart = cn->corename + cn->used;
down_read(&uts_sem);
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%s", utsname()->nodename);
+ err = cn_printf(cn, "%s",
+ utsname()->nodename);
up_read(&uts_sem);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
+ cn_escape(namestart);
break;
+ }
/* executable */
- case 'e':
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%s", current->comm);
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
+ case 'e': {
+ char *commstart = cn->corename + cn->used;
+ err = cn_printf(cn, "%s", current->comm);
+ cn_escape(commstart);
+ break;
+ }
+ case 'E':
+ err = cn_print_exe_file(cn);
break;
/* core limit size */
case 'c':
- rc = snprintf(out_ptr, out_end - out_ptr,
- "%lu", rlimit(RLIMIT_CORE));
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
+ err = cn_printf(cn, "%lu",
+ rlimit(RLIMIT_CORE));
break;
default:
break;
}
++pat_ptr;
}
+
+ if (err)
+ return err;
}
+
/* Backward compatibility with core_uses_pid:
*
* If core_pattern does not include a %p (as is the default)
* and core_uses_pid is set, then .%pid will be appended to
* the filename. Do not do this for piped commands. */
if (!ispipe && !pid_in_pattern && core_uses_pid) {
- rc = snprintf(out_ptr, out_end - out_ptr,
- ".%d", task_tgid_vnr(current));
- if (rc > out_end - out_ptr)
- goto out;
- out_ptr += rc;
+ err = cn_printf(cn, ".%d", task_tgid_vnr(current));
+ if (err)
+ return err;
}
out:
- *out_ptr = 0;
return ispipe;
}
-static int zap_process(struct task_struct *start)
+static int zap_process(struct task_struct *start, int exit_code)
{
struct task_struct *t;
int nr = 0;
start->signal->flags = SIGNAL_GROUP_EXIT;
+ start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
t = start;
do {
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
if (t != current && t->mm) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
mm->core_state = core_state;
- tsk->signal->group_exit_code = exit_code;
- nr = zap_process(tsk);
+ nr = zap_process(tsk, exit_code);
}
spin_unlock_irq(&tsk->sighand->siglock);
if (unlikely(nr < 0))
if (p->mm) {
if (unlikely(p->mm == mm)) {
lock_task_sighand(p, &flags);
- nr += zap_process(p);
+ nr += zap_process(p, exit_code);
unlock_task_sighand(p, &flags);
}
break;
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
- struct completion *vfork_done;
- int core_waiters;
+ int core_waiters = -EBUSY;
init_completion(&core_state->startup);
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
- core_waiters = zap_threads(tsk, mm, core_state, exit_code);
- up_write(&mm->mmap_sem);
- if (unlikely(core_waiters < 0))
- goto fail;
-
- /*
- * Make sure nobody is waiting for us to release the VM,
- * otherwise we can deadlock when we wait on each other
- */
- vfork_done = tsk->vfork_done;
- if (vfork_done) {
- tsk->vfork_done = NULL;
- complete(vfork_done);
- }
+ down_write(&mm->mmap_sem);
+ if (!mm->core_state)
+ core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+ up_write(&mm->mmap_sem);
- if (core_waiters)
+ if (core_waiters > 0)
wait_for_completion(&core_state->startup);
-fail:
+
return core_waiters;
}
}
+/*
+ * umh_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace. Specifically
+ * it sets up a pipe and installs it as fd 0 (stdin)
+ * for the process. Returns 0 on success, or
+ * PTR_ERR on failure.
+ * Note that it also sets the core limit to 1. This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+ struct file *rp, *wp;
+ struct fdtable *fdt;
+ struct coredump_params *cp = (struct coredump_params *)info->data;
+ struct files_struct *cf = current->files;
+
+ wp = create_write_pipe(0);
+ if (IS_ERR(wp))
+ return PTR_ERR(wp);
+
+ rp = create_read_pipe(wp, 0);
+ if (IS_ERR(rp)) {
+ free_write_pipe(wp);
+ return PTR_ERR(rp);
+ }
+
+ cp->file = wp;
+
+ sys_close(0);
+ fd_install(0, rp);
+ spin_lock(&cf->file_lock);
+ fdt = files_fdtable(cf);
+ __set_open_fd(0, fdt);
+ __clear_close_on_exec(0, fdt);
+ spin_unlock(&cf->file_lock);
+
+ /* and disallow core files too */
+ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+ return 0;
+}
+
void do_coredump(long signr, int exit_code, struct pt_regs *regs)
{
struct core_state core_state;
- char corename[CORENAME_MAX_SIZE + 1];
+ struct core_name cn;
struct mm_struct *mm = current->mm;
struct linux_binfmt * binfmt;
- struct inode * inode;
const struct cred *old_cred;
struct cred *cred;
int retval = 0;
int flag = 0;
- int ispipe = 0;
- char **helper_argv = NULL;
- int helper_argc = 0;
- int dump_count = 0;
+ int ispipe;
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.signr = signr,
binfmt = mm->binfmt;
if (!binfmt || !binfmt->core_dump)
goto fail;
-
- cred = prepare_creds();
- if (!cred) {
- retval = -ENOMEM;
+ if (!__get_dumpable(cprm.mm_flags))
goto fail;
- }
- down_write(&mm->mmap_sem);
- /*
- * If another thread got here first, or we are not dumpable, bail out.
- */
- if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
- up_write(&mm->mmap_sem);
- put_cred(cred);
+ cred = prepare_creds();
+ if (!cred)
goto fail;
- }
-
/*
* We cannot trust fsuid as being the "true" uid of the
* process nor do we know its entire history. We only know it
}
retval = coredump_wait(exit_code, &core_state);
- if (retval < 0) {
- put_cred(cred);
- goto fail;
- }
+ if (retval < 0)
+ goto fail_creds;
old_cred = override_creds(cred);
*/
clear_thread_flag(TIF_SIGPENDING);
- /*
- * lock_kernel() because format_corename() is controlled by sysctl, which
- * uses lock_kernel()
- */
- lock_kernel();
- ispipe = format_corename(corename, signr);
- unlock_kernel();
-
- if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
- goto fail_unlock;
+ ispipe = format_corename(&cn, signr);
if (ispipe) {
- if (cprm.limit == 0) {
+ int dump_count;
+ char **helper_argv;
+
+ if (ispipe < 0) {
+ printk(KERN_WARNING "format_corename failed\n");
+ printk(KERN_WARNING "Aborting core\n");
+ goto fail_corename;
+ }
+
+ if (cprm.limit == 1) {
/*
* Normally core limits are irrelevant to pipes, since
* we're not writing to the file system, but we use
- * cprm.limit of 0 here as a speacial value. Any
- * non-zero limit gets set to RLIM_INFINITY below, but
+ * cprm.limit of 1 here as a speacial value. Any
+ * non-1 limit gets set to RLIM_INFINITY below, but
* a limit of 0 skips the dump. This is a consistent
* way to catch recursive crashes. We can still crash
- * if the core_pattern binary sets RLIM_CORE = !0
+ * if the core_pattern binary sets RLIM_CORE = !1
* but it runs as root, and can do lots of stupid things
* Note that we use task_tgid_vnr here to grab the pid
* of the process group leader. That way we get the
* core_pattern process dies.
*/
printk(KERN_WARNING
- "Process %d(%s) has RLIMIT_CORE set to 0\n",
+ "Process %d(%s) has RLIMIT_CORE set to 1\n",
task_tgid_vnr(current), current->comm);
printk(KERN_WARNING "Aborting core\n");
goto fail_unlock;
}
+ cprm.limit = RLIM_INFINITY;
dump_count = atomic_inc_return(&core_dump_count);
if (core_pipe_limit && (core_pipe_limit < dump_count)) {
goto fail_dropcount;
}
- helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+ helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
if (!helper_argv) {
printk(KERN_WARNING "%s failed to allocate memory\n",
__func__);
goto fail_dropcount;
}
- cprm.limit = RLIM_INFINITY;
-
- /* SIGPIPE can happen, but it's just never processed */
- if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
- &cprm.file)) {
+ retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+ NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+ NULL, &cprm);
+ argv_free(helper_argv);
+ if (retval) {
printk(KERN_INFO "Core dump to %s pipe failed\n",
- corename);
- goto fail_dropcount;
+ cn.corename);
+ goto close_fail;
}
- } else
- cprm.file = filp_open(corename,
+ } else {
+ struct inode *inode;
+
+ if (cprm.limit < binfmt->min_coredump)
+ goto fail_unlock;
+
+ cprm.file = filp_open(cn.corename,
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
0600);
- if (IS_ERR(cprm.file))
- goto fail_dropcount;
- inode = cprm.file->f_path.dentry->d_inode;
- if (inode->i_nlink > 1)
- goto close_fail; /* multiple links - don't dump */
- if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
- goto close_fail;
-
- /* AK: actually i see no reason to not allow this for named pipes etc.,
- but keep the previous behaviour for now. */
- if (!ispipe && !S_ISREG(inode->i_mode))
- goto close_fail;
- /*
- * Dont allow local users get cute and trick others to coredump
- * into their pre-created files:
- */
- if (inode->i_uid != current_fsuid())
- goto close_fail;
- if (!cprm.file->f_op)
- goto close_fail;
- if (!cprm.file->f_op->write)
- goto close_fail;
- if (!ispipe &&
- do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
- goto close_fail;
+ if (IS_ERR(cprm.file))
+ goto fail_unlock;
- retval = binfmt->core_dump(&cprm);
+ inode = cprm.file->f_path.dentry->d_inode;
+ if (inode->i_nlink > 1)
+ goto close_fail;
+ if (d_unhashed(cprm.file->f_path.dentry))
+ goto close_fail;
+ /*
+ * AK: actually i see no reason to not allow this for named
+ * pipes etc, but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ goto close_fail;
+ /*
+ * Dont allow local users get cute and trick others to coredump
+ * into their pre-created files.
+ */
+ if (inode->i_uid != current_fsuid())
+ goto close_fail;
+ if (!cprm.file->f_op || !cprm.file->f_op->write)
+ goto close_fail;
+ if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ goto close_fail;
+ }
+ retval = binfmt->core_dump(&cprm);
if (retval)
current->signal->group_exit_code |= 0x80;
-close_fail:
+
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
- filp_close(cprm.file, NULL);
+close_fail:
+ if (cprm.file)
+ filp_close(cprm.file, NULL);
fail_dropcount:
- if (dump_count)
+ if (ispipe)
atomic_dec(&core_dump_count);
fail_unlock:
- if (helper_argv)
- argv_free(helper_argv);
-
+ kfree(cn.corename);
+fail_corename:
+ coredump_finish(mm);
revert_creds(old_cred);
+fail_creds:
put_cred(cred);
- coredump_finish(mm);
fail:
return;
}
+
+/*
+ * Core dumping helper functions. These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ */
+int dump_write(struct file *file, const void *addr, int nr)
+{
+ return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+EXPORT_SYMBOL(dump_write);
+
+int dump_seek(struct file *file, loff_t off)
+{
+ int ret = 1;
+
+ if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+ if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+ return 0;
+ } else {
+ char *buf = (char *)get_zeroed_page(GFP_KERNEL);
+
+ if (!buf)
+ return 0;
+ while (off > 0) {
+ unsigned long n = off;
+
+ if (n > PAGE_SIZE)
+ n = PAGE_SIZE;
+ if (!dump_write(file, buf, n)) {
+ ret = 0;
+ break;
+ }
+ off -= n;
+ }
+ free_page((unsigned long)buf);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(dump_seek);