fs, proc: fix ABBA deadlock in case of execution attempt of map_files/ entries
[linux-flexiantxendom0-3.2.10.git] / fs / proc / base.c
index 9d096e8..8e139c9 100644 (file)
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
+#ifdef CONFIG_HARDWALL
+#include <asm/hardwall.h>
+#endif
+#include <trace/events/oom.h>
 #include "internal.h"
 
 /* NOTE:
 struct pid_entry {
        char *name;
        int len;
-       mode_t mode;
+       umode_t mode;
        const struct inode_operations *iop;
        const struct file_operations *fop;
        union proc_op op;
@@ -130,6 +135,8 @@ struct pid_entry {
                NULL, &proc_single_file_operations,     \
                { .proc_show = show } )
 
+static int proc_fd_permission(struct inode *inode, int mask);
+
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
@@ -162,9 +169,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
        return result;
 }
 
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
 {
-       struct task_struct *task = get_proc_task(inode);
+       struct task_struct *task = get_proc_task(dentry->d_inode);
        int result = -ENOENT;
 
        if (task) {
@@ -179,9 +186,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
        return result;
 }
 
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
 {
-       struct task_struct *task = get_proc_task(inode);
+       struct task_struct *task = get_proc_task(dentry->d_inode);
        int result = -ENOENT;
 
        if (task) {
@@ -191,53 +198,9 @@ static int proc_root_link(struct inode *inode, struct path *path)
        return result;
 }
 
-/*
- * Return zero if current may access user memory in @task, -error if not.
- */
-static int check_mem_permission(struct task_struct *task)
-{
-       /*
-        * A task can always look at itself, in case it chooses
-        * to use system calls instead of load instructions.
-        */
-       if (task == current)
-               return 0;
-
-       /*
-        * If current is actively ptrace'ing, and would also be
-        * permitted to freshly attach with ptrace now, permit it.
-        */
-       if (task_is_stopped_or_traced(task)) {
-               int match;
-               rcu_read_lock();
-               match = (tracehook_tracer_task(task) == current);
-               rcu_read_unlock();
-               if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
-                       return 0;
-       }
-
-       /*
-        * Noone else is allowed.
-        */
-       return -EPERM;
-}
-
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-       struct mm_struct *mm;
-
-       if (mutex_lock_killable(&task->signal->cred_guard_mutex))
-               return NULL;
-
-       mm = get_task_mm(task);
-       if (mm && mm != current->mm &&
-                       !ptrace_may_access(task, PTRACE_MODE_READ)) {
-               mmput(mm);
-               mm = NULL;
-       }
-       mutex_unlock(&task->signal->cred_guard_mutex);
-
-       return mm;
+       return mm_access(task, PTRACE_MODE_READ);
 }
 
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
@@ -279,9 +242,9 @@ out:
 
 static int proc_pid_auxv(struct task_struct *task, char *buffer)
 {
-       int res = 0;
-       struct mm_struct *mm = get_task_mm(task);
-       if (mm) {
+       struct mm_struct *mm = mm_for_maps(task);
+       int res = PTR_ERR(mm);
+       if (mm && !IS_ERR(mm)) {
                unsigned int nwords = 0;
                do {
                        nwords += 2;
@@ -318,6 +281,23 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 }
 #endif /* CONFIG_KALLSYMS */
 
+/*
+ * Take @task's cred_guard_mutex (killable) and verify that current may
+ * ptrace-attach to @task.
+ *
+ * Returns 0 with the mutex held.  A nonzero mutex_lock_killable() result is
+ * passed through unchanged; -EPERM is returned, with the mutex already
+ * released, when ptrace_may_access() refuses PTRACE_MODE_ATTACH.
+ */
+static int lock_trace(struct task_struct *task)
+{
+       struct mutex *guard = &task->signal->cred_guard_mutex;
+       int ret = mutex_lock_killable(guard);
+
+       if (!ret && !ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+               /* Access denied: do not leave the mutex held on failure. */
+               mutex_unlock(guard);
+               ret = -EPERM;
+       }
+       return ret;
+}
+
+/* Release @task's cred_guard_mutex, i.e. undo a successful lock_trace(). */
+static void unlock_trace(struct task_struct *task)
+{
+       struct mutex *guard = &task->signal->cred_guard_mutex;
+
+       mutex_unlock(guard);
+}
+
 #ifdef CONFIG_STACKTRACE
 
 #define MAX_STACK_TRACE_DEPTH  64
@@ -327,6 +307,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 {
        struct stack_trace trace;
        unsigned long *entries;
+       int err;
        int i;
 
        entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
@@ -337,15 +318,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
        trace.max_entries       = MAX_STACK_TRACE_DEPTH;
        trace.entries           = entries;
        trace.skip              = 0;
-       save_stack_trace_tsk(task, &trace);
 
-       for (i = 0; i < trace.nr_entries; i++) {
-               seq_printf(m, "[<%p>] %pS\n",
-                          (void *)entries[i], (void *)entries[i]);
+       err = lock_trace(task);
+       if (!err) {
+               save_stack_trace_tsk(task, &trace);
+
+               for (i = 0; i < trace.nr_entries; i++) {
+                       seq_printf(m, "[<%pK>] %pS\n",
+                                  (void *)entries[i], (void *)entries[i]);
+               }
+               unlock_trace(task);
        }
        kfree(entries);
 
-       return 0;
+       return err;
 }
 #endif
 
@@ -508,18 +494,22 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
 {
        long nr;
        unsigned long args[6], sp, pc;
+       int res = lock_trace(task);
+       if (res)
+               return res;
 
        if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
-               return sprintf(buffer, "running\n");
-
-       if (nr < 0)
-               return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
-
-       return sprintf(buffer,
+               res = sprintf(buffer, "running\n");
+       else if (nr < 0)
+               res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+       else
+               res = sprintf(buffer,
                       "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
                       nr,
                       args[0], args[1], args[2], args[3], args[4], args[5],
                       sp, pc);
+       unlock_trace(task);
+       return res;
 }
 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
@@ -544,7 +534,7 @@ static int proc_fd_access_allowed(struct inode *inode)
        return allowed;
 }
 
-static int proc_setattr(struct dentry *dentry, struct iattr *attr)
+int proc_setattr(struct dentry *dentry, struct iattr *attr)
 {
        int error;
        struct inode *inode = dentry->d_inode;
@@ -568,122 +558,54 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr)
        return 0;
 }
 
-static const struct inode_operations proc_def_inode_operations = {
-       .setattr        = proc_setattr,
-};
-
-static int mounts_open_common(struct inode *inode, struct file *file,
-                             const struct seq_operations *op)
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+                                struct task_struct *task,
+                                int hide_pid_min)
 {
-       struct task_struct *task = get_proc_task(inode);
-       struct nsproxy *nsp;
-       struct mnt_namespace *ns = NULL;
-       struct path root;
-       struct proc_mounts *p;
-       int ret = -EINVAL;
-
-       if (task) {
-               rcu_read_lock();
-               nsp = task_nsproxy(task);
-               if (nsp) {
-                       ns = nsp->mnt_ns;
-                       if (ns)
-                               get_mnt_ns(ns);
-               }
-               rcu_read_unlock();
-               if (ns && get_task_root(task, &root) == 0)
-                       ret = 0;
-               put_task_struct(task);
-       }
-
-       if (!ns)
-               goto err;
-       if (ret)
-               goto err_put_ns;
-
-       ret = -ENOMEM;
-       p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
-       if (!p)
-               goto err_put_path;
-
-       file->private_data = &p->m;
-       ret = seq_open(file, op);
-       if (ret)
-               goto err_free;
-
-       p->m.private = p;
-       p->ns = ns;
-       p->root = root;
-       p->event = ns->event;
-
-       return 0;
-
- err_free:
-       kfree(p);
- err_put_path:
-       path_put(&root);
- err_put_ns:
-       put_mnt_ns(ns);
- err:
-       return ret;
+       if (pid->hide_pid < hide_pid_min)
+               return true;
+       if (in_group_p(pid->pid_gid))
+               return true;
+       return ptrace_may_access(task, PTRACE_MODE_READ);
 }
 
-static int mounts_release(struct inode *inode, struct file *file)
-{
-       struct proc_mounts *p = file->private_data;
-       path_put(&p->root);
-       put_mnt_ns(p->ns);
-       return seq_release(inode, file);
-}
 
-static unsigned mounts_poll(struct file *file, poll_table *wait)
+static int proc_pid_permission(struct inode *inode, int mask)
 {
-       struct proc_mounts *p = file->private_data;
-       unsigned res = POLLIN | POLLRDNORM;
-
-       poll_wait(file, &p->ns->poll, wait);
-       if (mnt_had_events(p))
-               res |= POLLERR | POLLPRI;
-
-       return res;
-}
+       struct pid_namespace *pid = inode->i_sb->s_fs_info;
+       struct task_struct *task;
+       bool has_perms;
 
-static int mounts_open(struct inode *inode, struct file *file)
-{
-       return mounts_open_common(inode, file, &mounts_op);
-}
+       task = get_proc_task(inode);
+       if (!task)
+               return -ESRCH;
+       has_perms = has_pid_permissions(pid, task, 1);
+       put_task_struct(task);
 
-static const struct file_operations proc_mounts_operations = {
-       .open           = mounts_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = mounts_release,
-       .poll           = mounts_poll,
-};
+       if (!has_perms) {
+               if (pid->hide_pid == 2) {
+                       /*
+                        * Let's make getdents(), stat(), and open()
+                        * consistent with each other.  If a process
+                        * may not stat() a file, it shouldn't be seen
+                        * in procfs at all.
+                        */
+                       return -ENOENT;
+               }
 
-static int mountinfo_open(struct inode *inode, struct file *file)
-{
-       return mounts_open_common(inode, file, &mountinfo_op);
+               return -EPERM;
+       }
+       return generic_permission(inode, mask);
 }
 
-static const struct file_operations proc_mountinfo_operations = {
-       .open           = mountinfo_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = mounts_release,
-       .poll           = mounts_poll,
-};
 
-static int mountstats_open(struct inode *inode, struct file *file)
-{
-       return mounts_open_common(inode, file, &mountstats_op);
-}
 
-static const struct file_operations proc_mountstats_operations = {
-       .open           = mountstats_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = mounts_release,
+static const struct inode_operations proc_def_inode_operations = {
+       .setattr        = proc_setattr,
 };
 
 #define PROC_BLOCK_SIZE        (3*1024)                /* 4K page size but our output routines use some slack for overruns */
@@ -757,130 +679,95 @@ static const struct file_operations proc_single_file_operations = {
 
 static int mem_open(struct inode* inode, struct file* file)
 {
-       file->private_data = (void*)((long)current->self_exec_id);
-       /* OK to pass negative loff_t, we can catch out-of-range */
-       file->f_mode |= FMODE_UNSIGNED_OFFSET;
-       return 0;
-}
-
-static ssize_t mem_read(struct file * file, char __user * buf,
-                       size_t count, loff_t *ppos)
-{
        struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-       char *page;
-       unsigned long src = *ppos;
-       int ret = -ESRCH;
        struct mm_struct *mm;
 
        if (!task)
-               goto out_no_task;
-
-       if (check_mem_permission(task))
-               goto out;
-
-       ret = -ENOMEM;
-       page = (char *)__get_free_page(GFP_TEMPORARY);
-       if (!page)
-               goto out;
-
-       ret = 0;
-       mm = get_task_mm(task);
-       if (!mm)
-               goto out_free;
-
-       ret = -EIO;
-       if (file->private_data != (void*)((long)current->self_exec_id))
-               goto out_put;
+               return -ESRCH;
 
-       ret = 0;
-       while (count > 0) {
-               int this_len, retval;
+       mm = mm_access(task, PTRACE_MODE_ATTACH);
+       put_task_struct(task);
 
-               this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-               retval = access_process_vm(task, src, page, this_len, 0);
-               if (!retval || check_mem_permission(task)) {
-                       if (!ret)
-                               ret = -EIO;
-                       break;
-               }
+       if (IS_ERR(mm))
+               return PTR_ERR(mm);
 
-               if (copy_to_user(buf, page, retval)) {
-                       ret = -EFAULT;
-                       break;
-               }
-               ret += retval;
-               src += retval;
-               buf += retval;
-               count -= retval;
+       if (mm) {
+               /* ensure this mm_struct can't be freed */
+               atomic_inc(&mm->mm_count);
+               /* but do not pin its memory */
+               mmput(mm);
        }
-       *ppos = src;
 
-out_put:
-       mmput(mm);
-out_free:
-       free_page((unsigned long) page);
-out:
-       put_task_struct(task);
-out_no_task:
-       return ret;
-}
+       /* OK to pass negative loff_t, we can catch out-of-range */
+       file->f_mode |= FMODE_UNSIGNED_OFFSET;
+       file->private_data = mm;
 
-#define mem_write NULL
+       return 0;
+}
 
-#ifndef mem_write
-/* This is a security hazard */
-static ssize_t mem_write(struct file * file, const char __user *buf,
-                        size_t count, loff_t *ppos)
+static ssize_t mem_rw(struct file *file, char __user *buf,
+                       size_t count, loff_t *ppos, int write)
 {
-       int copied;
+       struct mm_struct *mm = file->private_data;
+       unsigned long addr = *ppos;
+       ssize_t copied;
        char *page;
-       struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-       unsigned long dst = *ppos;
-
-       copied = -ESRCH;
-       if (!task)
-               goto out_no_task;
 
-       if (check_mem_permission(task))
-               goto out;
+       if (!mm)
+               return 0;
 
-       copied = -ENOMEM;
        page = (char *)__get_free_page(GFP_TEMPORARY);
        if (!page)
-               goto out;
+               return -ENOMEM;
 
        copied = 0;
+       if (!atomic_inc_not_zero(&mm->mm_users))
+               goto free;
+
        while (count > 0) {
-               int this_len, retval;
+               int this_len = min_t(int, count, PAGE_SIZE);
 
-               this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-               if (copy_from_user(page, buf, this_len)) {
+               if (write && copy_from_user(page, buf, this_len)) {
                        copied = -EFAULT;
                        break;
                }
-               retval = access_process_vm(task, dst, page, this_len, 1);
-               if (!retval) {
+
+               this_len = access_remote_vm(mm, addr, page, this_len, write);
+               if (!this_len) {
                        if (!copied)
                                copied = -EIO;
                        break;
                }
-               copied += retval;
-               buf += retval;
-               dst += retval;
-               count -= retval;                        
+
+               if (!write && copy_to_user(buf, page, this_len)) {
+                       copied = -EFAULT;
+                       break;
+               }
+
+               buf += this_len;
+               addr += this_len;
+               copied += this_len;
+               count -= this_len;
        }
-       *ppos = dst;
+       *ppos = addr;
+
+       mmput(mm);
+free:
        free_page((unsigned long) page);
-out:
-       put_task_struct(task);
-out_no_task:
        return copied;
 }
-#endif
+
+/* .read handler of proc_mem_operations: mem_rw() in the read direction. */
+static ssize_t mem_read(struct file *file, char __user *buf,
+                       size_t count, loff_t *ppos)
+{
+       const int write = 0;    /* copy from the target mm out to @buf */
+
+       return mem_rw(file, buf, count, ppos, write);
+}
+
+/* .write handler of proc_mem_operations: mem_rw() in the write direction. */
+static ssize_t mem_write(struct file *file, const char __user *buf,
+                        size_t count, loff_t *ppos)
+{
+       /*
+        * mem_rw() takes a non-const user pointer because it serves both
+        * directions; with write != 0 it only copies *from* the buffer.
+        */
+       char __user *ubuf = (char __user*)buf;
+
+       return mem_rw(file, ubuf, count, ppos, 1);
+}
 
 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 {
@@ -898,11 +785,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
        return file->f_pos;
 }
 
+/*
+ * .release handler of proc_mem_operations.
+ *
+ * mem_open() stores either NULL or an mm_struct pinned through mm_count in
+ * file->private_data; drop that reference here.  Always returns 0.
+ */
+static int mem_release(struct inode *inode, struct file *file)
+{
+       struct mm_struct *pinned = file->private_data;
+
+       if (!pinned)
+               return 0;
+
+       mmdrop(pinned);
+       return 0;
+}
+
 static const struct file_operations proc_mem_operations = {
        .llseek         = mem_lseek,
        .read           = mem_read,
        .write          = mem_write,
        .open           = mem_open,
+       .release        = mem_release,
 };
 
 static ssize_t environ_read(struct file *file, char __user *buf,
@@ -917,20 +813,18 @@ static ssize_t environ_read(struct file *file, char __user *buf,
        if (!task)
                goto out_no_task;
 
-       if (!ptrace_may_access(task, PTRACE_MODE_READ))
-               goto out;
-
        ret = -ENOMEM;
        page = (char *)__get_free_page(GFP_TEMPORARY);
        if (!page)
                goto out;
 
-       ret = 0;
 
-       mm = get_task_mm(task);
-       if (!mm)
+       mm = mm_for_maps(task);
+       ret = PTR_ERR(mm);
+       if (!mm || IS_ERR(mm))
                goto out_free;
 
+       ret = 0;
        while (count > 0) {
                int this_len, retval, max_len;
 
@@ -1005,7 +899,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 {
        struct task_struct *task;
        char buffer[PROC_NUMBUF];
-       long oom_adjust;
+       int oom_adjust;
        unsigned long flags;
        int err;
 
@@ -1017,7 +911,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
                goto out;
        }
 
-       err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
+       err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
        if (err)
                goto out;
        if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
@@ -1048,21 +942,13 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
                goto err_sighand;
        }
 
-       if (oom_adjust != task->signal->oom_adj) {
-               if (oom_adjust == OOM_DISABLE)
-                       atomic_inc(&task->mm->oom_disable_count);
-               if (task->signal->oom_adj == OOM_DISABLE)
-                       atomic_dec(&task->mm->oom_disable_count);
-       }
-
        /*
         * Warn that /proc/pid/oom_adj is deprecated, see
         * Documentation/feature-removal-schedule.txt.
         */
-       printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, "
-                       "please use /proc/%d/oom_score_adj instead.\n",
-                       current->comm, task_pid_nr(current),
-                       task_pid_nr(task), task_pid_nr(task));
+       printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+                 current->comm, task_pid_nr(current), task_pid_nr(task),
+                 task_pid_nr(task));
        task->signal->oom_adj = oom_adjust;
        /*
         * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
@@ -1073,6 +959,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        else
                task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
                                                                -OOM_DISABLE;
+       trace_oom_score_adj_update(task);
 err_sighand:
        unlock_task_sighand(task, &flags);
 err_task_lock:
@@ -1114,7 +1001,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        struct task_struct *task;
        char buffer[PROC_NUMBUF];
        unsigned long flags;
-       long oom_score_adj;
+       int oom_score_adj;
        int err;
 
        memset(buffer, 0, sizeof(buffer));
@@ -1125,7 +1012,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                goto out;
        }
 
-       err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
+       err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
        if (err)
                goto out;
        if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
@@ -1157,15 +1044,10 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                goto err_sighand;
        }
 
-       if (oom_score_adj != task->signal->oom_score_adj) {
-               if (oom_score_adj == OOM_SCORE_ADJ_MIN)
-                       atomic_inc(&task->mm->oom_disable_count);
-               if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-                       atomic_dec(&task->mm->oom_disable_count);
-       }
        task->signal->oom_score_adj = oom_score_adj;
        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = oom_score_adj;
+       trace_oom_score_adj_update(task);
        /*
         * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
         * always attainable.
@@ -1216,9 +1098,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
        ssize_t length;
        uid_t loginuid;
 
-       if (!capable(CAP_AUDIT_CONTROL))
-               return -EPERM;
-
        rcu_read_lock();
        if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
                rcu_read_unlock();
@@ -1247,7 +1126,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                goto out_free_page;
 
        }
-       length = audit_set_loginuid(current, loginuid);
+       length = audit_set_loginuid(loginuid);
        if (likely(length == 0))
                length = count;
 
@@ -1414,7 +1293,7 @@ sched_autogroup_write(struct file *file, const char __user *buf,
        struct inode *inode = file->f_path.dentry->d_inode;
        struct task_struct *p;
        char buffer[PROC_NUMBUF];
-       long nice;
+       int nice;
        int err;
 
        memset(buffer, 0, sizeof(buffer));
@@ -1423,16 +1302,15 @@ sched_autogroup_write(struct file *file, const char __user *buf,
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;
 
-       err = strict_strtol(strstrip(buffer), 0, &nice);
-       if (err)
-               return -EINVAL;
+       err = kstrtoint(strstrip(buffer), 0, &nice);
+       if (err < 0)
+               return err;
 
        p = get_proc_task(inode);
        if (!p)
                return -ESRCH;
 
-       err = nice;
-       err = proc_sched_autogroup_set_nice(p, &err);
+       err = proc_sched_autogroup_set_nice(p, nice);
        if (err)
                count = err;
 
@@ -1522,64 +1400,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
        .release        = single_release,
 };
 
-/*
- * We added or removed a vma mapping the executable. The vmas are only mapped
- * during exec and are not mapped with the mmap system call.
- * Callers must hold down_write() on the mm's mmap_sem for these
- */
-void added_exe_file_vma(struct mm_struct *mm)
-{
-       mm->num_exe_file_vmas++;
-}
-
-void removed_exe_file_vma(struct mm_struct *mm)
-{
-       mm->num_exe_file_vmas--;
-       if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
-               fput(mm->exe_file);
-               mm->exe_file = NULL;
-       }
-
-}
-
-void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
-{
-       if (new_exe_file)
-               get_file(new_exe_file);
-       if (mm->exe_file)
-               fput(mm->exe_file);
-       mm->exe_file = new_exe_file;
-       mm->num_exe_file_vmas = 0;
-}
-
-struct file *get_mm_exe_file(struct mm_struct *mm)
-{
-       struct file *exe_file;
-
-       /* We need mmap_sem to protect against races with removal of
-        * VM_EXECUTABLE vmas */
-       down_read(&mm->mmap_sem);
-       exe_file = mm->exe_file;
-       if (exe_file)
-               get_file(exe_file);
-       up_read(&mm->mmap_sem);
-       return exe_file;
-}
-
-void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
-{
-       /* It's safe to write the exe_file pointer without exe_file_lock because
-        * this is called during fork when the task is not yet in /proc */
-       newmm->exe_file = get_mm_exe_file(oldmm);
-}
-
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
        struct task_struct *task;
        struct mm_struct *mm;
        struct file *exe_file;
 
-       task = get_proc_task(inode);
+       task = get_proc_task(dentry->d_inode);
        if (!task)
                return -ENOENT;
        mm = get_task_mm(task);
@@ -1609,7 +1436,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
        if (!proc_fd_access_allowed(inode))
                goto out;
 
-       error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+       error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
 out:
        return ERR_PTR(error);
 }
@@ -1648,7 +1475,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
        if (!proc_fd_access_allowed(inode))
                goto out;
 
-       error = PROC_I(inode)->op.proc_get_link(inode, &path);
+       error = PROC_I(inode)->op.proc_get_link(dentry, &path);
        if (error)
                goto out;
 
@@ -1682,8 +1509,7 @@ static int task_dumpable(struct task_struct *task)
        return 0;
 }
 
-
-static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
+struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
 {
        struct inode * inode;
        struct proc_inode *ei;
@@ -1725,11 +1551,12 @@ out_unlock:
        return NULL;
 }
 
-static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
        struct inode *inode = dentry->d_inode;
        struct task_struct *task;
        const struct cred *cred;
+       struct pid_namespace *pid = dentry->d_sb->s_fs_info;
 
        generic_fillattr(inode, stat);
 
@@ -1738,6 +1565,14 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
        stat->gid = 0;
        task = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (task) {
+               if (!has_pid_permissions(pid, task, 2)) {
+                       rcu_read_unlock();
+                       /*
+                        * This doesn't prevent learning whether PID exists,
+                        * it only makes getattr() consistent with readdir().
+                        */
+                       return -ENOENT;
+               }
                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
                    task_dumpable(task)) {
                        cred = __task_cred(task);
@@ -1766,7 +1601,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
  * made this apply to all per process world readable and executable
  * directories.
  */
-static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
+int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *inode;
        struct task_struct *task;
@@ -1808,7 +1643,7 @@ static int pid_delete_dentry(const struct dentry * dentry)
        return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
 }
 
-static const struct dentry_operations pid_dentry_operations =
+const struct dentry_operations pid_dentry_operations =
 {
        .d_revalidate   = pid_revalidate,
        .d_delete       = pid_delete_dentry,
@@ -1816,9 +1651,6 @@ static const struct dentry_operations pid_dentry_operations =
 
 /* Lookups */
 
-typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
-                               struct task_struct *, const void *);
-
 /*
  * Fill a directory entry.
  *
@@ -1831,8 +1663,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
  * reported by readdir in sync with the inode numbers reported
  * by stat.
  */
-static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-       char *name, int len,
+int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+       const char *name, int len,
        instantiate_t instantiate, struct task_struct *task, const void *ptr)
 {
        struct dentry *child, *dir = filp->f_path.dentry;
@@ -1916,6 +1748,14 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
                spin_lock(&files->file_lock);
                file = fcheck_files(files, fd);
                if (file) {
+                       unsigned int f_flags;
+                       struct fdtable *fdt;
+
+                       fdt = files_fdtable(files);
+                       f_flags = file->f_flags & ~O_CLOEXEC;
+                       if (close_on_exec(fd, fdt))
+                               f_flags |= O_CLOEXEC;
+
                        if (path) {
                                *path = file->f_path;
                                path_get(&file->f_path);
@@ -1925,7 +1765,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
                                         "pos:\t%lli\n"
                                         "flags:\t0%o\n",
                                         (long long) file->f_pos,
-                                        file->f_flags);
+                                        f_flags);
                        spin_unlock(&files->file_lock);
                        put_files_struct(files);
                        return 0;
@@ -1936,9 +1776,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
        return -ENOENT;
 }
 
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-       return proc_fd_info(inode, path, NULL);
+       return proc_fd_info(dentry->d_inode, path, NULL);
 }
 
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -2159,17 +1999,358 @@ static const struct file_operations proc_fd_operations = {
        .llseek         = default_llseek,
 };
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+
 /*
- * /proc/pid/fd needs a special permission handler so that a process can still
- * access /proc/self/fd after it has executed a setuid().
+ * dname_to_vma_addr - parse a dentry name into a vma address range
+ *
+ * The name is scanned with the "%lx-%lx" format, i.e. two hexadecimal
+ * numbers separated by a dash, which are stored in *start and *end.
+ *
+ * Returns 0 when both numbers were converted and -EINVAL otherwise.
+ *
+ * NOTE(review): only the conversion count is checked, so characters that
+ * follow the second number do not appear to be rejected here - confirm.
  */
-static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags)
+static int dname_to_vma_addr(struct dentry *dentry,
+                            unsigned long *start, unsigned long *end)
+{
+       if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+               return -EINVAL;
+
+       return 0;
+}
+/*
+ * map_files_d_revalidate - check whether a cached map_files dentry still
+ * names an existing vma of the task
+ *
+ * Returns -ECHILD straight away when @nd has LOOKUP_RCU set, -EACCES when
+ * the caller lacks CAP_SYS_ADMIN, 1 when find_exact_vma() still finds a vma
+ * for the range encoded in the dentry name, and 0 in every other case (no
+ * task, ptrace read access denied, no mm, unparsable name, no such vma).
+ *
+ * When the vma exists the inode's uid/gid are refreshed: the task's
+ * euid/egid if the task is dumpable, 0/0 otherwise.  Every exit that goes
+ * through the labels with status <= 0 also drops the dentry.
+ */
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-       int rv;
+       unsigned long vm_start, vm_end;
+       bool exact_vma_exists = false;
+       struct mm_struct *mm = NULL;
+       struct task_struct *task;
+       const struct cred *cred;
+       struct inode *inode;
+       int status = 0;
 
-       if (flags & IPERM_FLAG_RCU)
+       if (nd && nd->flags & LOOKUP_RCU)       /* nd may be NULL; RCU-walk lookups bail out here */
                return -ECHILD;
-       rv = generic_permission(inode, mask, flags, NULL);
+
+       if (!capable(CAP_SYS_ADMIN)) {
+               status = -EACCES;
+               goto out_notask;        /* no task reference taken yet */
+       }
+
+       inode = dentry->d_inode;
+       task = get_proc_task(inode);
+       if (!task)
+               goto out_notask;        /* status is still 0 */
+
+       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+               goto out;
+
+       mm = get_task_mm(task);
+       if (!mm)
+               goto out;
+
+       if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+               down_read(&mm->mmap_sem);       /* vma lookup is done with mmap_sem held for read */
+               exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+               up_read(&mm->mmap_sem);
+       }
+
+       mmput(mm);      /* mm is not needed past this point */
+
+       if (exact_vma_exists) {
+               if (task_dumpable(task)) {
+                       rcu_read_lock();
+                       cred = __task_cred(task);
+                       inode->i_uid = cred->euid;
+                       inode->i_gid = cred->egid;
+                       rcu_read_unlock();
+               } else {
+                       inode->i_uid = 0;
+                       inode->i_gid = 0;
+               }
+               security_task_to_inode(task, inode);
+               status = 1;     /* dentry is still valid */
+       }
+
+out:
+       put_task_struct(task);
+
+out_notask:
+       if (status <= 0)        /* invalid or error: drop the dentry */
+               d_drop(dentry);
+
+       return status;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {      /* installed on each map_files entry by proc_map_files_instantiate() */
+       .d_revalidate   = map_files_d_revalidate,       /* invalidates entries whose vma no longer exists */
+       .d_delete       = pid_delete_dentry,
+};
+
+/*
+ * proc_map_files_get_link - resolve a map_files symlink to its backing file
+ * @dentry: map_files entry whose name encodes the vma range as "<start>-<end>"
+ * @path:   filled in with the mapped file's path on success
+ *
+ * Returns 0 with @path set and a reference taken via path_get(), -ENOENT if
+ * the task or its mm cannot be obtained or if find_exact_vma() yields no
+ * file-backed vma for the range, and -EINVAL if the dentry name cannot be
+ * parsed.  @path is only written on success.
+ */
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+       unsigned long vm_start, vm_end;
+       struct vm_area_struct *vma;
+       struct task_struct *task;
+       struct mm_struct *mm;
+       int rc;
+
+       rc = -ENOENT;
+       task = get_proc_task(dentry->d_inode);
+       if (!task)
+               goto out;
+
+       mm = get_task_mm(task);
+       put_task_struct(task);  /* only the mm is needed from here on */
+       if (!mm)
+               goto out;
+
+       rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+       if (rc)
+               goto out_mmput;
+
+       /*
+        * dname_to_vma_addr() left rc at 0.  Reset it so that a missing vma,
+        * or a vma without a file, is reported as -ENOENT instead of as
+        * success with @path left unfilled.
+        */
+       rc = -ENOENT;
+       down_read(&mm->mmap_sem);
+       vma = find_exact_vma(mm, vm_start, vm_end);
+       if (vma && vma->vm_file) {
+               *path = vma->vm_file->f_path;
+               path_get(path);
+               rc = 0;
+       }
+       up_read(&mm->mmap_sem);
+
+out_mmput:
+       mmput(mm);
+out:
+       return rc;
+}
+
+struct map_files_info {        /* one directory entry collected by proc_map_files_readdir() */
+       struct file     *file;  /* the vma's vm_file, pinned with get_file() */
+       unsigned long   len;    /* length of name, as returned by snprintf() */
+       unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+/*
+ * proc_map_files_instantiate - create the symlink inode for one map_files
+ * entry and attach it to @dentry
+ *
+ * @ptr is the struct file backing the vma.  Its FMODE_READ / FMODE_WRITE
+ * bits are turned into owner read / write permission bits on the S_IFLNK
+ * inode; the link target is resolved later by proc_map_files_get_link().
+ *
+ * Returns NULL once the dentry has been added, or ERR_PTR(-ENOENT) when
+ * @ptr is NULL or proc_pid_make_inode() fails.
+ */
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+                          struct task_struct *task, const void *ptr)
+{
+       const struct file *file = ptr;
+       struct proc_inode *ei;
+       struct inode *inode;
+
+       if (!file)      /* e.g. lookup of a vma that has no vm_file */
+               return ERR_PTR(-ENOENT);
+
+       inode = proc_pid_make_inode(dir->i_sb, task);
+       if (!inode)
+               return ERR_PTR(-ENOENT);
+
+       ei = PROC_I(inode);
+       ei->op.proc_get_link = proc_map_files_get_link;
+
+       inode->i_op = &proc_pid_link_inode_operations;
+       inode->i_size = 64;
+       inode->i_mode = S_IFLNK;
+
+       if (file->f_mode & FMODE_READ)  /* mirror the file's open mode in the link's owner bits */
+               inode->i_mode |= S_IRUSR;
+       if (file->f_mode & FMODE_WRITE)
+               inode->i_mode |= S_IWUSR;
+
+       d_set_d_op(dentry, &tid_map_files_dentry_operations);
+       d_add(dentry, inode);
+
+       return NULL;
+}
+/*
+ * proc_map_files_lookup - look up a "<start>-<end>" name in a task's
+ * map_files directory
+ *
+ * The caller needs CAP_SYS_ADMIN and ptrace read access to the task that
+ * @dir belongs to; otherwise ERR_PTR(-EACCES) is returned.
+ * ERR_PTR(-ENOENT) is returned when the task cannot be obtained, the name
+ * does not parse, the task has no mm, or find_exact_vma() finds nothing for
+ * the parsed range.  Otherwise the result is whatever
+ * proc_map_files_instantiate() returns for the vma's vm_file.
+ */
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+               struct dentry *dentry, struct nameidata *nd)
+{
+       unsigned long vm_start, vm_end;
+       struct vm_area_struct *vma;
+       struct task_struct *task;
+       struct dentry *result;
+       struct mm_struct *mm;
+
+       result = ERR_PTR(-EACCES);
+       if (!capable(CAP_SYS_ADMIN))
+               goto out;
+
+       result = ERR_PTR(-ENOENT);
+       task = get_proc_task(dir);
+       if (!task)
+               goto out;
+
+       result = ERR_PTR(-EACCES);
+       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+               goto out_put_task;
+
+       result = ERR_PTR(-ENOENT);      /* default for every failure below */
+       if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+               goto out_put_task;
+
+       mm = get_task_mm(task);
+       if (!mm)
+               goto out_put_task;
+
+       down_read(&mm->mmap_sem);
+       vma = find_exact_vma(mm, vm_start, vm_end);
+       if (!vma)
+               goto out_no_vma;
+
+       result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);   /* called with mmap_sem still held */
+
+out_no_vma:
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+out_put_task:
+       put_task_struct(task);
+out:
+       return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {       /* inode ops of the map_files directory */
+       .lookup         = proc_map_files_lookup,
+       .permission     = proc_fd_permission,   /* same permission handler as the fd directory */
+       .setattr        = proc_setattr,
+};
+/*
+ * proc_map_files_readdir - list a task's file-backed vmas as
+ * "<start>-<end>" directory entries
+ *
+ * The caller needs CAP_SYS_ADMIN and ptrace read access to the task
+ * (-EACCES otherwise).  Returns -ENOENT when the task cannot be obtained,
+ * -ENOMEM when the temporary entry array cannot be allocated, the
+ * proc_fill_cache() result when that reports non-zero, and 0 otherwise
+ * (including when the task has no mm).
+ *
+ * Positions 0 and 1 are "." and ".."; every vma with a vm_file takes one
+ * position after that, in mm->mmap list order.
+ */
+static int
+proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+       struct dentry *dentry = filp->f_path.dentry;
+       struct inode *inode = dentry->d_inode;
+       struct vm_area_struct *vma;
+       struct task_struct *task;
+       struct mm_struct *mm;
+       ino_t ino;
+       int ret;
+
+       ret = -EACCES;
+       if (!capable(CAP_SYS_ADMIN))
+               goto out;
+
+       ret = -ENOENT;
+       task = get_proc_task(inode);
+       if (!task)
+               goto out;
+
+       ret = -EACCES;
+       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+               goto out_put_task;
+
+       ret = 0;
+       switch (filp->f_pos) {
+       case 0:
+               ino = inode->i_ino;
+               if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+                       goto out_put_task;
+               filp->f_pos++;  /* fall through */
+       case 1:
+               ino = parent_ino(dentry);
+               if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+                       goto out_put_task;
+               filp->f_pos++;  /* fall through */
+       default:
+       {
+               unsigned long nr_files, pos, i;
+               struct flex_array *fa = NULL;
+               struct map_files_info info;
+               struct map_files_info *p;
+
+               mm = get_task_mm(task);
+               if (!mm)
+                       goto out_put_task;      /* ret is 0 here */
+               down_read(&mm->mmap_sem);
+
+               nr_files = 0;
+
+               /*
+                * We need two passes here:
+                *
+                *  1) Collect the vmas of mapped files with mmap_sem taken
+                *  2) Release mmap_sem and instantiate the entries
+                *
+                * Otherwise lockdep complains, because the filldir()
+                * routine might need to take mmap_sem in might_fault().
+                *
+                * The loop below only counts the file-backed vmas that lie
+                * at or beyond filp->f_pos, so the array can be sized
+                * before anything is stored in it.
+                */
+
+               for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+                       if (vma->vm_file && ++pos > filp->f_pos)
+                               nr_files++;
+               }
+
+               if (nr_files) {
+                       fa = flex_array_alloc(sizeof(info), nr_files,
+                                               GFP_KERNEL);
+                       if (!fa || flex_array_prealloc(fa, 0, nr_files,
+                                                       GFP_KERNEL)) {
+                               ret = -ENOMEM;
+                               if (fa)
+                                       flex_array_free(fa);
+                               up_read(&mm->mmap_sem);
+                               mmput(mm);
+                               goto out_put_task;
+                       }
+                       for (i = 0, vma = mm->mmap, pos = 2; vma;
+                                       vma = vma->vm_next) {
+                               if (!vma->vm_file)
+                                       continue;
+                               if (++pos <= filp->f_pos)       /* already returned by an earlier call */
+                                       continue;
+
+                               get_file(vma->vm_file); /* pin the file; it is used after mmap_sem is dropped */
+                               info.file = vma->vm_file;
+                               info.len = snprintf(info.name,
+                                               sizeof(info.name), "%lx-%lx",
+                                               vma->vm_start, vma->vm_end);
+                               if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+                                       BUG();  /* NOTE(review): presumably cannot fail after the prealloc above - confirm */
+                       }
+               }
+               up_read(&mm->mmap_sem); /* second pass runs without mmap_sem */
+
+               for (i = 0; i < nr_files; i++) {
+                       p = flex_array_get(fa, i);
+                       ret = proc_fill_cache(filp, dirent, filldir,
+                                             p->name, p->len,
+                                             proc_map_files_instantiate,
+                                             task, p->file);
+                       if (ret)
+                               break;  /* entry i keeps its file ref; the loop below drops it */
+                       filp->f_pos++;
+                       fput(p->file);  /* drop the ref taken with get_file() */
+               }
+               for (; i < nr_files; i++) {
+                       /*
+                        * On error, don't forget to put the
+                        * remaining file refs.
+                        */
+                       p = flex_array_get(fa, i);
+                       fput(p->file);
+               }
+               if (fa)
+                       flex_array_free(fa);
+               mmput(mm);
+       }
+       }
+
+out_put_task:
+       put_task_struct(task);
+out:
+       return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {      /* file ops of the map_files directory */
+       .read           = generic_read_dir,
+       .readdir        = proc_map_files_readdir,       /* emits one "<start>-<end>" entry per file-backed vma */
+       .llseek         = default_llseek,
+};
+
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+/*
+ * /proc/pid/fd needs a special permission handler so that a process can still
+ * access /proc/self/fd after it has executed a setuid().
+ */
+static int proc_fd_permission(struct inode *inode, int mask)
+{
+       int rv = generic_permission(inode, mask);
        if (rv == 0)
                return 0;
        if (task_pid(current) == proc_pid(inode))
@@ -2254,7 +2435,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
        ei = PROC_I(inode);
        inode->i_mode = p->mode;
        if (S_ISDIR(inode->i_mode))
-               inode->i_nlink = 2;     /* Use getattr to fix if necessary */
+               set_nlink(inode, 2);    /* Use getattr to fix if necessary */
        if (p->iop)
                inode->i_op = p->iop;
        if (p->fop)
@@ -2620,35 +2801,6 @@ static const struct pid_entry proc_base_stuff[] = {
                &proc_self_inode_operations, NULL, {}),
 };
 
-/*
- *     Exceptional case: normally we are not allowed to unhash a busy
- * directory. In this case, however, we can do it - no aliasing problems
- * due to the way we treat inodes.
- */
-static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-       struct inode *inode;
-       struct task_struct *task;
-
-       if (nd->flags & LOOKUP_RCU)
-               return -ECHILD;
-
-       inode = dentry->d_inode;
-       task = get_proc_task(inode);
-       if (task) {
-               put_task_struct(task);
-               return 1;
-       }
-       d_drop(dentry);
-       return 0;
-}
-
-static const struct dentry_operations proc_base_dentry_operations =
-{
-       .d_revalidate   = proc_base_revalidate,
-       .d_delete       = pid_delete_dentry,
-};
-
 static struct dentry *proc_base_instantiate(struct inode *dir,
        struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@@ -2677,7 +2829,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 
        inode->i_mode = p->mode;
        if (S_ISDIR(inode->i_mode))
-               inode->i_nlink = 2;
+               set_nlink(inode, 2);
        if (S_ISLNK(inode->i_mode))
                inode->i_size = 64;
        if (p->iop)
@@ -2685,7 +2837,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
        if (p->fop)
                inode->i_fop = p->fop;
        ei->op = p->op;
-       d_set_d_op(dentry, &proc_base_dentry_operations);
        d_add(dentry, inode);
        error = NULL;
 out:
@@ -2737,6 +2888,16 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 {
        struct task_io_accounting acct = task->ioac;
        unsigned long flags;
+       int result;
+
+       result = mutex_lock_killable(&task->signal->cred_guard_mutex);
+       if (result)
+               return result;
+
+       if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+               result = -EACCES;
+               goto out_unlock;
+       }
 
        if (whole && lock_task_sighand(task, &flags)) {
                struct task_struct *t = task;
@@ -2747,7 +2908,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 
                unlock_task_sighand(task, &flags);
        }
-       return sprintf(buffer,
+       result = sprintf(buffer,
                        "rchar: %llu\n"
                        "wchar: %llu\n"
                        "syscr: %llu\n"
@@ -2762,6 +2923,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
                        (unsigned long long)acct.read_bytes,
                        (unsigned long long)acct.write_bytes,
                        (unsigned long long)acct.cancelled_write_bytes);
+out_unlock:
+       mutex_unlock(&task->signal->cred_guard_mutex);
+       return result;
 }
 
 static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
@@ -2778,8 +2942,12 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
                                 struct pid *pid, struct task_struct *task)
 {
-       seq_printf(m, "%08x\n", task->personality);
-       return 0;
+       int err = lock_trace(task);     /* NOTE(review): presumably an access check plus lock on the task - confirm */
+       if (!err) {
+               seq_printf(m, "%08x\n", task->personality);     /* personality as 8 hex digits */
+               unlock_trace(task);
+       }
+       return err;     /* non-zero lock_trace() result: nothing was printed */
 }
 
 /*
@@ -2791,14 +2959,18 @@ static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+       DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+#endif
        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+       DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
        DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
 #endif
        REG("environ",    S_IRUSR, proc_environ_operations),
        INF("auxv",       S_IRUSR, proc_pid_auxv),
        ONE("status",     S_IRUGO, proc_pid_status),
-       ONE("personality", S_IRUSR, proc_pid_personality),
+       ONE("personality", S_IRUGO, proc_pid_personality),
        INF("limits",     S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2808,14 +2980,14 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-       INF("syscall",    S_IRUSR, proc_pid_syscall),
+       INF("syscall",    S_IRUGO, proc_pid_syscall),
 #endif
        INF("cmdline",    S_IRUGO, proc_pid_cmdline),
        ONE("stat",       S_IRUGO, proc_tgid_stat),
        ONE("statm",      S_IRUGO, proc_pid_statm),
-       REG("maps",       S_IRUGO, proc_maps_operations),
+       REG("maps",       S_IRUGO, proc_pid_maps_operations),
 #ifdef CONFIG_NUMA
-       REG("numa_maps",  S_IRUGO, proc_numa_maps_operations),
+       REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
 #endif
        REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",        proc_cwd_link),
@@ -2826,8 +2998,8 @@ static const struct pid_entry tgid_base_stuff[] = {
        REG("mountstats", S_IRUSR, proc_mountstats_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
-       REG("smaps",      S_IRUGO, proc_smaps_operations),
-       REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+       REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
+       REG("pagemap",    S_IRUGO, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
        DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2836,7 +3008,7 @@ static const struct pid_entry tgid_base_stuff[] = {
        INF("wchan",      S_IRUGO, proc_pid_wchan),
 #endif
 #ifdef CONFIG_STACKTRACE
-       ONE("stack",      S_IRUSR, proc_pid_stack),
+       ONE("stack",      S_IRUGO, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
        INF("schedstat",  S_IRUGO, proc_pid_schedstat),
@@ -2864,7 +3036,10 @@ static const struct pid_entry tgid_base_stuff[] = {
        REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-       INF("io",       S_IRUGO, proc_tgid_io_accounting),
+       INF("io",       S_IRUSR, proc_tgid_io_accounting),
+#endif
+#ifdef CONFIG_HARDWALL
+       INF("hardwall",   S_IRUGO, proc_pid_hardwall),
 #endif
 };
 
@@ -2890,6 +3065,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
        .lookup         = proc_tgid_base_lookup,
        .getattr        = pid_getattr,
        .setattr        = proc_setattr,
+       .permission     = proc_pid_permission,
 };
 
 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -2996,8 +3172,8 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
        inode->i_fop = &proc_tgid_base_operations;
        inode->i_flags|=S_IMMUTABLE;
 
-       inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
-               ARRAY_SIZE(tgid_base_stuff));
+       set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
+                                                 ARRAY_SIZE(tgid_base_stuff)));
 
        d_set_d_op(dentry, &pid_dentry_operations);
 
@@ -3093,14 +3269,26 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
                                proc_pid_instantiate, iter.task, NULL);
 }
 
+static int fake_filldir(void *buf, const char *name, int namelen,      /* filldir_t stand-in that ignores the entry */
+                       loff_t offset, u64 ino, unsigned d_type)
+{
+       return 0;       /* always 0; proc_pid_readdir() uses this when has_pid_permissions() fails */
+}
+
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-       unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
-       struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
+       unsigned int nr;
+       struct task_struct *reaper;
        struct tgid_iter iter;
        struct pid_namespace *ns;
+       filldir_t __filldir;
+
+       if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
+               goto out_no_task;
+       nr = filp->f_pos - FIRST_PROCESS_ENTRY;
 
+       reaper = get_proc_task(filp->f_path.dentry->d_inode);
        if (!reaper)
                goto out_no_task;
 
@@ -3116,8 +3304,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
        for (iter = next_tgid(ns, iter);
             iter.task;
             iter.tgid += 1, iter = next_tgid(ns, iter)) {
+               if (has_pid_permissions(ns, iter.task, 2))
+                       __filldir = filldir;
+               else
+                       __filldir = fake_filldir;
+
                filp->f_pos = iter.tgid + TGID_OFFSET;
-               if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+               if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
                        put_task_struct(iter.task);
                        goto out;
                }
@@ -3135,24 +3328,25 @@ out_no_task:
 static const struct pid_entry tid_base_stuff[] = {
        DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
        DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+       DIR("ns",        S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
        REG("environ",   S_IRUSR, proc_environ_operations),
        INF("auxv",      S_IRUSR, proc_pid_auxv),
        ONE("status",    S_IRUGO, proc_pid_status),
-       ONE("personality", S_IRUSR, proc_pid_personality),
+       ONE("personality", S_IRUGO, proc_pid_personality),
        INF("limits",    S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
        REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-       INF("syscall",   S_IRUSR, proc_pid_syscall),
+       INF("syscall",   S_IRUGO, proc_pid_syscall),
 #endif
        INF("cmdline",   S_IRUGO, proc_pid_cmdline),
        ONE("stat",      S_IRUGO, proc_tid_stat),
        ONE("statm",     S_IRUGO, proc_pid_statm),
-       REG("maps",      S_IRUGO, proc_maps_operations),
+       REG("maps",      S_IRUGO, proc_tid_maps_operations),
 #ifdef CONFIG_NUMA
-       REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
+       REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
 #endif
        REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",       proc_cwd_link),
@@ -3162,8 +3356,8 @@ static const struct pid_entry tid_base_stuff[] = {
        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
-       REG("smaps",     S_IRUGO, proc_smaps_operations),
-       REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+       REG("smaps",     S_IRUGO, proc_tid_smaps_operations),
+       REG("pagemap",    S_IRUGO, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
        DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3172,7 +3366,7 @@ static const struct pid_entry tid_base_stuff[] = {
        INF("wchan",     S_IRUGO, proc_pid_wchan),
 #endif
 #ifdef CONFIG_STACKTRACE
-       ONE("stack",      S_IRUSR, proc_pid_stack),
+       ONE("stack",      S_IRUGO, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
        INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3191,13 +3385,16 @@ static const struct pid_entry tid_base_stuff[] = {
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
-       REG("sessionid",  S_IRUSR, proc_sessionid_operations),
+       REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-       INF("io",       S_IRUGO, proc_tid_io_accounting),
+       INF("io",       S_IRUSR, proc_tid_io_accounting),
+#endif
+#ifdef CONFIG_HARDWALL
+       INF("hardwall",   S_IRUGO, proc_pid_hardwall),
 #endif
 };
 
@@ -3239,8 +3436,8 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
        inode->i_fop = &proc_tid_base_operations;
        inode->i_flags|=S_IMMUTABLE;
 
-       inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
-               ARRAY_SIZE(tid_base_stuff));
+       set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
+                                                 ARRAY_SIZE(tid_base_stuff)));
 
        d_set_d_op(dentry, &pid_dentry_operations);
 
@@ -3448,6 +3645,7 @@ static const struct inode_operations proc_task_inode_operations = {
        .lookup         = proc_task_lookup,
        .getattr        = proc_task_getattr,
        .setattr        = proc_setattr,
+       .permission     = proc_pid_permission,
 };
 
 static const struct file_operations proc_task_operations = {