rcu: Permit call_rcu() from CPU_DYING notifiers
[linux-flexiantxendom0-3.2.10.git] / kernel / sys.c
index 5761c53..e7006eb 100644 (file)
@@ -4,15 +4,15 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/utsname.h>
 #include <linux/mman.h>
-#include <linux/notifier.h>
 #include <linux/reboot.h>
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/kmod.h>
 #include <linux/perf_event.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
@@ -38,6 +38,8 @@
 #include <linux/fs_struct.h>
 #include <linux/gfp.h>
 #include <linux/syscore_ops.h>
+#include <linux/version.h>
+#include <linux/ctype.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -45,6 +47,8 @@
 #include <linux/user_namespace.h>
 
 #include <linux/kmsg_dump.h>
+/* Move somewhere else to avoid recompiling? */
+#include <generated/utsrelease.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -120,16 +124,33 @@ EXPORT_SYMBOL(cad_pid);
 void (*pm_power_off_prepare)(void);
 
 /*
+ * Returns true if current's euid is the same as p's uid or euid (and both
+ * are in the same user_ns), or if current has CAP_SYS_NICE in p's user_ns.
+ *
+ * Called with rcu_read_lock, creds are safe
+ */
+static bool set_one_prio_perm(struct task_struct *p)
+{
+       const struct cred *me = current_cred();
+       const struct cred *tgt = __task_cred(p);
+
+       /* Same user namespace: our euid matching the target's uid/euid is enough. */
+       if (tgt->user->user_ns == me->user->user_ns) {
+               if (tgt->uid == me->euid || tgt->euid == me->euid)
+                       return true;
+       }
+       return ns_capable(tgt->user->user_ns, CAP_SYS_NICE);
+}
+
+/*
  * set the priority of a task
  * - the caller must hold the RCU read lock
  */
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
-       const struct cred *cred = current_cred(), *pcred = __task_cred(p);
        int no_nice;
 
-       if (pcred->uid  != cred->euid &&
-           pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
+       if (!set_one_prio_perm(p)) {
                error = -EPERM;
                goto out;
        }
@@ -297,12 +318,43 @@ void kernel_restart_prepare(char *cmd)
 {
        blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
        system_state = SYSTEM_RESTART;
+       usermodehelper_disable();
        device_shutdown();
-       sysdev_shutdown();
        syscore_shutdown();
 }
 
 /**
+ *     register_reboot_notifier - Register function to be called at reboot time
+ *     @nb: Info about notifier function to be called
+ *
+ *     Adds @nb to reboot_notifier_list, the blocking notifier chain this
+ *     file invokes with SYS_RESTART, SYS_HALT or SYS_POWER_OFF.
+ *
+ *     Currently always returns zero, as blocking_notifier_chain_register()
+ *     always returns zero.
+ */
+int register_reboot_notifier(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_register(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(register_reboot_notifier);
+
+/**
+ *     unregister_reboot_notifier - Unregister previously registered reboot notifier
+ *     @nb: Hook to be unregistered
+ *
+ *     Removes @nb from reboot_notifier_list so that it is no longer
+ *     called when that chain is run.
+ *
+ *     Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_reboot_notifier(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(unregister_reboot_notifier);
+
+/**
  *     kernel_restart - reboot the system
  *     @cmd: pointer to buffer containing command to execute for restart
  *             or %NULL
@@ -327,6 +379,7 @@ static void kernel_shutdown_prepare(enum system_states state)
        blocking_notifier_call_chain(&reboot_notifier_list,
                (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
        system_state = state;
+       usermodehelper_disable();
        device_shutdown();
 }
 /**
@@ -337,7 +390,6 @@ static void kernel_shutdown_prepare(enum system_states state)
 void kernel_halt(void)
 {
        kernel_shutdown_prepare(SYSTEM_HALT);
-       sysdev_shutdown();
        syscore_shutdown();
        printk(KERN_EMERG "System halted.\n");
        kmsg_dump(KMSG_DUMP_HALT);
@@ -357,7 +409,6 @@ void kernel_power_off(void)
        if (pm_power_off_prepare)
                pm_power_off_prepare();
        disable_nonboot_cpus();
-       sysdev_shutdown();
        syscore_shutdown();
        printk(KERN_EMERG "Power down.\n");
        kmsg_dump(KMSG_DUMP_POWEROFF);
@@ -393,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                        magic2 != LINUX_REBOOT_MAGIC2C))
                return -EINVAL;
 
+       /*
+        * If pid namespaces are enabled and the current task is in a child
+        * pid_namespace, the command is handled by reboot_pid_ns() which will
+        * call do_exit().
+        */
+       ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
+       if (ret)
+               return ret;
+
        /* Instead of trying to make the power_off code look like
         * halt when pm_power_off is not set do it the easy way.
         */
@@ -506,7 +566,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
        if (rgid != (gid_t) -1) {
                if (old->gid == rgid ||
                    old->egid == rgid ||
-                   capable(CAP_SETGID))
+                   nsown_capable(CAP_SETGID))
                        new->gid = rgid;
                else
                        goto error;
@@ -515,7 +575,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
                if (old->gid == egid ||
                    old->egid == egid ||
                    old->sgid == egid ||
-                   capable(CAP_SETGID))
+                   nsown_capable(CAP_SETGID))
                        new->egid = egid;
                else
                        goto error;
@@ -550,7 +610,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
        old = current_cred();
 
        retval = -EPERM;
-       if (capable(CAP_SETGID))
+       if (nsown_capable(CAP_SETGID))
                new->gid = new->egid = new->sgid = new->fsgid = gid;
        else if (gid == old->gid || gid == old->sgid)
                new->egid = new->fsgid = gid;
@@ -575,11 +635,18 @@ static int set_user(struct cred *new)
        if (!new_user)
                return -EAGAIN;
 
+       /*
+        * We don't fail in case of NPROC limit excess here because too many
+        * poorly written programs don't check set*uid() return code, assuming
+        * it never fails if called by root.  We may still enforce NPROC limit
+        * for programs doing set*uid()+execve() by harmlessly deferring the
+        * failure to the execve() stage.
+        */
        if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
-                       new_user != INIT_USER) {
-               free_uid(new_user);
-               return -EAGAIN;
-       }
+                       new_user != INIT_USER)
+               current->flags |= PF_NPROC_EXCEEDED;
+       else
+               current->flags &= ~PF_NPROC_EXCEEDED;
 
        free_uid(new->user);
        new->user = new_user;
@@ -617,7 +684,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
                new->uid = ruid;
                if (old->uid != ruid &&
                    old->euid != ruid &&
-                   !capable(CAP_SETUID))
+                   !nsown_capable(CAP_SETUID))
                        goto error;
        }
 
@@ -626,7 +693,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
                if (old->uid != euid &&
                    old->euid != euid &&
                    old->suid != euid &&
-                   !capable(CAP_SETUID))
+                   !nsown_capable(CAP_SETUID))
                        goto error;
        }
 
@@ -674,7 +741,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
        old = current_cred();
 
        retval = -EPERM;
-       if (capable(CAP_SETUID)) {
+       if (nsown_capable(CAP_SETUID)) {
                new->suid = new->uid = uid;
                if (uid != old->uid) {
                        retval = set_user(new);
@@ -716,7 +783,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
        old = current_cred();
 
        retval = -EPERM;
-       if (!capable(CAP_SETUID)) {
+       if (!nsown_capable(CAP_SETUID)) {
                if (ruid != (uid_t) -1 && ruid != old->uid &&
                    ruid != old->euid  && ruid != old->suid)
                        goto error;
@@ -780,7 +847,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
        old = current_cred();
 
        retval = -EPERM;
-       if (!capable(CAP_SETGID)) {
+       if (!nsown_capable(CAP_SETGID)) {
                if (rgid != (gid_t) -1 && rgid != old->gid &&
                    rgid != old->egid  && rgid != old->sgid)
                        goto error;
@@ -840,7 +907,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
 
        if (uid == old->uid  || uid == old->euid  ||
            uid == old->suid || uid == old->fsuid ||
-           capable(CAP_SETUID)) {
+           nsown_capable(CAP_SETUID)) {
                if (uid != old_fsuid) {
                        new->fsuid = uid;
                        if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -873,7 +940,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
 
        if (gid == old->gid  || gid == old->egid  ||
            gid == old->sgid || gid == old->fsgid ||
-           capable(CAP_SETGID)) {
+           nsown_capable(CAP_SETGID)) {
                if (gid != old_fsgid) {
                        new->fsgid = gid;
                        goto change_okay;
@@ -1108,6 +1175,34 @@ DECLARE_RWSEM(uts_sem);
 #define override_architecture(name)    0
 #endif
 
+/*
+ * Work around broken programs that cannot handle "Linux 3.0": for tasks with
+ * the UNAME26 personality, overwrite @release with 3.x mapped to 2.6.(40+x),
+ * so e.g. 3.0 would be 2.6.40.  All @len bytes are copied out, so buf is
+ * zeroed first; otherwise stale kernel stack would leak to user space.
+ * Assumes @len <= sizeof(buf).  Returns non-zero if the user copy faults.
+ */
+static int override_release(char __user *release, int len)
+{
+       char buf[65] = { 0 };
+       const char *rest = UTS_RELEASE;
+       int ndots = 0;
+       unsigned v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
+
+       if (!(current->personality & UNAME26))
+               return 0;
+
+       while (*rest) {
+               if (*rest == '.' && ++ndots >= 3)
+                       break;
+               if (!isdigit(*rest) && *rest != '.')
+                       break;
+               rest++;
+       }
+       snprintf(buf, len, "2.6.%u%s", v, rest);
+       return copy_to_user(release, buf, len);
+}
+
 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
 {
        int errno = 0;
@@ -1117,6 +1212,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
                errno = -EFAULT;
        up_read(&uts_sem);
 
+       if (!errno && override_release(name->release, sizeof(name->release)))
+               errno = -EFAULT;
        if (!errno && override_architecture(name))
                errno = -EFAULT;
        return errno;
@@ -1138,6 +1235,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
                error = -EFAULT;
        up_read(&uts_sem);
 
+       if (!error && override_release(name->release, sizeof(name->release)))
+               error = -EFAULT;
        if (!error && override_architecture(name))
                error = -EFAULT;
        return error;
@@ -1172,6 +1271,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
 
        if (!error && override_architecture(name))
                error = -EFAULT;
+       if (!error && override_release(name->release, sizeof(name->release)))
+               error = -EFAULT;
        return error ? -EFAULT : 0;
 }
 #endif
@@ -1183,6 +1284,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 
        if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
                return -EPERM;
+
        if (len < 0 || len > __NEW_UTS_LEN)
                return -EINVAL;
        down_write(&uts_sem);
@@ -1194,6 +1296,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
                memset(u->nodename + len, 0, sizeof(u->nodename) - len);
                errno = 0;
        }
+       uts_proc_notify(UTS_PROC_HOSTNAME);
        up_write(&uts_sem);
        return errno;
 }
@@ -1230,7 +1333,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
        int errno;
        char tmp[__NEW_UTS_LEN];
 
-       if (!capable(CAP_SYS_ADMIN))
+       if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
                return -EPERM;
        if (len < 0 || len > __NEW_UTS_LEN)
                return -EINVAL;
@@ -1244,6 +1347,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
                memset(u->domainname + len, 0, sizeof(u->domainname) - len);
                errno = 0;
        }
+       uts_proc_notify(UTS_PROC_DOMAINNAME);
        up_write(&uts_sem);
        return errno;
 }
@@ -1345,6 +1449,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
        rlim = tsk->signal->rlim + resource;
        task_lock(tsk->group_leader);
        if (new_rlim) {
+               /* Keep the capable check against init_user_ns until
+                  cgroups can contain all limits */
                if (new_rlim->rlim_max > rlim->rlim_max &&
                                !capable(CAP_SYS_RESOURCE))
                        retval = -EPERM;
@@ -1388,19 +1494,22 @@ static int check_prlimit_permission(struct task_struct *task)
 {
        const struct cred *cred = current_cred(), *tcred;
 
-       tcred = __task_cred(task);
-       if (current != task &&
-           (cred->uid != tcred->euid ||
-            cred->uid != tcred->suid ||
-            cred->uid != tcred->uid  ||
-            cred->gid != tcred->egid ||
-            cred->gid != tcred->sgid ||
-            cred->gid != tcred->gid) &&
-            !capable(CAP_SYS_RESOURCE)) {
-               return -EPERM;
-       }
+       if (current == task)
+               return 0;
 
-       return 0;
+       tcred = __task_cred(task);
+       if (cred->user->user_ns == tcred->user->user_ns &&
+           (cred->uid == tcred->euid &&
+            cred->uid == tcred->suid &&
+            cred->uid == tcred->uid  &&
+            cred->gid == tcred->egid &&
+            cred->gid == tcred->sgid &&
+            cred->gid == tcred->gid))
+               return 0;
+       if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
+               return 0;
+
+       return -EPERM;
 }
 
 SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
@@ -1505,7 +1614,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
        unsigned long maxrss = 0;
 
        memset((char *) r, 0, sizeof *r);
-       utime = stime = cputime_zero;
+       utime = stime = 0;
 
        if (who == RUSAGE_THREAD) {
                task_times(current, &utime, &stime);
@@ -1535,8 +1644,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 
                case RUSAGE_SELF:
                        thread_group_times(p, &tgutime, &tgstime);
-                       utime = cputime_add(utime, tgutime);
-                       stime = cputime_add(stime, tgstime);
+                       utime += tgutime;
+                       stime += tgstime;
                        r->ru_nvcsw += p->signal->nvcsw;
                        r->ru_nivcsw += p->signal->nivcsw;
                        r->ru_minflt += p->signal->min_flt;
@@ -1592,6 +1701,124 @@ SYSCALL_DEFINE1(umask, int, mask)
        return mask;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * PR_SET_MM backend: set the mm_struct field selected by @opt to @addr.
+ * Returns 0 on success, -EPERM without CAP_SYS_RESOURCE, -EINVAL otherwise.
+ */
+static int prctl_set_mm(int opt, unsigned long addr,
+                       unsigned long arg4, unsigned long arg5)
+{
+       unsigned long rlim = rlimit(RLIMIT_DATA);
+       unsigned long vm_req_flags;
+       unsigned long vm_bad_flags;
+       struct vm_area_struct *vma;
+       int error = -EINVAL;    /* any early "goto out" is a rejection */
+       struct mm_struct *mm = current->mm;
+
+       if (arg4 | arg5)
+               return -EINVAL;
+
+       if (!capable(CAP_SYS_RESOURCE))
+               return -EPERM;
+
+       if (addr >= TASK_SIZE)
+               return -EINVAL;
+
+       down_read(&mm->mmap_sem);
+       vma = find_vma(mm, addr);
+
+       if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
+               /* @addr must lie inside an existing VMA, else -EINVAL */
+               if (!vma || vma->vm_start > addr)
+                       goto out;
+       }
+
+       switch (opt) {
+       case PR_SET_MM_START_CODE:
+       case PR_SET_MM_END_CODE:
+               vm_req_flags = VM_READ | VM_EXEC;
+               vm_bad_flags = VM_WRITE | VM_MAYSHARE;
+
+               if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+                   (vma->vm_flags & vm_bad_flags))
+                       goto out;
+
+               if (opt == PR_SET_MM_START_CODE)
+                       mm->start_code = addr;
+               else
+                       mm->end_code = addr;
+               break;
+
+       case PR_SET_MM_START_DATA:
+       case PR_SET_MM_END_DATA:
+               vm_req_flags = VM_READ | VM_WRITE;
+               vm_bad_flags = VM_EXEC | VM_MAYSHARE;
+
+               if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+                   (vma->vm_flags & vm_bad_flags))
+                       goto out;
+
+               if (opt == PR_SET_MM_START_DATA)
+                       mm->start_data = addr;
+               else
+                       mm->end_data = addr;
+               break;
+
+       case PR_SET_MM_START_STACK:
+#ifdef CONFIG_STACK_GROWSUP
+               vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
+#else
+               vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
+#endif
+               if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
+                       goto out;
+
+               mm->start_stack = addr;
+               break;
+
+       case PR_SET_MM_START_BRK:
+               if (addr <= mm->end_data)
+                       goto out;
+
+               if (rlim < RLIM_INFINITY &&
+                   (mm->brk - addr) +
+                   (mm->end_data - mm->start_data) > rlim)
+                       goto out;
+
+               mm->start_brk = addr;
+               break;
+
+       case PR_SET_MM_BRK:
+               if (addr <= mm->end_data)
+                       goto out;
+
+               if (rlim < RLIM_INFINITY &&
+                   (addr - mm->start_brk) +
+                   (mm->end_data - mm->start_data) > rlim)
+                       goto out;
+
+               mm->brk = addr;
+               break;
+
+       default:
+               goto out;
+       }
+
+       error = 0;
+
+out:
+       up_read(&mm->mmap_sem);
+       return error;
+}
+#else /* CONFIG_CHECKPOINT_RESTORE */
+static int prctl_set_mm(int opt, unsigned long addr,
+                       unsigned long arg4, unsigned long arg5)
+{
+       return -EINVAL;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                unsigned long, arg4, unsigned long, arg5)
 {
@@ -1662,6 +1889,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                                              sizeof(me->comm) - 1) < 0)
                                return -EFAULT;
                        set_task_comm(me, comm);
+                       proc_comm_connector(me);
                        return 0;
                case PR_GET_NAME:
                        get_task_comm(comm, me);
@@ -1740,6 +1968,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                        else
                                error = PR_MCE_KILL_DEFAULT;
                        break;
+               case PR_SET_MM:
+                       error = prctl_set_mm(arg2, arg3, arg4, arg5);
+                       break;
+               case PR_SET_CHILD_SUBREAPER:
+                       me->signal->is_child_subreaper = !!arg2;
+                       error = 0;
+                       break;
+               case PR_GET_CHILD_SUBREAPER:
+                       error = put_user(me->signal->is_child_subreaper,
+                                        (int __user *) arg2);
+                       break;
                default:
                        error = -EINVAL;
                        break;