UBUNTU: Ubuntu-2.6.38-12.51

[linux-flexiantxendom0-natty.git] / kernel / cpuset.c
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index d109467..e92e981 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
         /* for custom sched domain */
         int relax_domain_level;
  
-       /* used for walking a cpuset heirarchy */
+       /* used for walking a cpuset hierarchy */
         struct list_head stack_list;
  };
  
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
   * users. If someone tries to mount the "cpuset" filesystem, we
   * silently switch it to mount "cgroup" instead
   */
-static int cpuset_get_sb(struct file_system_type *fs_type,
-                        int flags, const char *unused_dev_name,
-                        void *data, struct vfsmount *mnt)
+static struct dentry *cpuset_mount(struct file_system_type *fs_type,
+                        int flags, const char *unused_dev_name, void *data)
  {
         struct file_system_type *cgroup_fs = get_fs_type("cgroup");
-       int ret = -ENODEV;
+       struct dentry *ret = ERR_PTR(-ENODEV);
         if (cgroup_fs) {
                 char mountopts[] =
                         "cpuset,noprefix,"
                         "release_agent=/sbin/cpuset_release_agent";
-               ret = cgroup_fs->get_sb(cgroup_fs, flags,
-                                          unused_dev_name, mountopts, mnt);
+               ret = cgroup_fs->mount(cgroup_fs, flags,
+                                          unused_dev_name, mountopts);
                 put_filesystem(cgroup_fs);
         }
         return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
  
  static struct file_system_type cpuset_fs_type = {
         .name = "cpuset",
-       .get_sb = cpuset_get_sb,
+       .mount = cpuset_mount,
  };
  
  /*
@@ -946,16 +945,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
   * In order to avoid seeing no nodes if the old and new nodes are disjoint,
   * we structure updates as setting all new allowed nodes, then clearing newly
   * disallowed ones.
- *
- * Called with task's alloc_lock held
   */
  static void cpuset_change_task_nodemask(struct task_struct *tsk,
                                         nodemask_t *newmems)
  {
+repeat:
+       /*
+        * Allow tasks that have access to memory reserves because they have
+        * been OOM killed to get memory anywhere.
+        */
+       if (unlikely(test_thread_flag(TIF_MEMDIE)))
+               return;
+       if (current->flags & PF_EXITING) /* Let dying task have memory */
+               return;
+
+       task_lock(tsk);
         nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-       mpol_rebind_task(tsk, &tsk->mems_allowed);
-       mpol_rebind_task(tsk, newmems);
+       mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+
+       /*
+        * ensure checking ->mems_allowed_change_disable after setting all new
+        * allowed nodes.
+        *
+        * The read-side task can see a nodemask holding both the new and
+        * the old allowed nodes. If it allocates a page while cpuset is
+        * clearing the newly disallowed ones, it still sees the new bits.
+        *
+        * If the new allowed nodes were set only after the check, setting
+        * them and clearing the newly disallowed ones would happen back to
+        * back, and the read-side task might find no node to allocate from.
+        */
+       smp_mb();
+
+       /*
+        * Allocation of memory is very fast, we needn't sleep when waiting
+        * for the read-side.
+        */
+       while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+               task_unlock(tsk);
+               if (!task_curr(tsk))
+                       yield();
+               goto repeat;
+       }
+
+       /*
+        * ensure checking ->mems_allowed_change_disable before clearing all new
+        * disallowed nodes.
+        *
+        * If the newly disallowed bits were cleared before the check, the
+        * read-side task might find no node to allocate a page from.
+        */
+       smp_mb();
+
+       mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
         tsk->mems_allowed = *newmems;
+       task_unlock(tsk);
  }
  
  /*
@@ -978,9 +1023,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
         cs = cgroup_cs(scan->cg);
         guarantee_online_mems(cs, newmems);
  
-       task_lock(p);
         cpuset_change_task_nodemask(p, newmems);
-       task_unlock(p);
  
         NODEMASK_FREE(newmems);
  
@@ -1353,7 +1396,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
         if (tsk->flags & PF_THREAD_BOUND)
                 return -EINVAL;
  
-       ret = security_task_setscheduler(tsk, 0, NULL);
+       ret = security_task_setscheduler(tsk);
         if (ret)
                 return ret;
         if (threadgroup) {
@@ -1361,7 +1404,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
  
                 rcu_read_lock();
                 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       ret = security_task_setscheduler(c, 0, NULL);
+                       ret = security_task_setscheduler(c);
                         if (ret) {
                                 rcu_read_unlock();
                                 return ret;
@@ -1383,9 +1426,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
         err = set_cpus_allowed_ptr(tsk, cpus_attach);
         WARN_ON_ONCE(err);
  
-       task_lock(tsk);
         cpuset_change_task_nodemask(tsk, to);
-       task_unlock(tsk);
         cpuset_update_task_spread_flag(cs, tsk);
  
  }
@@ -1534,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
                 return -ENODEV;
  
         trialcs = alloc_trial_cpuset(cs);
-       if (!trialcs)
-               return -ENOMEM;
+       if (!trialcs) {
+               retval = -ENOMEM;
+               goto out;
+       }
  
         switch (cft->private) {
         case FILE_CPULIST:
@@ -1550,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
         }
  
         free_trial_cpuset(trialcs);
+out:
         cgroup_unlock();
         return retval;
  }
@@ -2071,31 +2115,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
   * but making no active use of cpusets.
   *
   * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
   *
   * Called within get_online_cpus().  Needs to call cgroup_lock()
   * before calling generate_sched_domains().
   */
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-                               unsigned long phase, void *unused_cpu)
+void cpuset_update_active_cpus(void)
  {
         struct sched_domain_attr *attr;
         cpumask_var_t *doms;
         int ndoms;
  
-       switch (phase) {
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-       case CPU_DOWN_FAILED:
-       case CPU_DOWN_FAILED_FROZEN:
-               break;
-
-       default:
-               return NOTIFY_DONE;
-       }
-
         cgroup_lock();
         mutex_lock(&callback_mutex);
         cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2106,8 +2136,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
  
         /* Have scheduler rebuild the domains */
         partition_sched_domains(ndoms, doms, attr);
-
-       return NOTIFY_OK;
  }
  
  #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2161,7 +2189,6 @@ void __init cpuset_init_smp(void)
         cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
         top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
  
-       hotcpu_notifier(cpuset_track_online_cpus, 0);
         hotplug_memory_notifier(cpuset_track_online_nodes, 10);
  
         cpuset_wq = create_singlethread_workqueue("cpuset");
@@ -2182,19 +2209,52 @@ void __init cpuset_init_smp(void)
  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
  {
         mutex_lock(&callback_mutex);
-       cpuset_cpus_allowed_locked(tsk, pmask);
+       task_lock(tsk);
+       guarantee_online_cpus(task_cs(tsk), pmask);
+       task_unlock(tsk);
         mutex_unlock(&callback_mutex);
  }
  
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
  {
-       task_lock(tsk);
-       guarantee_online_cpus(task_cs(tsk), pmask);
-       task_unlock(tsk);
+       const struct cpuset *cs;
+       int cpu;
+
+       rcu_read_lock();
+       cs = task_cs(tsk);
+       if (cs)
+               cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+       rcu_read_unlock();
+
+       /*
+        * We own tsk->cpus_allowed, nobody can change it under us.
+        *
+        * But we used cs && cs->cpus_allowed lockless and thus can
+        * race with cgroup_attach_task() or update_cpumask() and get
+        * the wrong tsk->cpus_allowed. However, both cases imply the
+        * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+        * which takes task_rq_lock().
+        *
+        * If we are called after it dropped the lock we must see all
+        * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
+        * set any mask, even one that is wrong from task_cs()'s point of
+        * view; the pending set_cpus_allowed_ptr() will fix things.
+        */
+
+       cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+       if (cpu >= nr_cpu_ids) {
+               /*
+                * Either tsk->cpus_allowed is wrong (see above) or it
+                * is actually empty. The latter case is only possible
+                * if we are racing with remove_tasks_in_empty_cpuset().
+                * As above, we can temporarily set any mask and rely on
+                * set_cpus_allowed_ptr() as the synchronization point.
+                */
+               cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+               cpu = cpumask_any(cpu_active_mask);
+       }
+
+       return cpu;
  }
  
  void cpuset_init_current_mems_allowed(void)
@@ -2383,22 +2443,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
  }
  
  /**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset.  Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list.  The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
-       mutex_lock(&callback_mutex);
-}
-
-/**
   * cpuset_unlock - release lock on cpuset changes
   *
   * Undo the lock taken in a previous cpuset_lock() call.
@@ -2410,7 +2454,8 @@ void cpuset_unlock(void)
  }
  
  /**
- * cpuset_mem_spread_node() - On which node to begin search for a page
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
   *
   * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
   * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2435,16 +2480,27 @@ void cpuset_unlock(void)
   * See kmem_cache_alloc_node().
   */
  
-int cpuset_mem_spread_node(void)
+static int cpuset_spread_node(int *rotor)
  {
         int node;
  
-       node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+       node = next_node(*rotor, current->mems_allowed);
         if (node == MAX_NUMNODES)
                 node = first_node(current->mems_allowed);
-       current->cpuset_mem_spread_rotor = node;
+       *rotor = node;
         return node;
  }
+
+int cpuset_mem_spread_node(void)
+{
+       return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+       return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
  EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
  
  /**