cpuset: mm: reduce large amounts of memory barrier related damage v3

author Mel Gorman <mgorman@suse.de>

Wed, 21 Mar 2012 23:34:11 +0000 (16:34 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 22 Mar 2012 00:54:59 +0000 (17:54 -0700)
author Mel Gorman <mgorman@suse.de>
Wed, 21 Mar 2012 23:34:11 +0000 (16:34 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 22 Mar 2012 00:54:59 +0000 (17:54 -0700)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h

index e9eaec5..7a7e5fd 100644 (file)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void);
  extern void cpuset_print_task_mems_allowed(struct task_struct *p);
  
  /*
- * reading current mems_allowed and mempolicy in the fastpath must protected
- * by get_mems_allowed()
+ * get_mems_allowed is required when making decisions involving mems_allowed
+ * such as during page allocation. mems_allowed can be updated in parallel
+ * and depending on the new value an operation can fail potentially causing
+ * process failure. A retry loop with get_mems_allowed and put_mems_allowed
+ * prevents these artificial failures.
   */
-static inline void get_mems_allowed(void)
+static inline unsigned int get_mems_allowed(void)
  {
-       current->mems_allowed_change_disable++;
-
-       /*
-        * ensure that reading mems_allowed and mempolicy happens after the
-        * update of ->mems_allowed_change_disable.
-        *
-        * the write-side task finds ->mems_allowed_change_disable is not 0,
-        * and knows the read-side task is reading mems_allowed or mempolicy,
-        * so it will clear old bits lazily.
-        */
-       smp_mb();
+       return read_seqcount_begin(&current->mems_allowed_seq);
  }
  
-static inline void put_mems_allowed(void)
+/*
+ * If this returns false, the operation that took place after get_mems_allowed
+ * may have failed. It is up to the caller to retry the operation if
+ * appropriate.
+ */
+static inline bool put_mems_allowed(unsigned int seq)
  {
-       /*
-        * ensure that reading mems_allowed and mempolicy before reducing
-        * mems_allowed_change_disable.
-        *
-        * the write-side task will know that the read-side task is still
-        * reading mems_allowed or mempolicy, don't clears old bits in the
-        * nodemask.
-        */
-       smp_mb();
-       --ACCESS_ONCE(current->mems_allowed_change_disable);
+       return !read_seqcount_retry(&current->mems_allowed_seq, seq);
  }
  
  static inline void set_mems_allowed(nodemask_t nodemask)
  {
         task_lock(current);
+       write_seqcount_begin(&current->mems_allowed_seq);
         current->mems_allowed = nodemask;
+       write_seqcount_end(&current->mems_allowed_seq);
         task_unlock(current);
  }
  
@@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
  {
  }
  
-static inline void get_mems_allowed(void)
+static inline unsigned int get_mems_allowed(void)
  {
+       return 0;
  }
  
-static inline void put_mems_allowed(void)
+static inline bool put_mems_allowed(unsigned int seq)
  {
+       return true;
  }
  
  #endif /* !CONFIG_CPUSETS */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index f994d51..e4baff5 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -29,6 +29,13 @@ extern struct fs_struct init_fs;
  #define INIT_GROUP_RWSEM(sig)
  #endif
  
+#ifdef CONFIG_CPUSETS
+#define INIT_CPUSET_SEQ                                                        \
+       .mems_allowed_seq = SEQCNT_ZERO,
+#else
+#define INIT_CPUSET_SEQ
+#endif
+
  #define INIT_SIGNALS(sig) {                                            \
         .nr_threads     = 1,                                            \
         .wait_chldexit  = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -192,6 +199,7 @@ extern struct cred init_cred;
         INIT_FTRACE_GRAPH                                               \
         INIT_TRACE_RECURSION                                            \
         INIT_TASK_RCU_PREEMPT(tsk)                                      \
+       INIT_CPUSET_SEQ                                                 \
  }
  
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index e074e1e..0c147a4 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1514,7 +1514,7 @@ struct task_struct {
  #endif
  #ifdef CONFIG_CPUSETS
         nodemask_t mems_allowed;        /* Protected by alloc_lock */
-       int mems_allowed_change_disable;
+       seqcount_t mems_allowed_seq;    /* Sequence number to catch updates */
         int cpuset_mem_spread_rotor;
         int cpuset_slab_spread_rotor;
  #endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index 5d57583..1010cc6 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
  {
         bool need_loop;
  
-repeat:
         /*
          * Allow tasks that have access to memory reserves because they have
          * been OOM killed to get memory anywhere.
@@ -983,45 +982,19 @@ repeat:
          */
         need_loop = task_has_mempolicy(tsk) ||
                         !nodes_intersects(*newmems, tsk->mems_allowed);
-       nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-       mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
  
-       /*
-        * ensure checking ->mems_allowed_change_disable after setting all new
-        * allowed nodes.
-        *
-        * the read-side task can see an nodemask with new allowed nodes and
-        * old allowed nodes. and if it allocates page when cpuset clears newly
-        * disallowed ones continuous, it can see the new allowed bits.
-        *
-        * And if setting all new allowed nodes is after the checking, setting
-        * all new allowed nodes and clearing newly disallowed ones will be done
-        * continuous, and the read-side task may find no node to alloc page.
-        */
-       smp_mb();
+       if (need_loop)
+               write_seqcount_begin(&tsk->mems_allowed_seq);
  
-       /*
-        * Allocation of memory is very fast, we needn't sleep when waiting
-        * for the read-side.
-        */
-       while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
-               task_unlock(tsk);
-               if (!task_curr(tsk))
-                       yield();
-               goto repeat;
-       }
-
-       /*
-        * ensure checking ->mems_allowed_change_disable before clearing all new
-        * disallowed nodes.
-        *
-        * if clearing newly disallowed bits before the checking, the read-side
-        * task may find no node to alloc page.
-        */
-       smp_mb();
+       nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+       mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
  
         mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
         tsk->mems_allowed = *newmems;
+
+       if (need_loop)
+               write_seqcount_end(&tsk->mems_allowed_seq);
+
         task_unlock(tsk);
  }
  
diff --git a/kernel/fork.c b/kernel/fork.c

index a9e99f3..9cc227d 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1237,6 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  #ifdef CONFIG_CPUSETS
         p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
         p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+       seqcount_init(&p->mems_allowed_seq);
  #endif
  #ifdef CONFIG_TRACE_IRQFLAGS
         p->irq_events = 0;
diff --git a/mm/filemap.c b/mm/filemap.c

index f323060..8430420 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -499,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
         struct page *page;
  
         if (cpuset_do_page_mem_spread()) {
-               get_mems_allowed();
-               n = cpuset_mem_spread_node();
-               page = alloc_pages_exact_node(n, gfp, 0);
-               put_mems_allowed();
+               unsigned int cpuset_mems_cookie;
+               do {
+                       cpuset_mems_cookie = get_mems_allowed();
+                       n = cpuset_mem_spread_node();
+                       page = alloc_pages_exact_node(n, gfp, 0);
+               } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
                 return page;
         }
         return alloc_pages(gfp, 0);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 62f9fad..b1c3148 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -454,14 +454,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
                                 struct vm_area_struct *vma,
                                 unsigned long address, int avoid_reserve)
  {
-       struct page *page = NULL;
+       struct page *page;
         struct mempolicy *mpol;
         nodemask_t *nodemask;
         struct zonelist *zonelist;
         struct zone *zone;
         struct zoneref *z;
+       unsigned int cpuset_mems_cookie;
  
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
         zonelist = huge_zonelist(vma, address,
                                         htlb_alloc_mask, &mpol, &nodemask);
         /*
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
                         }
                 }
         }
-err:
+
         mpol_cond_put(mpol);
-       put_mems_allowed();
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
         return page;
+
+err:
+       mpol_cond_put(mpol);
+       return NULL;
  }
  
  static void update_and_free_page(struct hstate *h, struct page *page)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 71e1a52..cfb6c86 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1850,18 +1850,24 @@ struct page *
  alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                 unsigned long addr, int node)
  {
-       struct mempolicy *pol = get_vma_policy(current, vma, addr);
+       struct mempolicy *pol;
         struct zonelist *zl;
         struct page *page;
+       unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+       pol = get_vma_policy(current, vma, addr);
+       cpuset_mems_cookie = get_mems_allowed();
  
-       get_mems_allowed();
         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
                 unsigned nid;
  
                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
                 mpol_cond_put(pol);
                 page = alloc_page_interleave(gfp, order, nid);
-               put_mems_allowed();
+               if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                       goto retry_cpuset;
+
                 return page;
         }
         zl = policy_zonelist(gfp, pol, node);
@@ -1872,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                 struct page *page =  __alloc_pages_nodemask(gfp, order,
                                                 zl, policy_nodemask(gfp, pol));
                 __mpol_put(pol);
-               put_mems_allowed();
+               if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                       goto retry_cpuset;
                 return page;
         }
         /*
@@ -1880,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
          */
         page = __alloc_pages_nodemask(gfp, order, zl,
                                       policy_nodemask(gfp, pol));
-       put_mems_allowed();
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
         return page;
  }
  
@@ -1907,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
  {
         struct mempolicy *pol = current->mempolicy;
         struct page *page;
+       unsigned int cpuset_mems_cookie;
  
         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
                 pol = &default_policy;
  
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+
         /*
          * No reference counting needed for current->mempolicy
          * nor system default_policy
@@ -1922,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
                 page = __alloc_pages_nodemask(gfp, order,
                                 policy_zonelist(gfp, pol, numa_node_id()),
                                 policy_nodemask(gfp, pol));
-       put_mems_allowed();
+
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
+
         return page;
  }
  EXPORT_SYMBOL(alloc_pages_current);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 673596a..40de685 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2380,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  {
         enum zone_type high_zoneidx = gfp_zone(gfp_mask);
         struct zone *preferred_zone;
-       struct page *page;
+       struct page *page = NULL;
         int migratetype = allocflags_to_migratetype(gfp_mask);
+       unsigned int cpuset_mems_cookie;
  
         gfp_mask &= gfp_allowed_mask;
  
@@ -2400,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
         if (unlikely(!zonelist->_zonerefs->zone))
                 return NULL;
  
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+
         /* The preferred zone is used for statistics later */
         first_zones_zonelist(zonelist, high_zoneidx,
                                 nodemask ? : &cpuset_current_mems_allowed,
                                 &preferred_zone);
-       if (!preferred_zone) {
-               put_mems_allowed();
-               return NULL;
-       }
+       if (!preferred_zone)
+               goto out;
  
         /* First allocation attempt */
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2418,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                 page = __alloc_pages_slowpath(gfp_mask, order,
                                 zonelist, high_zoneidx, nodemask,
                                 preferred_zone, migratetype);
-       put_mems_allowed();
  
         trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+       /*
+        * When updating a task's mems_allowed, it is possible to race with
+        * parallel threads in such a way that an allocation can fail while
+        * the mask is being updated. If a page allocation is about to fail,
+        * check if the cpuset changed during allocation and if so, retry.
+        */
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
+
         return page;
  }
  EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2634,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
  bool skip_free_areas_node(unsigned int flags, int nid)
  {
         bool ret = false;
+       unsigned int cpuset_mems_cookie;
  
         if (!(flags & SHOW_MEM_FILTER_NODES))
                 goto out;
  
-       get_mems_allowed();
-       ret = !node_isset(nid, cpuset_current_mems_allowed);
-       put_mems_allowed();
+       do {
+               cpuset_mems_cookie = get_mems_allowed();
+               ret = !node_isset(nid, cpuset_current_mems_allowed);
+       } while (!put_mems_allowed(cpuset_mems_cookie));
  out:
         return ret;
  }
diff --git a/mm/slab.c b/mm/slab.c

index f0bd785..29c8716 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
         if (in_interrupt() || (flags & __GFP_THISNODE))
                 return NULL;
         nid_alloc = nid_here = numa_mem_id();
-       get_mems_allowed();
         if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
                 nid_alloc = cpuset_slab_spread_node();
         else if (current->mempolicy)
                 nid_alloc = slab_node(current->mempolicy);
-       put_mems_allowed();
         if (nid_alloc != nid_here)
                 return ____cache_alloc_node(cachep, flags, nid_alloc);
         return NULL;
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
         enum zone_type high_zoneidx = gfp_zone(flags);
         void *obj = NULL;
         int nid;
+       unsigned int cpuset_mems_cookie;
  
         if (flags & __GFP_THISNODE)
                 return NULL;
  
-       get_mems_allowed();
-       zonelist = node_zonelist(slab_node(current->mempolicy), flags);
         local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
  
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+       zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+
  retry:
         /*
          * Look through allowed nodes for objects available
@@ -3372,7 +3373,9 @@ retry:
                         }
                 }
         }
-       put_mems_allowed();
+
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+               goto retry_cpuset;
         return obj;
  }
  
diff --git a/mm/slub.c b/mm/slub.c

index 4907563..f4a6229 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
         struct zone *zone;
         enum zone_type high_zoneidx = gfp_zone(flags);
         void *object;
+       unsigned int cpuset_mems_cookie;
  
         /*
          * The defrag ratio allows a configuration of the tradeoffs between
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
                         get_cycles() % 1024 > s->remote_node_defrag_ratio)
                 return NULL;
  
-       get_mems_allowed();
-       zonelist = node_zonelist(slab_node(current->mempolicy), flags);
-       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-               struct kmem_cache_node *n;
-
-               n = get_node(s, zone_to_nid(zone));
-
-               if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
-                               n->nr_partial > s->min_partial) {
-                       object = get_partial_node(s, n, c);
-                       if (object) {
-                               put_mems_allowed();
-                               return object;
+       do {
+               cpuset_mems_cookie = get_mems_allowed();
+               zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+               for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+                       struct kmem_cache_node *n;
+
+                       n = get_node(s, zone_to_nid(zone));
+
+                       if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+                                       n->nr_partial > s->min_partial) {
+                               object = get_partial_node(s, n, c);
+                               if (object) {
+                                       /*
+                                        * Return the object even if
+                                        * put_mems_allowed indicated that
+                                        * the cpuset mems_allowed was
+                                        * updated in parallel. It's a
+                                        * harmless race between the alloc
+                                        * and the cpuset update.
+                                        */
+                                       put_mems_allowed(cpuset_mems_cookie);
+                                       return object;
+                               }
                         }
                 }
-       }
-       put_mems_allowed();
+       } while (!put_mems_allowed(cpuset_mems_cookie));
  #endif
         return NULL;
  }
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 440af1d..55d86c9 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2343,7 +2343,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
         unsigned long writeback_threshold;
         bool aborted_reclaim;
  
-       get_mems_allowed();
         delayacct_freepages_start();
  
         if (global_reclaim(sc))
@@ -2407,7 +2406,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
  
  out:
         delayacct_freepages_end();
-       put_mems_allowed();
  
         if (sc->nr_reclaimed)
                 return sc->nr_reclaimed;
author	Mel Gorman <mgorman@suse.de>
	Wed, 21 Mar 2012 23:34:11 +0000 (16:34 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 22 Mar 2012 00:54:59 +0000 (17:54 -0700)
include/linux/cpuset.h		patch \| blob \| history
include/linux/init_task.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/cpuset.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
mm/filemap.c		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/mempolicy.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/slab.c		patch \| blob \| history
mm/slub.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history