UBUNTU: Ubuntu-2.6.38-11.47
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 595d0ac..493b522 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/kmemcheck.h>
 #include <asm/div64.h>
 #include "internal.h"
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_);               /* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
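
The block above is only bookkeeping; callers are expected to go through the <linux/topology.h> accessors. A minimal sketch of the intended usage, assuming a caller that wants node-local kernel memory (the wrapper function is hypothetical; numa_mem_id() and kmalloc_node() are the real interfaces, and <linux/slab.h> plus <linux/topology.h> are needed):

	/* Allocate from the nearest node that actually has memory. */
	static void *alloc_near_buffer(size_t size)
	{
		/* numa_mem_id() == numa_node_id() unless the local node is memoryless */
		return kmalloc_node(size, GFP_KERNEL, numa_mem_id());
	}
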
+
 /*
  * Array of node states.
  */
@@ -87,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  * only be modified with pm_mutex held, unless the suspend/hibernate code is
  * guaranteed not to run in parallel with that modification).
  */
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
 {
        WARN_ON(!mutex_is_locked(&pm_mutex));
-       gfp_allowed_mask = mask;
+       if (saved_gfp_mask) {
+               gfp_allowed_mask = saved_gfp_mask;
+               saved_gfp_mask = 0;
+       }
 }
 
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
 {
-       gfp_t ret = gfp_allowed_mask;
-
        WARN_ON(!mutex_is_locked(&pm_mutex));
-       gfp_allowed_mask &= ~mask;
-       return ret;
+       WARN_ON(saved_gfp_mask);
+       saved_gfp_mask = gfp_allowed_mask;
+       gfp_allowed_mask &= ~GFP_IOFS;
 }
 #endif /* CONFIG_PM_SLEEP */
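
The new pair replaces set_gfp_allowed_mask()/clear_gfp_allowed_mask() with a save-and-restore model keyed on GFP_IOFS (__GFP_IO | __GFP_FS). A rough sketch of the expected pairing in the suspend path, assuming pm_mutex is already held by the caller (illustrative only; the real call sites live in kernel/power):

	pm_restrict_gfp_mask();		/* no I/O- or FS-backed allocations while devices sleep */
	error = suspend_devices_and_enter(state);
	pm_restore_gfp_mask();		/* restores the saved mask, if one was saved */
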
 
@@ -264,7 +286,7 @@ static void bad_page(struct page *page)
 
        /* Don't complain about poisoned pages */
        if (PageHWPoison(page)) {
-               __ClearPageBuddy(page);
+               reset_page_mapcount(page); /* remove PageBuddy */
                return;
        }
 
@@ -295,7 +317,7 @@ static void bad_page(struct page *page)
        dump_stack();
 out:
        /* Leave bad fields for debug, except PageBuddy could make trouble */
-       __ClearPageBuddy(page);
+       reset_page_mapcount(page); /* remove PageBuddy */
        add_taint(TAINT_BAD_PAGE);
 }
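
Clearing the buddy marker now goes through reset_page_mapcount() instead of __ClearPageBuddy(), because PG_buddy no longer exists (see the pageflag_names hunk at the end of this patch). For reference, the helper in <linux/mm.h> is essentially:

	static inline void reset_page_mapcount(struct page *page)
	{
		atomic_set(&(page)->_mapcount, -1);	/* -1 == not mapped, not a buddy */
	}
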
 
@@ -335,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
        }
 }
 
+/* update __split_huge_page_refcount if you change this function */
 static int destroy_compound_page(struct page *page, unsigned long order)
 {
        int i;
@@ -404,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
  *
  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
-       unsigned long buddy_idx = page_idx ^ (1 << order);
-
-       return page + (buddy_idx - page_idx);
-}
-
 static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
 {
-       return (page_idx & ~(1 << order));
+       return page_idx ^ (1 << order);
 }
 
 /*
@@ -426,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (c) a page and its buddy have the same order &&
  * (d) a page and its buddy are in the same zone.
  *
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount to -2.
+ * Setting, clearing, and testing a _mapcount of -2 is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
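
A sketch of the test this comment describes (the real PageBuddy() helper lives in the headers; this version is illustrative only):

	/* mirrors the "_mapcount == -2 means buddy" convention above */
	static inline int page_is_in_buddy_system(struct page *page)
	{
		return atomic_read(&page->_mapcount) == -2;
	}
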
@@ -460,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * as necessary, plus some accounting needed to play nicely with other
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of contiguous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with a _mapcount of -2. Page's
  * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other.  That is, if we allocate a small block, and both were   
@@ -477,6 +492,7 @@ static inline void __free_one_page(struct page *page,
 {
        unsigned long page_idx;
        unsigned long combined_idx;
+       unsigned long uninitialized_var(buddy_idx);
        struct page *buddy;
 
        if (unlikely(PageCompound(page)))
@@ -491,7 +507,8 @@ static inline void __free_one_page(struct page *page,
        VM_BUG_ON(bad_range(zone, page));
 
        while (order < MAX_ORDER-1) {
-               buddy = __page_find_buddy(page, page_idx, order);
+               buddy_idx = __find_buddy_index(page_idx, order);
+               buddy = page + (buddy_idx - page_idx);
                if (!page_is_buddy(page, buddy, order))
                        break;
 
@@ -499,7 +516,7 @@ static inline void __free_one_page(struct page *page,
                list_del(&buddy->lru);
                zone->free_area[order].nr_free--;
                rmv_page_order(buddy);
-               combined_idx = __find_combined_index(page_idx, order);
+               combined_idx = buddy_idx & page_idx;
                page = page + (combined_idx - page_idx);
                page_idx = combined_idx;
                order++;
@@ -514,11 +531,12 @@ static inline void __free_one_page(struct page *page,
         * so it's less likely to be used soon and more likely to be merged
         * as a higher order page
         */
-       if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+       if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
                struct page *higher_page, *higher_buddy;
-               combined_idx = __find_combined_index(page_idx, order);
-               higher_page = page + combined_idx - page_idx;
-               higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+               combined_idx = buddy_idx & page_idx;
+               higher_page = page + (combined_idx - page_idx);
+               buddy_idx = __find_buddy_index(combined_idx, order + 1);
+               higher_buddy = page + (buddy_idx - combined_idx);
                if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                        list_add_tail(&page->lru,
                                &zone->free_area[order].free_list[migratetype]);
@@ -572,13 +590,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
        int migratetype = 0;
        int batch_free = 0;
+       int to_free = count;
 
        spin_lock(&zone->lock);
        zone->all_unreclaimable = 0;
        zone->pages_scanned = 0;
 
-       __mod_zone_page_state(zone, NR_FREE_PAGES, count);
-       while (count) {
+       while (to_free) {
                struct page *page;
                struct list_head *list;
 
@@ -603,8 +621,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
                        __free_one_page(page, zone, 0, page_private(page));
                        trace_mm_page_pcpu_drain(page, 0, page_private(page));
-               } while (--count && --batch_free && !list_empty(list));
+               } while (--to_free && --batch_free && !list_empty(list));
        }
+       __mod_zone_page_state(zone, NR_FREE_PAGES, count);
        spin_unlock(&zone->lock);
 }
 
@@ -615,8 +634,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
        zone->all_unreclaimable = 0;
        zone->pages_scanned = 0;
 
-       __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
        __free_one_page(page, zone, order, migratetype);
+       __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
        spin_unlock(&zone->lock);
 }
 
@@ -628,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
        trace_mm_page_free_direct(page, order);
        kmemcheck_free_shadow(page, order);
 
-       for (i = 0; i < (1 << order); i++) {
-               struct page *pg = page + i;
-
-               if (PageAnon(pg))
-                       pg->mapping = NULL;
-               bad += free_pages_check(pg);
-       }
+       if (PageAnon(page))
+               page->mapping = NULL;
+       for (i = 0; i < (1 << order); i++)
+               bad += free_pages_check(page + i);
        if (bad)
                return false;
 
@@ -1072,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
                pset = per_cpu_ptr(zone->pageset, cpu);
 
                pcp = &pset->pcp;
-               free_pcppages_bulk(zone, pcp->count, pcp);
-               pcp->count = 0;
+               if (pcp->count) {
+                       free_pcppages_bulk(zone, pcp->count, pcp);
+                       pcp->count = 0;
+               }
                local_irq_restore(flags);
        }
 }
@@ -1437,24 +1455,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-                     int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags, long free_pages)
 {
        /* free_pages may go negative - that's OK */
        long min = mark;
-       long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
        int o;
 
+       free_pages -= (1 << order) + 1;
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
                min -= min / 4;
 
        if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-               return 0;
+               return false;
        for (o = 0; o < order; o++) {
                /* At the next order, this order's pages become unavailable */
                free_pages -= z->free_area[o].nr_free << o;
@@ -1463,9 +1481,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                min >>= 1;
 
                if (free_pages <= min)
-                       return 0;
+                       return false;
        }
-       return 1;
+       return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                       zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
+{
+       long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+       if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+               free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                                               free_pages);
 }
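
zone_watermark_ok_safe() exists because the per-cpu vmstat deltas can make zone_page_state(z, NR_FREE_PAGES) drift from the true free count; once the reading drops below zone->percpu_drift_mark, the _snapshot variant folds the per-cpu deltas back in before the watermark test. A sketch of what the snapshot amounts to, assuming the <linux/vmstat.h> helper of this era (illustrative, not the exact code):

	/* sum the per-cpu deltas on top of the global zone counter */
	static long nr_free_snapshot(struct zone *zone)
	{
		long x = atomic_long_read(&zone->vm_stat[NR_FREE_PAGES]);
		int cpu;

		for_each_online_cpu(cpu)
			x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[NR_FREE_PAGES];
		return x < 0 ? 0 : x;
	}
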
 
 #ifdef CONFIG_NUMA
@@ -1722,7 +1759,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        struct page *page;
 
        /* Acquire the OOM killer lock for the zones in zonelist */
-       if (!try_set_zone_oom(zonelist, gfp_mask)) {
+       if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
                schedule_timeout_uninterruptible(1);
                return NULL;
        }
@@ -1743,6 +1780,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                /* The OOM killer will not help higher order allocs */
                if (order > PAGE_ALLOC_COSTLY_ORDER)
                        goto out;
+               /* The OOM killer does not needlessly kill tasks for lowmem */
+               if (high_zoneidx < ZONE_NORMAL)
+                       goto out;
                /*
                 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
                 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -1767,15 +1807,18 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress)
+       int migratetype, unsigned long *did_some_progress,
+       bool sync_migration)
 {
        struct page *page;
 
        if (!order || compaction_deferred(preferred_zone))
                return NULL;
 
+       current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-                                                               nodemask);
+                                               nodemask, sync_migration);
+       current->flags &= ~PF_MEMALLOC;
        if (*did_some_progress != COMPACT_SKIPPED) {
 
                /* Page migration frees to the PCP lists but we want merging */
@@ -1811,7 +1854,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress)
+       int migratetype, unsigned long *did_some_progress,
+       bool sync_migration)
 {
        return NULL;
 }
@@ -1826,33 +1870,44 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 {
        struct page *page = NULL;
        struct reclaim_state reclaim_state;
-       struct task_struct *p = current;
+       bool drained = false;
 
        cond_resched();
 
        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
-       p->flags |= PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
        lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
-       p->reclaim_state = &reclaim_state;
+       current->reclaim_state = &reclaim_state;
 
        *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
 
-       p->reclaim_state = NULL;
+       current->reclaim_state = NULL;
        lockdep_clear_current_reclaim_state();
-       p->flags &= ~PF_MEMALLOC;
+       current->flags &= ~PF_MEMALLOC;
 
        cond_resched();
 
-       if (order != 0)
-               drain_all_pages();
+       if (unlikely(!(*did_some_progress)))
+               return NULL;
 
-       if (likely(*did_some_progress))
-               page = get_page_from_freelist(gfp_mask, nodemask, order,
+retry:
+       page = get_page_from_freelist(gfp_mask, nodemask, order,
                                        zonelist, high_zoneidx,
                                        alloc_flags, preferred_zone,
                                        migratetype);
+
+       /*
+        * If an allocation failed after direct reclaim, it could be because
+        * pages are pinned on the per-cpu lists. Drain them and try again
+        */
+       if (!page && !drained) {
+               drain_all_pages();
+               drained = true;
+               goto retry;
+       }
+
        return page;
 }
 
@@ -1874,7 +1929,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
                        preferred_zone, migratetype);
 
                if (!page && gfp_mask & __GFP_NOFAIL)
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
        } while (!page && (gfp_mask & __GFP_NOFAIL));
 
        return page;
@@ -1882,24 +1937,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 
 static inline
 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-                                               enum zone_type high_zoneidx)
+                                               enum zone_type high_zoneidx,
+                                               enum zone_type classzone_idx)
 {
        struct zoneref *z;
        struct zone *zone;
 
        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-               wakeup_kswapd(zone, order);
+               wakeup_kswapd(zone, order, classzone_idx);
 }
 
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
-       struct task_struct *p = current;
        int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
        const gfp_t wait = gfp_mask & __GFP_WAIT;
 
        /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
-       BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
+       BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
 
        /*
         * The caller may dip into page reserves a bit more if the caller
@@ -1907,21 +1962,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
         * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
         */
-       alloc_flags |= (gfp_mask & __GFP_HIGH);
+       alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
        if (!wait) {
-               alloc_flags |= ALLOC_HARDER;
+               /*
+                * Not worth trying to allocate harder for
+                * __GFP_NOMEMALLOC even if it can't schedule.
+                */
+               if  (!(gfp_mask & __GFP_NOMEMALLOC))
+                       alloc_flags |= ALLOC_HARDER;
                /*
                 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
                 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
                 */
                alloc_flags &= ~ALLOC_CPUSET;
-       } else if (unlikely(rt_task(p)) && !in_interrupt())
+       } else if (unlikely(rt_task(current)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
 
        if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
                if (!in_interrupt() &&
-                   ((p->flags & PF_MEMALLOC) ||
+                   ((current->flags & PF_MEMALLOC) ||
                     unlikely(test_thread_flag(TIF_MEMDIE))))
                        alloc_flags |= ALLOC_NO_WATERMARKS;
        }
@@ -1940,7 +2000,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        int alloc_flags;
        unsigned long pages_reclaimed = 0;
        unsigned long did_some_progress;
-       struct task_struct *p = current;
+       bool sync_migration = false;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -1965,7 +2025,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                goto nopage;
 
 restart:
-       wake_all_kswapd(order, zonelist, high_zoneidx);
+       if (!(gfp_mask & __GFP_NO_KSWAPD))
+               wake_all_kswapd(order, zonelist, high_zoneidx,
+                                               zone_idx(preferred_zone));
 
        /*
         * OK, we're below the kswapd watermark and have kicked background
@@ -1974,6 +2036,15 @@ restart:
         */
        alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+       /*
+        * Find the true preferred zone if the allocation is unconstrained by
+        * cpusets.
+        */
+       if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+               first_zones_zonelist(zonelist, high_zoneidx, NULL,
+                                       &preferred_zone);
+
+rebalance:
        /* This is the last chance, in general, before the goto nopage. */
        page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                        high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1981,7 +2052,6 @@ restart:
        if (page)
                goto got_pg;
 
-rebalance:
        /* Allocate without watermarks if the context allows */
        if (alloc_flags & ALLOC_NO_WATERMARKS) {
                page = __alloc_pages_high_priority(gfp_mask, order,
@@ -1996,21 +2066,26 @@ rebalance:
                goto nopage;
 
        /* Avoid recursion of direct reclaim */
-       if (p->flags & PF_MEMALLOC)
+       if (current->flags & PF_MEMALLOC)
                goto nopage;
 
        /* Avoid allocations with no watermarks from looping endlessly */
        if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
                goto nopage;
 
-       /* Try direct compaction */
+       /*
+        * Try direct compaction. The first pass is asynchronous. Subsequent
+        * attempts after direct reclaim are synchronous
+        */
        page = __alloc_pages_direct_compact(gfp_mask, order,
                                        zonelist, high_zoneidx,
                                        nodemask,
                                        alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress);
+                                       migratetype, &did_some_progress,
+                                       sync_migration);
        if (page)
                goto got_pg;
+       sync_migration = true;
 
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2036,15 +2111,23 @@ rebalance:
                        if (page)
                                goto got_pg;
 
-                       /*
-                        * The OOM killer does not trigger for high-order
-                        * ~__GFP_NOFAIL allocations so if no progress is being
-                        * made, there are no other options and retrying is
-                        * unlikely to help.
-                        */
-                       if (order > PAGE_ALLOC_COSTLY_ORDER &&
-                                               !(gfp_mask & __GFP_NOFAIL))
-                               goto nopage;
+                       if (!(gfp_mask & __GFP_NOFAIL)) {
+                               /*
+                                * The oom killer is not called for high-order
+                                * allocations that may fail, so if no progress
+                                * is being made, there are no other options and
+                                * retrying is unlikely to help.
+                                */
+                               if (order > PAGE_ALLOC_COSTLY_ORDER)
+                                       goto nopage;
+                               /*
+                                * The oom killer is not called for lowmem
+                                * allocations to prevent needlessly killing
+                                * innocent tasks.
+                                */
+                               if (high_zoneidx < ZONE_NORMAL)
+                                       goto nopage;
+                       }
 
                        goto restart;
                }
@@ -2054,15 +2137,29 @@ rebalance:
        pages_reclaimed += did_some_progress;
        if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
                /* Wait for some write requests to complete then retry */
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
+               wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
+       } else {
+                * High-order allocations do not necessarily loop after
+                * direct reclaim, and reclaim/compaction depends on compaction
+                * being called after reclaim, so call it directly if necessary.
+                * being called after reclaim so call directly if necessary
+                */
+               page = __alloc_pages_direct_compact(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, preferred_zone,
+                                       migratetype, &did_some_progress,
+                                       sync_migration);
+               if (page)
+                       goto got_pg;
        }
 
 nopage:
        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
                printk(KERN_WARNING "%s: page allocation failure."
                        " order:%d, mode:0x%x\n",
-                       p->comm, order, gfp_mask);
+                       current->comm, order, gfp_mask);
                dump_stack();
                show_mem();
        }
@@ -2105,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
        get_mems_allowed();
        /* The preferred zone is used for statistics later */
-       first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+       first_zones_zonelist(zonelist, high_zoneidx,
+                               nodemask ? : &cpuset_current_mems_allowed,
+                               &preferred_zone);
        if (!preferred_zone) {
                put_mems_allowed();
                return NULL;
@@ -2539,9 +2638,16 @@ static int __parse_numa_zonelist_order(char *s)
 
 static __init int setup_numa_zonelist_order(char *s)
 {
-       if (s)
-               return __parse_numa_zonelist_order(s);
-       return 0;
+       int ret;
+
+       if (!s)
+               return 0;
+
+       ret = __parse_numa_zonelist_order(s);
+       if (ret == 0)
+               strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+       return ret;
 }
 early_param("numa_zonelist_order", setup_numa_zonelist_order);
 
@@ -2571,8 +2677,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
                        strncpy((char*)table->data, saved_string,
                                NUMA_ZONELIST_ORDER_LEN);
                        user_zonelist_order = oldval;
-               } else if (oldval != user_zonelist_order)
-                       build_all_zonelists();
+               } else if (oldval != user_zonelist_order) {
+                       mutex_lock(&zonelists_mutex);
+                       build_all_zonelists(NULL);
+                       mutex_unlock(&zonelists_mutex);
+               }
        }
 out:
        mutex_unlock(&zl_order_mutex);
@@ -2853,6 +2962,24 @@ static void build_zonelist_cache(pg_data_t *pgdat)
                zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
 }
 
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Return node id of node used for "local" allocations.
+ * I.e., the node id of the first zone in the arg node's generic zonelist.
+ * Used for initializing percpu 'numa_mem', which is used primarily
+ * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
+ */
+int local_memory_node(int node)
+{
+       struct zone *zone;
+
+       (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+                                  gfp_zone(GFP_KERNEL),
+                                  NULL,
+                                  &zone);
+       return zone->node;
+}
+#endif
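
local_memory_node() is what seeds the per-cpu _numa_mem_ variable for each on-line CPU; the pattern, also visible in __build_all_zonelists() further down, is:

	set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
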
 
 #else  /* CONFIG_NUMA */
 
@@ -2922,9 +3049,16 @@ static void build_zonelist_cache(pg_data_t *pgdat)
  */
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
 
 /* return value is int just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
 {
        int nid;
        int cpu;
@@ -2952,13 +3086,31 @@ static int __build_all_zonelists(void *dummy)
         * needs the percpu allocator in order to allocate its pagesets
         * (a chicken-egg dilemma).
         */
-       for_each_possible_cpu(cpu)
+       for_each_possible_cpu(cpu) {
                setup_pageset(&per_cpu(boot_pageset, cpu), 0);
 
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+               /*
+                * We now know the "local memory node" for each node--
+                * i.e., the node of the first zone in the generic zonelist.
+                * Set up numa_mem percpu variable for on-line cpus.  During
+                * boot, only the boot cpu should be on-line;  we'll init the
+                * secondary cpus' numa_mem as they come on-line.  During
+                * node/memory hotplug, we'll fixup all on-line cpus.
+                */
+               if (cpu_online(cpu))
+                       set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+       }
+
        return 0;
 }
 
-void build_all_zonelists(void)
+/*
+ * Always called with zonelists_mutex held,
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void build_all_zonelists(void *data)
 {
        set_zonelist_order();
 
@@ -2969,6 +3121,10 @@ void build_all_zonelists(void)
        } else {
                /* we have to stop all cpus to guarantee there is no user
                   of zonelist */
+#ifdef CONFIG_MEMORY_HOTPLUG
+               if (data)
+                       setup_zone_pageset((struct zone *)data);
+#endif
                stop_machine(__build_all_zonelists, NULL, NULL);
                /* cpuset refresh routine should be here */
        }
@@ -3342,7 +3498,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 
        if (!slab_is_available()) {
                zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node(pgdat, alloc_size);
+                       alloc_bootmem_node_nopanic(pgdat, alloc_size);
        } else {
                /*
                 * This case means that a zone whose size was 0 gets new memory
@@ -3542,6 +3698,41 @@ void __init free_bootmem_with_active_regions(int nid,
        }
 }
 
+#ifdef CONFIG_HAVE_MEMBLOCK
+u64 __init find_memory_core_early(int nid, u64 size, u64 align,
+                                       u64 goal, u64 limit)
+{
+       int i;
+
+       /* Need to go over early_node_map to find a good range for the node */
+       for_each_active_range_index_in_nid(i, nid) {
+               u64 addr;
+               u64 ei_start, ei_last;
+               u64 final_start, final_end;
+
+               ei_last = early_node_map[i].end_pfn;
+               ei_last <<= PAGE_SHIFT;
+               ei_start = early_node_map[i].start_pfn;
+               ei_start <<= PAGE_SHIFT;
+
+               final_start = max(ei_start, goal);
+               final_end = min(ei_last, limit);
+
+               if (final_start >= final_end)
+                       continue;
+
+               addr = memblock_find_in_range(final_start, final_end, size, align);
+
+               if (addr == MEMBLOCK_ERROR)
+                       continue;
+
+               return addr;
+       }
+
+       return MEMBLOCK_ERROR;
+}
+#endif
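
The early_node_map[] entries hold page frame numbers, so the <<= PAGE_SHIFT above converts them to byte addresses before the memblock search; with 4 KiB pages, a start_pfn of 0x100 becomes address 0x100000, for instance. MEMBLOCK_ERROR is returned when no range in the node can satisfy the request, and the rewritten caller below propagates that as NULL.
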
+
 int __init add_from_early_node_map(struct range *range, int az,
                                   int nr_range, int nid)
 {
@@ -3561,38 +3752,26 @@ int __init add_from_early_node_map(struct range *range, int az,
 void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
                                        u64 goal, u64 limit)
 {
-       int i;
        void *ptr;
+       u64 addr;
 
-       /* need to go over early_node_map to find out good range for node */
-       for_each_active_range_index_in_nid(i, nid) {
-               u64 addr;
-               u64 ei_start, ei_last;
-
-               ei_last = early_node_map[i].end_pfn;
-               ei_last <<= PAGE_SHIFT;
-               ei_start = early_node_map[i].start_pfn;
-               ei_start <<= PAGE_SHIFT;
-               addr = find_early_area(ei_start, ei_last,
-                                        goal, limit, size, align);
+       if (limit > memblock.current_limit)
+               limit = memblock.current_limit;
 
-               if (addr == -1ULL)
-                       continue;
+       addr = find_memory_core_early(nid, size, align, goal, limit);
 
-#if 0
-               printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
-                               nid,
-                               ei_start, ei_last, goal, limit, size,
-                               align, addr);
-#endif
-
-               ptr = phys_to_virt(addr);
-               memset(ptr, 0, size);
-               reserve_early_without_check(addr, addr + size, "BOOTMEM");
-               return ptr;
-       }
+       if (addr == MEMBLOCK_ERROR)
+               return NULL;
 
-       return NULL;
+       ptr = phys_to_virt(addr);
+       memset(ptr, 0, size);
+       memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+       /*
+        * The min_count is set to 0 so that bootmem allocated blocks
+        * are never reported as leaks.
+        */
+       kmemleak_alloc(ptr, size, 0, 0);
+       return ptr;
 }
 #endif
 
@@ -3892,10 +4071,11 @@ static void __init setup_usemap(struct pglist_data *pgdat,
        unsigned long usemapsize = usemap_size(zonesize);
        zone->pageblock_flags = NULL;
        if (usemapsize)
-               zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
+               zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
+                                                                  usemapsize);
 }
 #else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
                                struct zone *zone, unsigned long zonesize) {}
 #endif /* CONFIG_SPARSEMEM */
 
@@ -4011,8 +4191,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                zone_seqlock_init(zone);
                zone->zone_pgdat = pgdat;
 
-               zone->prev_priority = DEF_PRIORITY;
-
                zone_pcp_init(zone);
                for_each_lru(l) {
                        INIT_LIST_HEAD(&zone->lru[l].list);
@@ -4060,7 +4238,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                size =  (end - start) * sizeof(struct page);
                map = alloc_remap(pgdat->node_id, size);
                if (!map)
-                       map = alloc_bootmem_node(pgdat, size);
+                       map = alloc_bootmem_node_nopanic(pgdat, size);
                pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
        }
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5082,9 +5260,9 @@ void *__init alloc_large_system_hash(const char *tablename,
        if (!table)
                panic("Failed to allocate %s hash table\n", tablename);
 
-       printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
+       printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
               tablename,
-              (1U << log2qty),
+              (1UL << log2qty),
               ilog2(size) - PAGE_SHIFT,
               size);
 
@@ -5181,12 +5359,64 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
  * the page allocator never allocates memory from an ISOLATE block.
  */
 
+static int
+__count_immobile_pages(struct zone *zone, struct page *page, int count)
+{
+       unsigned long pfn, iter, found;
+       /*
+        * To avoid noisy data, lru_add_drain_all() should be called first.
+        * If the zone is ZONE_MOVABLE, it never contains immobile pages.
+        */
+       if (zone_idx(zone) == ZONE_MOVABLE)
+               return true;
+
+       if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+               return true;
+
+       pfn = page_to_pfn(page);
+       for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+               unsigned long check = pfn + iter;
+
+               if (!pfn_valid_within(check))
+                       continue;
+
+               page = pfn_to_page(check);
+               if (!page_count(page)) {
+                       if (PageBuddy(page))
+                               iter += (1 << page_order(page)) - 1;
+                       continue;
+               }
+               if (!PageLRU(page))
+                       found++;
+               /*
+                * If there are RECLAIMABLE pages, we need to check them.
+                * But for now, memory offline itself doesn't call shrink_slab(),
+                * and this still needs to be fixed.
+                */
+               /*
+                * If the page is not RAM, page_count() should be 0;
+                * no further check is needed. This is a _used_, non-movable page.
+                *
+                * The problematic thing here is PG_reserved pages. PG_reserved
+                * is set on both memory hole pages and _used_ kernel
+                * pages at boot.
+                */
+               if (found > count)
+                       return false;
+       }
+       return true;
+}
+
+bool is_pageblock_removable_nolock(struct page *page)
+{
+       struct zone *zone = page_zone(page);
+       return __count_immobile_pages(zone, page, 0);
+}
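
is_pageblock_removable_nolock() packages the same scan for memory hot-remove checks. A rough sketch of how an offline-readiness walk might use it, pageblock by pageblock (the loop is illustrative, not the exact mm/memory_hotplug.c code):

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		if (!pfn_valid(pfn))
			continue;
		if (!is_pageblock_removable_nolock(pfn_to_page(pfn)))
			return 0;	/* found an immobile page: not removable */
	}
	return 1;
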
+
 int set_migratetype_isolate(struct page *page)
 {
        struct zone *zone;
-       struct page *curr_page;
-       unsigned long flags, pfn, iter;
-       unsigned long immobile = 0;
+       unsigned long flags, pfn;
        struct memory_isolate_notify arg;
        int notifier_ret;
        int ret = -EBUSY;
@@ -5196,11 +5426,6 @@ int set_migratetype_isolate(struct page *page)
        zone_idx = zone_idx(zone);
 
        spin_lock_irqsave(&zone->lock, flags);
-       if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
-           zone_idx == ZONE_MOVABLE) {
-               ret = 0;
-               goto out;
-       }
 
        pfn = page_to_pfn(page);
        arg.start_pfn = pfn;
@@ -5220,23 +5445,20 @@ int set_migratetype_isolate(struct page *page)
         */
        notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
        notifier_ret = notifier_to_errno(notifier_ret);
-       if (notifier_ret || !arg.pages_found)
+       if (notifier_ret)
                goto out;
-
-       for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
-               if (!pfn_valid_within(pfn))
-                       continue;
-
-               curr_page = pfn_to_page(iter);
-               if (!page_count(curr_page) || PageLRU(curr_page))
-                       continue;
-
-               immobile++;
-       }
-
-       if (arg.pages_found == immobile)
+       /*
+        * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+        * We just check MOVABLE pages.
+        */
+       if (__count_immobile_pages(zone, page, arg.pages_found))
                ret = 0;
 
+       /*
+        * immobile means "not-on-lru" pages. If immobile is larger than
+        * removable-by-driver pages reported by notifier, we'll fail.
+        */
+
 out:
        if (!ret) {
                set_pageblock_migratetype(page, MIGRATE_ISOLATE);
@@ -5355,7 +5577,6 @@ static struct trace_print_flags pageflag_names[] = {
        {1UL << PG_swapcache,           "swapcache"     },
        {1UL << PG_mappedtodisk,        "mappedtodisk"  },
        {1UL << PG_reclaim,             "reclaim"       },
-       {1UL << PG_buddy,               "buddy"         },
        {1UL << PG_swapbacked,          "swapbacked"    },
        {1UL << PG_unevictable,         "unevictable"   },
 #ifdef CONFIG_MMU
@@ -5403,7 +5624,7 @@ void dump_page(struct page *page)
 {
        printk(KERN_ALERT
               "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
-               page, page_count(page), page_mapcount(page),
+               page, atomic_read(&page->_count), page_mapcount(page),
                page->mapping, page->index);
        dump_page_flags(page->flags);
 }