- Update to 3.4-rc7.

[linux-flexiantxendom0-3.2.10.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 75ace52..e5a3966 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -30,6 +30,7 @@
  #include <linux/pagevec.h>
  #include <linux/blkdev.h>
  #include <linux/slab.h>
+#include <linux/ratelimit.h>
  #include <linux/oom.h>
  #include <linux/notifier.h>
  #include <linux/topology.h>
@@ -39,6 +40,7 @@
  #include <linux/memory_hotplug.h>
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
+#include <linux/vmstat.h>
  #include <linux/mempolicy.h>
  #include <linux/stop_machine.h>
  #include <linux/sort.h>
@@ -54,6 +56,8 @@
  #include <trace/events/kmem.h>
  #include <linux/ftrace_event.h>
  #include <linux/memcontrol.h>
+#include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@ -93,6 +97,14 @@ EXPORT_SYMBOL(node_states);
  
  unsigned long totalram_pages __read_mostly;
  unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory.  This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
  int percpu_pagelist_fraction;
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  
@@ -124,6 +136,13 @@ void pm_restrict_gfp_mask(void)
         saved_gfp_mask = gfp_allowed_mask;
         gfp_allowed_mask &= ~GFP_IOFS;
  }
+
+/* Report storage as suspended while any GFP_IOFS bit is masked out. */
+bool pm_suspended_storage(void)
+{
+       gfp_t io_fs_bits = gfp_allowed_mask & GFP_IOFS;
+       return io_fs_bits != GFP_IOFS;
+}
  #endif /* CONFIG_PM_SLEEP */
  
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -178,39 +197,17 @@ static unsigned long __meminitdata nr_kernel_pages;
  static unsigned long __meminitdata nr_all_pages;
  static unsigned long __meminitdata dma_reserve;
  
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-  /*
-   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
-   * ranges of memory (RAM) that may be registered with add_active_range().
-   * Ranges passed to add_active_range() will be merged if possible
-   * so the number of times add_active_range() can be called is
-   * related to the number of nodes and the number of holes
-   */
-  #ifdef CONFIG_MAX_ACTIVE_REGIONS
-    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
-    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
-  #else
-    #if MAX_NUMNODES >= 32
-      /* If there can be many nodes, allow up to 50 holes per node */
-      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
-    #else
-      /* By default, allow up to 256 distinct regions */
-      #define MAX_ACTIVE_REGIONS 256
-    #endif
-  #endif
-
-  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
-  static int __meminitdata nr_nodemap_entries;
-  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-  static unsigned long __initdata required_kernelcore;
-  static unsigned long __initdata required_movablecore;
-  static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
-
-  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
-  int movable_zone;
-  EXPORT_SYMBOL(movable_zone);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __initdata required_kernelcore;
+static unsigned long __initdata required_movablecore;
+static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+
+/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+int movable_zone;
+EXPORT_SYMBOL(movable_zone);
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
  
  #if MAX_NUMNODES > 1
  int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -315,6 +312,7 @@ static void bad_page(struct page *page)
                 current->comm, page_to_pfn(page));
         dump_page(page);
  
+       print_modules();
         dump_stack();
  out:
         /* Leave bad fields for debug, except PageBuddy could make trouble */
@@ -329,8 +327,8 @@ out:
   *
   * The remaining PAGE_SIZE pages are called "tail pages".
   *
- * All pages have PG_compound set.  All pages have their ->private pointing at
- * the head page (even the head page has this).
+ * All pages have PG_compound set.  All tail pages have their ->first_page
+ * pointing at the head page.
   *
   * The first tail page's ->lru.next holds the address of the compound page's
   * put_page() function.  Its ->lru.prev holds the order of allocation.
@@ -352,8 +350,8 @@ void prep_compound_page(struct page *page, unsigned long order)
         __SetPageHead(page);
         for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
-
                 __SetPageTail(p);
+               set_page_count(p, 0);
                 p->first_page = page;
         }
  }
@@ -399,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
                 clear_highpage(page + i);
  }
  
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+       unsigned long res;
+
+       if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
+               printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+               return 0;
+       }
+       _debug_guardpage_minorder = res;
+       printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+       return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+       __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+       __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
  static inline void set_page_order(struct page *page, int order)
  {
         set_page_private(page, order);
@@ -456,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
         if (page_zone_id(page) != page_zone_id(buddy))
                 return 0;
  
+       if (page_is_guard(buddy) && page_order(buddy) == order) {
+               VM_BUG_ON(page_count(buddy) != 0);
+               return 1;
+       }
+
         if (PageBuddy(buddy) && page_order(buddy) == order) {
                 VM_BUG_ON(page_count(buddy) != 0);
                 return 1;
@@ -512,11 +546,19 @@ static inline void __free_one_page(struct page *page,
                 buddy = page + (buddy_idx - page_idx);
                 if (!page_is_buddy(page, buddy, order))
                         break;
-
-               /* Our buddy is free, merge with it and move up one order. */
-               list_del(&buddy->lru);
-               zone->free_area[order].nr_free--;
-               rmv_page_order(buddy);
+               /*
+                * Our buddy is free or a CONFIG_DEBUG_PAGEALLOC guard page.
+                * Reset the buddy's state, then merge and move up one order.
+                */
+               if (page_is_guard(buddy)) {
+                       clear_page_guard_flag(buddy);
+                       set_page_private(buddy, 0);
+                       __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+               } else {
+                       list_del(&buddy->lru);
+                       zone->free_area[order].nr_free--;
+                       rmv_page_order(buddy);
+               }
                 combined_idx = buddy_idx & page_idx;
                 page = page + (combined_idx - page_idx);
                 page_idx = combined_idx;
@@ -650,7 +692,14 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
         int i;
         int bad = 0;
  
-       trace_mm_page_free_direct(page, order);
+#ifdef CONFIG_XEN
+       if (PageForeign(page)) {
+               PageForeignDestructor(page, order);
+               return false;
+       }
+#endif
+
+       trace_mm_page_free(page, order);
         kmemcheck_free_shadow(page, order);
  
         if (PageAnon(page))
@@ -676,6 +725,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
         unsigned long flags;
         int wasMlocked = __TestClearPageMlocked(page);
  
+#ifdef CONFIG_XEN
+       WARN_ON(PageForeign(page) && wasMlocked);
+#endif
         if (!free_pages_prepare(page, order))
                 return;
  
@@ -688,32 +740,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
         local_irq_restore(flags);
  }
  
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
  void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
  {
-       if (order == 0) {
-               __ClearPageReserved(page);
-               set_page_count(page, 0);
-               set_page_refcounted(page);
-               __free_page(page);
-       } else {
-               int loop;
+       unsigned int nr_pages = 1 << order;
+       unsigned int loop;
  
-               prefetchw(page);
-               for (loop = 0; loop < BITS_PER_LONG; loop++) {
-                       struct page *p = &page[loop];
-
-                       if (loop + 1 < BITS_PER_LONG)
-                               prefetchw(p + 1);
-                       __ClearPageReserved(p);
-                       set_page_count(p, 0);
-               }
+       prefetchw(page);
+       for (loop = 0; loop < nr_pages; loop++) {
+               struct page *p = &page[loop];
  
-               set_page_refcounted(page);
-               __free_pages(page, order);
+               if (loop + 1 < nr_pages)
+                       prefetchw(p + 1);
+               __ClearPageReserved(p);
+               set_page_count(p, 0);
         }
+
+       set_page_refcounted(page);
+       __free_pages(page, order);
  }
  
  
@@ -742,6 +785,23 @@ static inline void expand(struct zone *zone, struct page *page,
                 high--;
                 size >>= 1;
                 VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+               if (high < debug_guardpage_minorder()) {
+                       /*
+                        * Mark as guard pages (or page) so that they can be
+                        * merged back into the allocator when the buddy is freed.
+                        * The corresponding page table entries are not touched;
+                        * the pages stay not present in the virtual address space.
+                        */
+                       INIT_LIST_HEAD(&page[size].lru);
+                       set_page_guard_flag(&page[size]);
+                       set_page_private(&page[size], high);
+                       /* Guard pages are not available for any usage */
+                       __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+                       continue;
+               }
+#endif
                 list_add(&page[size].lru, &area->free_list[migratetype]);
                 area->nr_free++;
                 set_page_order(&page[size], high);
@@ -942,7 +1002,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
                          * If breaking a large block of pages, move all free
                          * pages to the preferred allocation list. If falling
                          * back for a reclaimable kernel allocation, be more
-                        * agressive about taking ownership of free pages
+                        * aggressive about taking ownership of free pages
                          */
                         if (unlikely(current_order >= (pageblock_order >> 1)) ||
                                         start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1111,11 +1171,47 @@ void drain_local_pages(void *arg)
  }
  
  /*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * Note that this code is protected against sending an IPI to an offline
+ * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
+ * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
+ * nothing keeps CPUs from showing up after we populated the cpumask and
+ * before the call to on_each_cpu_mask().
   */
  void drain_all_pages(void)
  {
-       on_each_cpu(drain_local_pages, NULL, 1);
+       int cpu;
+       struct per_cpu_pageset *pcp;
+       struct zone *zone;
+
+       /*
+        * Allocate in the BSS so we won't require allocation in
+        * the direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+        */
+       static cpumask_t cpus_with_pcps;
+
+       /*
+        * We don't care about racing with CPU hotplug event
+        * as offline notification will cause the notified
+        * cpu to drain that CPU pcps and on_each_cpu_mask
+        * disables preemption as part of its processing
+        */
+       for_each_online_cpu(cpu) {
+               bool has_pcps = false;
+               for_each_populated_zone(zone) {
+                       pcp = per_cpu_ptr(zone->pageset, cpu);
+                       if (pcp->pcp.count) {
+                               has_pcps = true;
+                               break;
+                       }
+               }
+               if (has_pcps)
+                       cpumask_set_cpu(cpu, &cpus_with_pcps);
+               else
+                       cpumask_clear_cpu(cpu, &cpus_with_pcps);
+       }
+       on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
  }
  
  #ifdef CONFIG_HIBERNATION
@@ -1166,6 +1262,9 @@ void free_hot_cold_page(struct page *page, int cold)
         int migratetype;
         int wasMlocked = __TestClearPageMlocked(page);
  
+#ifdef CONFIG_XEN
+       WARN_ON(PageForeign(page) && wasMlocked);
+#endif
         if (!free_pages_prepare(page, 0))
                 return;
  
@@ -1207,6 +1306,19 @@ out:
  }
  
  /*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+       struct page *page, *next;
+
+       list_for_each_entry_safe(page, next, list, lru) {
+               trace_mm_page_free_batched(page, cold);
+               free_hot_cold_page(page, cold);
+       }
+}
+
+/*
   * split_page takes a non-compound higher-order page, and splits it into
   * n (1<<order) sub-pages: page[0..n]
   * Each sub-page must be freed individually.
@@ -1367,21 +1479,12 @@ failed:
  
  #ifdef CONFIG_FAIL_PAGE_ALLOC
  
-static struct fail_page_alloc_attr {
+static struct {
         struct fault_attr attr;
  
         u32 ignore_gfp_highmem;
         u32 ignore_gfp_wait;
         u32 min_order;
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-       struct dentry *ignore_gfp_highmem_file;
-       struct dentry *ignore_gfp_wait_file;
-       struct dentry *min_order_file;
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
  } fail_page_alloc = {
         .attr = FAULT_ATTR_INITIALIZER,
         .ignore_gfp_wait = 1,
@@ -1413,38 +1516,29 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  
  static int __init fail_page_alloc_debugfs(void)
  {
-       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+       umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
         struct dentry *dir;
-       int err;
-
-       err = init_fault_attr_dentries(&fail_page_alloc.attr,
-                                      "fail_page_alloc");
-       if (err)
-               return err;
-       dir = fail_page_alloc.attr.dentries.dir;
-
-       fail_page_alloc.ignore_gfp_wait_file =
-               debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                                     &fail_page_alloc.ignore_gfp_wait);
-
-       fail_page_alloc.ignore_gfp_highmem_file =
-               debugfs_create_bool("ignore-gfp-highmem", mode, dir,
-                                     &fail_page_alloc.ignore_gfp_highmem);
-       fail_page_alloc.min_order_file =
-               debugfs_create_u32("min-order", mode, dir,
-                                  &fail_page_alloc.min_order);
-
-       if (!fail_page_alloc.ignore_gfp_wait_file ||
-            !fail_page_alloc.ignore_gfp_highmem_file ||
-            !fail_page_alloc.min_order_file) {
-               err = -ENOMEM;
-               debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
-               debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
-               debugfs_remove(fail_page_alloc.min_order_file);
-               cleanup_fault_attr_dentries(&fail_page_alloc.attr);
-       }
  
-       return err;
+       dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+                                       &fail_page_alloc.attr);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);
+
+       if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                               &fail_page_alloc.ignore_gfp_wait))
+               goto fail;
+       if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+                               &fail_page_alloc.ignore_gfp_highmem))
+               goto fail;
+       if (!debugfs_create_u32("min-order", mode, dir,
+                               &fail_page_alloc.min_order))
+               goto fail;
+
+       return 0;
+fail:
+       debugfs_remove_recursive(dir);
+
+       return -ENOMEM;
  }
  
  late_initcall(fail_page_alloc_debugfs);
@@ -1471,7 +1565,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
         long min = mark;
         int o;
  
-       free_pages -= (1 << order) + 1;
+       free_pages -= (1 << order) - 1;
         if (alloc_flags & ALLOC_HIGH)
                 min -= min / 2;
         if (alloc_flags & ALLOC_HARDER)
@@ -1613,6 +1707,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
         set_bit(i, zlc->fullzones);
  }
  
+/*
+ * clear all zones full, called after direct reclaim makes progress so that
+ * a zone that was recently full is not skipped over for up to a second
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+       struct zonelist_cache *cache = zonelist->zlcache_ptr;
+
+       /* Without a zonelist cache there are no "full" marks to drop. */
+       if (!cache)
+               return;
+
+       bitmap_zero(cache->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
  #else  /* CONFIG_NUMA */
  
  static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1629,6 +1738,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
  static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
  {
  }
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+}
  #endif /* CONFIG_NUMA */
  
  /*
@@ -1661,7 +1774,36 @@ zonelist_scan:
                                 continue;
                 if ((alloc_flags & ALLOC_CPUSET) &&
                         !cpuset_zone_allowed_softwall(zone, gfp_mask))
-                               goto try_next_zone;
+                               continue;
+               /*
+                * When allocating a page cache page for writing, we
+                * want to get it from a zone that is within its dirty
+                * limit, such that no single zone holds more than its
+                * proportional share of globally allowed dirty pages.
+                * The dirty limits take into account the zone's
+                * lowmem reserves and high watermark so that kswapd
+                * should be able to balance it without having to
+                * write pages from its LRU list.
+                *
+                * This may look like it could increase pressure on
+                * lower zones by failing allocations in higher zones
+                * before they are full.  But the pages that do spill
+                * over are limited as the lower zones are protected
+                * by this very same mechanism.  It should not become
+                * a practical burden to them.
+                *
+                * XXX: For now, allow allocations to potentially
+                * exceed the per-zone dirty limit in the slowpath
+                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * which is important when on a NUMA setup the allowed
+                * zones are together not big enough to reach the
+                * global limit.  The proper fix for these situations
+                * will require awareness of zones in the
+                * dirty-throttling and the flusher threads.
+                */
+               if ((alloc_flags & ALLOC_WMARK_LOW) &&
+                   (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+                       goto this_zone_full;
  
                 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1673,17 +1815,36 @@ zonelist_scan:
                                     classzone_idx, alloc_flags))
                                 goto try_this_zone;
  
+                       if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+                               /*
+                                * we do zlc_setup if there are multiple nodes
+                                * and before considering the first zone allowed
+                                * by the cpuset.
+                                */
+                               allowednodes = zlc_setup(zonelist, alloc_flags);
+                               zlc_active = 1;
+                               did_zlc_setup = 1;
+                       }
+
                         if (zone_reclaim_mode == 0)
                                 goto this_zone_full;
  
+                       /*
+                        * As we may have just activated ZLC, check if the first
+                        * eligible zone has failed zone_reclaim recently.
+                        */
+                       if (NUMA_BUILD && zlc_active &&
+                               !zlc_zone_worth_trying(zonelist, z, allowednodes))
+                               continue;
+
                         ret = zone_reclaim(zone, gfp_mask, order);
                         switch (ret) {
                         case ZONE_RECLAIM_NOSCAN:
                                 /* did not scan */
-                               goto try_next_zone;
+                               continue;
                         case ZONE_RECLAIM_FULL:
                                 /* scanned but unreclaimable */
-                               goto this_zone_full;
+                               continue;
                         default:
                                 /* did we reclaim enough */
                                 if (!zone_watermark_ok(zone, order, mark,
@@ -1700,16 +1861,6 @@ try_this_zone:
  this_zone_full:
                 if (NUMA_BUILD)
                         zlc_mark_zone_full(zonelist, z);
-try_next_zone:
-               if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
-                       /*
-                        * we do zlc_setup after the first zone is tried but only
-                        * if there are multiple nodes make it worthwhile
-                        */
-                       allowednodes = zlc_setup(zonelist, alloc_flags);
-                       zlc_active = 1;
-                       did_zlc_setup = 1;
-               }
         }
  
         if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1734,14 +1885,79 @@ static inline bool should_suppress_show_mem(void)
         return ret;
  }
  
+static DEFINE_RATELIMIT_STATE(nopage_rs,
+               DEFAULT_RATELIMIT_INTERVAL,
+               DEFAULT_RATELIMIT_BURST);
+
+void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+{
+       unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+       /* __GFP_NOWARN is tested first, so __ratelimit() is not called for it. */
+       if (gfp_mask & __GFP_NOWARN)
+               return;
+       if (!__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0)
+               return;
+
+       /*
+        * Allocations from contexts that are allowed to go outside
+        * current's set of allowed nodes drop the node filter.
+        */
+       if (!(gfp_mask & __GFP_NOMEMALLOC) &&
+           (test_thread_flag(TIF_MEMDIE) ||
+            (current->flags & (PF_MEMALLOC | PF_EXITING))))
+               filter &= ~SHOW_MEM_FILTER_NODES;
+       if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+               filter &= ~SHOW_MEM_FILTER_NODES;
+
+       /* The caller's own message, when supplied, goes out first. */
+       if (fmt) {
+               va_list args;
+               struct va_format vaf;
+
+               va_start(args, fmt);
+               vaf.fmt = fmt;
+               vaf.va = &args;
+               pr_warn("%pV", &vaf);
+               va_end(args);
+       }
+
+       if (!(gfp_mask & __GFP_WAIT)) {
+               pr_info("The following is only an harmless informational message.\n");
+               pr_info("Unless you get a _continuous_flood_ of these messages it means\n");
+               pr_info("everything is working fine. Allocations from irqs cannot be\n");
+               pr_info("perfectly reliable and the kernel is designed to handle that.\n");
+       }
+       pr_info("%s: page allocation failure. order:%d, mode:0x%x\n",
+               current->comm, order, gfp_mask);
+
+       dump_stack();
+       if (should_suppress_show_mem())
+               return;
+       show_mem(filter);
+}
+
  static inline int
  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+                               unsigned long did_some_progress,
                                 unsigned long pages_reclaimed)
  {
         /* Do not loop if specifically requested */
         if (gfp_mask & __GFP_NORETRY)
                 return 0;
  
+       /* Always retry if specifically requested */
+       if (gfp_mask & __GFP_NOFAIL)
+               return 1;
+
+       /*
+        * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+        * making forward progress without invoking OOM. Suspend also disables
+        * storage devices so kswapd will not help. Bail if we are suspending.
+        */
+       if (!did_some_progress && pm_suspended_storage())
+               return 0;
+
         /*
          * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
          * means __GFP_NOFAIL, but that may not be true in other
@@ -1760,13 +1976,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
         if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
                 return 1;
  
-       /*
-        * Don't let big-order allocations loop unless the caller
-        * explicitly requests that.
-        */
-       if (gfp_mask & __GFP_NOFAIL)
-               return 1;
-
         return 0;
  }
  
@@ -1814,7 +2023,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                         goto out;
         }
         /* Exhausted what can be done so it's blamo time */
-       out_of_memory(zonelist, gfp_mask, order, nodemask);
+       out_of_memory(zonelist, gfp_mask, order, nodemask, false);
  
  out:
         clear_zonelist_oom(zonelist, gfp_mask);
@@ -1827,14 +2036,20 @@ static struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
         nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress,
-       bool sync_migration)
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
  {
         struct page *page;
  
-       if (!order || compaction_deferred(preferred_zone))
+       if (!order)
                 return NULL;
  
+       if (compaction_deferred(preferred_zone, order)) {
+               *deferred_compaction = true;
+               return NULL;
+       }
+
         current->flags |= PF_MEMALLOC;
         *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                 nodemask, sync_migration);
@@ -1852,6 +2067,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                 if (page) {
                         preferred_zone->compact_considered = 0;
                         preferred_zone->compact_defer_shift = 0;
+                       if (order >= preferred_zone->compact_order_failed)
+                               preferred_zone->compact_order_failed = order + 1;
                         count_vm_event(COMPACTSUCCESS);
                         return page;
                 }
@@ -1862,7 +2079,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                  * but not enough to satisfy watermarks.
                  */
                 count_vm_event(COMPACTFAIL);
-               defer_compaction(preferred_zone);
+
+               /*
+                * As async compaction considers a subset of pageblocks, only
+                * defer if the failure was a sync compaction failure.
+                */
+               if (sync_migration)
+                       defer_compaction(preferred_zone, order);
  
                 cond_resched();
         }
@@ -1874,8 +2097,9 @@ static inline struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
         nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress,
-       bool sync_migration)
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
  {
         return NULL;
  }
@@ -1912,6 +2136,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
         if (unlikely(!(*did_some_progress)))
                 return NULL;
  
+       /* After successful reclaim, reconsider all zones for allocation */
+       if (NUMA_BUILD)
+               zlc_clear_zones_full(zonelist);
+
  retry:
         page = get_page_from_freelist(gfp_mask, nodemask, order,
                                         zonelist, high_zoneidx,
@@ -2021,6 +2249,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         unsigned long pages_reclaimed = 0;
         unsigned long did_some_progress;
         bool sync_migration = false;
+       bool deferred_compaction = false;
  
         /*
          * In the slowpath, we sanity check order to avoid ever trying to
@@ -2064,6 +2293,7 @@ restart:
                 first_zones_zonelist(zonelist, high_zoneidx, NULL,
                                         &preferred_zone);
  
+rebalance:
         /* This is the last chance, in general, before the goto nopage. */
         page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                         high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2071,7 +2301,6 @@ restart:
         if (page)
                 goto got_pg;
  
-rebalance:
         /* Allocate without watermarks if the context allows */
         if (alloc_flags & ALLOC_NO_WATERMARKS) {
                 page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2101,11 +2330,21 @@ rebalance:
                                         zonelist, high_zoneidx,
                                         nodemask,
                                         alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress,
-                                       sync_migration);
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
         if (page)
                 goto got_pg;
-       sync_migration = !(gfp_mask & __GFP_NO_KSWAPD);
+       sync_migration = true;
+
+       /*
+        * If compaction is deferred for high-order allocations, it is because
+        * sync compaction recently failed. If this is the case and the caller
+        * has requested the system not be heavily disrupted, fail the
+        * allocation now instead of entering direct reclaim
+        */
+       if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+               goto nopage;
  
         /* Try direct reclaim and then allocating */
         page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2124,6 +2363,10 @@ rebalance:
                 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
                         if (oom_killer_disabled)
                                 goto nopage;
+                       /* Coredumps can quickly deplete all memory reserves */
+                       if ((current->flags & PF_DUMPCORE) &&
+                           !(gfp_mask & __GFP_NOFAIL))
+                               goto nopage;
                         page = __alloc_pages_may_oom(gfp_mask, order,
                                         zonelist, high_zoneidx,
                                         nodemask, preferred_zone,
@@ -2155,7 +2398,8 @@ rebalance:
  
         /* Check if we should retry the allocation */
         pages_reclaimed += did_some_progress;
-       if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+       if (should_alloc_retry(gfp_mask, order, did_some_progress,
+                                               pages_reclaimed)) {
                 /* Wait for some write requests to complete then retry */
                 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                 goto rebalance;
@@ -2169,40 +2413,15 @@ rebalance:
                                         zonelist, high_zoneidx,
                                         nodemask,
                                         alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress,
-                                       sync_migration);
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
                 if (page)
                         goto got_pg;
         }
  
  nopage:
-       if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
-               unsigned int filter = SHOW_MEM_FILTER_NODES;
-
-               /*
-                * This documents exceptions given to allocations in certain
-                * contexts that are allowed to allocate outside current's set
-                * of allowed nodes.
-                */
-               if (!(gfp_mask & __GFP_NOMEMALLOC))
-                       if (test_thread_flag(TIF_MEMDIE) ||
-                           (current->flags & (PF_MEMALLOC | PF_EXITING)))
-                               filter &= ~SHOW_MEM_FILTER_NODES;
-               if (in_interrupt() || !wait)
-                       filter &= ~SHOW_MEM_FILTER_NODES;
-
-               if (!wait) {
-                       pr_info("The following is only an harmless informational message.\n");
-                       pr_info("Unless you get a _continuous_flood_ of these messages it means\n");
-                       pr_info("everything is working fine. Allocations from irqs cannot be\n");
-                       pr_info("perfectly reliable and the kernel is designed to handle that.\n");
-               }
-               pr_info("%s: page allocation failure. order:%d, mode:0x%x\n",
-                       current->comm, order, gfp_mask);
-               dump_stack();
-               if (!should_suppress_show_mem())
-                       show_mem(filter);
-       }
+       warn_alloc_failed(gfp_mask, order, NULL);
         return page;
  got_pg:
         if (kmemcheck_enabled)
@@ -2220,8 +2439,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
  {
         enum zone_type high_zoneidx = gfp_zone(gfp_mask);
         struct zone *preferred_zone;
-       struct page *page;
+       struct page *page = NULL;
         int migratetype = allocflags_to_migratetype(gfp_mask);
+       unsigned int cpuset_mems_cookie;
  
         gfp_mask &= gfp_allowed_mask;
  
@@ -2240,15 +2460,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
         if (unlikely(!zonelist->_zonerefs->zone))
                 return NULL;
  
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+
         /* The preferred zone is used for statistics later */
         first_zones_zonelist(zonelist, high_zoneidx,
                                 nodemask ? : &cpuset_current_mems_allowed,
                                 &preferred_zone);
-       if (!preferred_zone) {
-               put_mems_allowed();
-               return NULL;
-       }
+       if (!preferred_zone)
+               goto out;
  
         /* First allocation attempt */
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2258,9 +2478,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                 page = __alloc_pages_slowpath(gfp_mask, order,
                                 zonelist, high_zoneidx, nodemask,
                                 preferred_zone, migratetype);
-       put_mems_allowed();
  
         trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+       /*
+        * When updating a task's mems_allowed, it is possible to race with
+        * parallel threads in such a way that an allocation can fail while
+        * the mask is being updated. If a page allocation is about to fail,
+        * check if the cpuset changed during allocation and if so, retry.
+        */
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
+
         return page;
  }
  EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2291,16 +2521,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
  }
  EXPORT_SYMBOL(get_zeroed_page);
  
-void __pagevec_free(struct pagevec *pvec)
-{
-       int i = pagevec_count(pvec);
-
-       while (--i >= 0) {
-               trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
-               free_hot_cold_page(pvec->pages[i], pvec->cold);
-       }
-}
-
  void __free_pages(struct page *page, unsigned int order)
  {
         if (put_page_testzero(page)) {
@@ -2323,6 +2543,21 @@ void free_pages(unsigned long addr, unsigned int order)
  
  EXPORT_SYMBOL(free_pages);
  
+static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+{
+       if (addr) {
+               unsigned long alloc_end = addr + (PAGE_SIZE << order);
+               unsigned long used = addr + PAGE_ALIGN(size);
+
+               split_page(virt_to_page((void *)addr), order);
+               while (used < alloc_end) {
+                       free_page(used);
+                       used += PAGE_SIZE;
+               }
+       }
+       return (void *)addr;
+}
+
  /**
   * alloc_pages_exact - allocate an exact number physically-contiguous pages.
   * @size: the number of bytes to allocate
@@ -2342,22 +2577,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
         unsigned long addr;
  
         addr = __get_free_pages(gfp_mask, order);
-       if (addr) {
-               unsigned long alloc_end = addr + (PAGE_SIZE << order);
-               unsigned long used = addr + PAGE_ALIGN(size);
-
-               split_page(virt_to_page((void *)addr), order);
-               while (used < alloc_end) {
-                       free_page(used);
-                       used += PAGE_SIZE;
-               }
-       }
-
-       return (void *)addr;
+       return make_alloc_exact(addr, order, size);
  }
  EXPORT_SYMBOL(alloc_pages_exact);
  
  /**
+ * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
+ *                        pages on a node.
+ * @nid: the preferred node ID where memory should be allocated
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * Like alloc_pages_exact(), but try to allocate on node nid first before falling
+ * back.
+ * Note this is not alloc_pages_exact_node() which allocates on a specific node,
+ * but is not exact.
+ */
+void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+{
+       unsigned order = get_order(size);
+       struct page *p = alloc_pages_node(nid, gfp_mask, order);
+       if (!p)
+               return NULL;
+       return make_alloc_exact((unsigned long)page_address(p), order, size);
+}
+EXPORT_SYMBOL(alloc_pages_exact_nid);
+
+/**
   * free_pages_exact - release memory allocated via alloc_pages_exact()
   * @virt: the value returned by alloc_pages_exact.
   * @size: size of allocation, same value as passed to alloc_pages_exact().
@@ -2452,20 +2698,21 @@ void si_meminfo_node(struct sysinfo *val, int nid)
  #endif
  
  /*
- * Determine whether the zone's node should be displayed or not, depending on
- * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas().
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
   */
-static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
+bool skip_free_areas_node(unsigned int flags, int nid)
  {
         bool ret = false;
+       unsigned int cpuset_mems_cookie;
  
         if (!(flags & SHOW_MEM_FILTER_NODES))
                 goto out;
  
-       get_mems_allowed();
-       ret = !node_isset(zone->zone_pgdat->node_id,
-                               cpuset_current_mems_allowed);
-       put_mems_allowed();
+       do {
+               cpuset_mems_cookie = get_mems_allowed();
+               ret = !node_isset(nid, cpuset_current_mems_allowed);
+       } while (!put_mems_allowed(cpuset_mems_cookie));
  out:
         return ret;
  }
@@ -2479,13 +2726,13 @@ out:
   * Suppresses nodes that are not allowed by current's cpuset if
   * SHOW_MEM_FILTER_NODES is passed.
   */
-void __show_free_areas(unsigned int filter)
+void show_free_areas(unsigned int filter)
  {
         int cpu;
         struct zone *zone;
  
         for_each_populated_zone(zone) {
-               if (skip_free_areas_zone(filter, zone))
+               if (skip_free_areas_node(filter, zone_to_nid(zone)))
                         continue;
                 show_node(zone);
                 printk("%s per-cpu:\n", zone->name);
@@ -2528,7 +2775,7 @@ void __show_free_areas(unsigned int filter)
         for_each_populated_zone(zone) {
                 int i;
  
-               if (skip_free_areas_zone(filter, zone))
+               if (skip_free_areas_node(filter, zone_to_nid(zone)))
                         continue;
                 show_node(zone);
                 printk("%s"
@@ -2597,7 +2844,7 @@ void __show_free_areas(unsigned int filter)
         for_each_populated_zone(zone) {
                 unsigned long nr[MAX_ORDER], flags, order, total = 0;
  
-               if (skip_free_areas_zone(filter, zone))
+               if (skip_free_areas_node(filter, zone_to_nid(zone)))
                         continue;
                 show_node(zone);
                 printk("%s: ", zone->name);
@@ -2618,11 +2865,6 @@ void __show_free_areas(unsigned int filter)
         show_swap_cache_info();
  }
  
-void show_free_areas(void)
-{
-       __show_free_areas(0);
-}
-
  static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
  {
         zoneref->zone = zone;
@@ -3182,7 +3424,7 @@ static __init_refok int __build_all_zonelists(void *data)
   * Called with zonelists_mutex held always
   * unless system_state == SYSTEM_BOOTING.
   */
-void build_all_zonelists(void *data)
+void __ref build_all_zonelists(void *data)
  {
         set_zonelist_order();
  
@@ -3293,6 +3535,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
  #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
  
  /*
+ * Check if a pageblock contains reserved pages
+ */
+static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long pfn;
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+               if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+                       return 1;
+       }
+       return 0;
+}
+
+/*
   * Mark a number of pageblocks as MIGRATE_RESERVE. The number
   * of blocks reserved is based on min_wmark_pages(zone). The memory within
   * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3301,14 +3557,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
   */
  static void setup_zone_migrate_reserve(struct zone *zone)
  {
-       unsigned long start_pfn, pfn, end_pfn;
+       unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
         struct page *page;
         unsigned long block_migratetype;
         int reserve;
  
-       /* Get the start pfn, end pfn and the number of blocks to reserve */
+       /*
+        * Get the start pfn, end pfn and the number of blocks to reserve.
+        * We have to be careful to be aligned to pageblock_nr_pages to
+        * make sure that we always check pfn_valid for the first page in
+        * the block.
+        */
         start_pfn = zone->zone_start_pfn;
         end_pfn = start_pfn + zone->spanned_pages;
+       start_pfn = roundup(start_pfn, pageblock_nr_pages);
         reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                         pageblock_order;
  
@@ -3330,24 +3592,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                 if (page_to_nid(page) != zone_to_nid(zone))
                         continue;
  
-               /* Blocks with reserved pages will never free, skip them. */
-               if (PageReserved(page))
-                       continue;
-
                 block_migratetype = get_pageblock_migratetype(page);
  
-               /* If this block is reserved, account for it */
-               if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-                       reserve--;
-                       continue;
-               }
+               /* Only test what is necessary when the reserves are not met */
+               if (reserve > 0) {
+                       /*
+                        * Blocks with reserved pages will never free, skip
+                        * them.
+                        */
+                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+                       if (pageblock_is_reserved(pfn, block_end_pfn))
+                               continue;
  
-               /* Suitable for reserving if this block is movable */
-               if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-                       set_pageblock_migratetype(page, MIGRATE_RESERVE);
-                       move_freepages_block(zone, page, MIGRATE_RESERVE);
-                       reserve--;
-                       continue;
+                       /* If this block is reserved, account for it */
+                       if (block_migratetype == MIGRATE_RESERVE) {
+                               reserve--;
+                               continue;
+                       }
+
+                       /* Suitable for reserving if this block is movable */
+                       if (block_migratetype == MIGRATE_MOVABLE) {
+                               set_pageblock_migratetype(page,
+                                                       MIGRATE_RESERVE);
+                               move_freepages_block(zone, page,
+                                                       MIGRATE_RESERVE);
+                               reserve--;
+                               continue;
+                       }
                 }
  
                 /*
@@ -3520,7 +3791,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
                 pcp->batch = PAGE_SHIFT * 8;
  }
  
-static __meminit void setup_zone_pageset(struct zone *zone)
+static void setup_zone_pageset(struct zone *zone)
  {
         int cpu;
  
@@ -3570,7 +3841,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
  
         if (!slab_is_available()) {
                 zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node(pgdat, alloc_size);
+                       alloc_bootmem_node_nopanic(pgdat, alloc_size);
         } else {
                 /*
                  * This case means that a zone whose size was 0 gets new memory
@@ -3659,35 +3930,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
         return 0;
  }
  
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-/*
- * Basic iterator support. Return the first range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns first region regardless of node
- */
-static int __meminit first_active_region_index_in_nid(int nid)
-{
-       int i;
-
-       for (i = 0; i < nr_nodemap_entries; i++)
-               if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
-                       return i;
-
-       return -1;
-}
-
-/*
- * Basic iterator support. Return the next active range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns next region regardless of node
- */
-static int __meminit next_active_region_index_in_nid(int index, int nid)
-{
-       for (index = index + 1; index < nr_nodemap_entries; index++)
-               if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
-                       return index;
-
-       return -1;
-}
-
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
  #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
  /*
   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -3697,15 +3940,12 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
   */
  int __meminit __early_pfn_to_nid(unsigned long pfn)
  {
-       int i;
-
-       for (i = 0; i < nr_nodemap_entries; i++) {
-               unsigned long start_pfn = early_node_map[i].start_pfn;
-               unsigned long end_pfn = early_node_map[i].end_pfn;
+       unsigned long start_pfn, end_pfn;
+       int i, nid;
  
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
                 if (start_pfn <= pfn && pfn < end_pfn)
-                       return early_node_map[i].nid;
-       }
+                       return nid;
         /* This is a memory hole */
         return -1;
  }
@@ -3734,11 +3974,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
  }
  #endif
  
-/* Basic iterator support to walk early_node_map[] */
-#define for_each_active_range_index_in_nid(i, nid) \
-       for (i = first_active_region_index_in_nid(nid); i != -1; \
-                               i = next_active_region_index_in_nid(i, nid))
-
  /**
   * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -3748,122 +3983,22 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
   * add_active_ranges() contain no holes and may be freed, this
   * this function may be used instead of calling free_bootmem() manually.
   */
-void __init free_bootmem_with_active_regions(int nid,
-                                               unsigned long max_low_pfn)
-{
-       int i;
-
-       for_each_active_range_index_in_nid(i, nid) {
-               unsigned long size_pages = 0;
-               unsigned long end_pfn = early_node_map[i].end_pfn;
-
-               if (early_node_map[i].start_pfn >= max_low_pfn)
-                       continue;
-
-               if (end_pfn > max_low_pfn)
-                       end_pfn = max_low_pfn;
-
-               size_pages = end_pfn - early_node_map[i].start_pfn;
-               free_bootmem_node(NODE_DATA(early_node_map[i].nid),
-                               PFN_PHYS(early_node_map[i].start_pfn),
-                               size_pages << PAGE_SHIFT);
-       }
-}
-
-#ifdef CONFIG_HAVE_MEMBLOCK
-/*
- * Basic iterator support. Return the last range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns last region regardless of node
- */
-static int __meminit last_active_region_index_in_nid(int nid)
-{
-       int i;
-
-       for (i = nr_nodemap_entries - 1; i >= 0; i--)
-               if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
-                       return i;
-
-       return -1;
-}
-
-/*
- * Basic iterator support. Return the previous active range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns next region regardless of node
- */
-static int __meminit previous_active_region_index_in_nid(int index, int nid)
+void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
  {
-       for (index = index - 1; index >= 0; index--)
-               if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
-                       return index;
-
-       return -1;
-}
-
-#define for_each_active_range_index_in_nid_reverse(i, nid) \
-       for (i = last_active_region_index_in_nid(nid); i != -1; \
-                               i = previous_active_region_index_in_nid(i, nid))
-
-u64 __init find_memory_core_early(int nid, u64 size, u64 align,
-                                       u64 goal, u64 limit)
-{
-       int i;
-
-       /* Need to go over early_node_map to find out good range for node */
-       for_each_active_range_index_in_nid_reverse(i, nid) {
-               u64 addr;
-               u64 ei_start, ei_last;
-               u64 final_start, final_end;
-
-               ei_last = early_node_map[i].end_pfn;
-               ei_last <<= PAGE_SHIFT;
-               ei_start = early_node_map[i].start_pfn;
-               ei_start <<= PAGE_SHIFT;
-
-               final_start = max(ei_start, goal);
-               final_end = min(ei_last, limit);
-
-               if (final_start >= final_end)
-                       continue;
-
-               addr = memblock_find_in_range(final_start, final_end, size, align);
-
-               if (addr == MEMBLOCK_ERROR)
-                       continue;
-
-               return addr;
-       }
-
-       return MEMBLOCK_ERROR;
-}
-#endif
+       unsigned long start_pfn, end_pfn;
+       int i, this_nid;
  
-int __init add_from_early_node_map(struct range *range, int az,
-                                  int nr_range, int nid)
-{
-       int i;
-       u64 start, end;
+       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
+               start_pfn = min(start_pfn, max_low_pfn);
+               end_pfn = min(end_pfn, max_low_pfn);
  
-       /* need to go over early_node_map to find out good range for node */
-       for_each_active_range_index_in_nid(i, nid) {
-               start = early_node_map[i].start_pfn;
-               end = early_node_map[i].end_pfn;
-               nr_range = add_range(range, az, nr_range, start, end);
+               if (start_pfn < end_pfn)
+                       free_bootmem_node(NODE_DATA(this_nid),
+                                         PFN_PHYS(start_pfn),
+                                         (end_pfn - start_pfn) << PAGE_SHIFT);
         }
-       return nr_range;
  }
  
-void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
-{
-       int i;
-       int ret;
-
-       for_each_active_range_index_in_nid(i, nid) {
-               ret = work_fn(early_node_map[i].start_pfn,
-                             early_node_map[i].end_pfn, data);
-               if (ret)
-                       break;
-       }
-}
  /**
   * sparse_memory_present_with_active_regions - Call memory_present for each active range
   * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -3874,12 +4009,11 @@ void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
   */
  void __init sparse_memory_present_with_active_regions(int nid)
  {
-       int i;
+       unsigned long start_pfn, end_pfn;
+       int i, this_nid;
  
-       for_each_active_range_index_in_nid(i, nid)
-               memory_present(early_node_map[i].nid,
-                               early_node_map[i].start_pfn,
-                               early_node_map[i].end_pfn);
+       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
+               memory_present(this_nid, start_pfn, end_pfn);
  }
  
  /**
@@ -3896,13 +4030,15 @@ void __init sparse_memory_present_with_active_regions(int nid)
  void __meminit get_pfn_range_for_nid(unsigned int nid,
                         unsigned long *start_pfn, unsigned long *end_pfn)
  {
+       unsigned long this_start_pfn, this_end_pfn;
         int i;
+
         *start_pfn = -1UL;
         *end_pfn = 0;
  
-       for_each_active_range_index_in_nid(i, nid) {
-               *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
-               *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+       for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+               *start_pfn = min(*start_pfn, this_start_pfn);
+               *end_pfn = max(*end_pfn, this_end_pfn);
         }
  
         if (*start_pfn == -1UL)
@@ -3932,7 +4068,7 @@ static void __init find_usable_zone_for_movable(void)
  
  /*
   * The zone ranges provided by the architecture do not include ZONE_MOVABLE
- * because it is sized independant of architecture. Unlike the other zones,
+ * because it is sized independent of architecture. Unlike the other zones,
   * the starting point for ZONE_MOVABLE is not fixed. It may be different
   * in each node depending on the size of each node and how evenly kernelcore
   * is distributed. This helper function adjusts the zone ranges
@@ -4005,46 +4141,16 @@ unsigned long __meminit __absent_pages_in_range(int nid,
                                 unsigned long range_start_pfn,
                                 unsigned long range_end_pfn)
  {
-       int i = 0;
-       unsigned long prev_end_pfn = 0, hole_pages = 0;
-       unsigned long start_pfn;
-
-       /* Find the end_pfn of the first active range of pfns in the node */
-       i = first_active_region_index_in_nid(nid);
-       if (i == -1)
-               return 0;
-
-       prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
-
-       /* Account for ranges before physical memory on this node */
-       if (early_node_map[i].start_pfn > range_start_pfn)
-               hole_pages = prev_end_pfn - range_start_pfn;
-
-       /* Find all holes for the zone within the node */
-       for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
-
-               /* No need to continue if prev_end_pfn is outside the zone */
-               if (prev_end_pfn >= range_end_pfn)
-                       break;
-
-               /* Make sure the end of the zone is not within the hole */
-               start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
-               prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+       unsigned long nr_absent = range_end_pfn - range_start_pfn;
+       unsigned long start_pfn, end_pfn;
+       int i;
  
-               /* Update the hole size cound and move on */
-               if (start_pfn > range_start_pfn) {
-                       BUG_ON(prev_end_pfn > start_pfn);
-                       hole_pages += start_pfn - prev_end_pfn;
-               }
-               prev_end_pfn = early_node_map[i].end_pfn;
+       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+               start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+               end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+               nr_absent -= end_pfn - start_pfn;
         }
-
-       /* Account for ranges past physical memory on this node */
-       if (range_end_pfn > prev_end_pfn)
-               hole_pages += range_end_pfn -
-                               max(range_start_pfn, prev_end_pfn);
-
-       return hole_pages;
+       return nr_absent;
  }
  
  /**
@@ -4065,14 +4171,14 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
                                         unsigned long zone_type,
                                         unsigned long *ignored)
  {
+       unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+       unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
         unsigned long node_start_pfn, node_end_pfn;
         unsigned long zone_start_pfn, zone_end_pfn;
  
         get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
-       zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
-                                                       node_start_pfn);
-       zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
-                                                       node_end_pfn);
+       zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+       zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
  
         adjust_zone_range_for_zone_movable(nid, zone_type,
                         node_start_pfn, node_end_pfn,
@@ -4080,7 +4186,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
         return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
  }
  
-#else
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
  static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                         unsigned long zone_type,
                                         unsigned long *zones_size)
@@ -4098,7 +4204,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
         return zholes_size[zone_type];
  }
  
-#endif
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
  
  static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
                 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4147,7 +4253,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
         unsigned long usemapsize = usemap_size(zonesize);
         zone->pageblock_flags = NULL;
         if (usemapsize)
-               zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
+               zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
+                                                                  usemapsize);
  }
  #else
  static inline void setup_usemap(struct pglist_data *pgdat,
@@ -4217,7 +4324,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
         for (j = 0; j < MAX_NR_ZONES; j++) {
                 struct zone *zone = pgdat->node_zones + j;
                 unsigned long size, realsize, memmap_pages;
-               enum lru_list l;
+               enum lru_list lru;
  
                 size = zone_spanned_pages_in_node(nid, j, zones_size);
                 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4267,10 +4374,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                 zone->zone_pgdat = pgdat;
  
                 zone_pcp_init(zone);
-               for_each_lru(l) {
-                       INIT_LIST_HEAD(&zone->lru[l].list);
-                       zone->reclaim_stat.nr_saved_scan[l] = 0;
-               }
+               for_each_lru(lru)
+                       INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
                 zone->reclaim_stat.recent_rotated[0] = 0;
                 zone->reclaim_stat.recent_rotated[1] = 0;
                 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4313,7 +4418,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                 size =  (end - start) * sizeof(struct page);
                 map = alloc_remap(pgdat->node_id, size);
                 if (!map)
-                       map = alloc_bootmem_node(pgdat, size);
+                       map = alloc_bootmem_node_nopanic(pgdat, size);
                 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
         }
  #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4322,10 +4427,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
          */
         if (pgdat == NODE_DATA(0)) {
                 mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
                 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
                         mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
         }
  #endif
  #endif /* CONFIG_FLAT_NODE_MEM_MAP */
@@ -4350,7 +4455,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
         free_area_init_core(pgdat, zones_size, zholes_size);
  }
  
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
  
  #if MAX_NUMNODES > 1
  /*
@@ -4372,178 +4477,64 @@ static inline void setup_nr_node_ids(void)
  #endif
  
  /**
- * add_active_range - Register a range of PFNs backed by physical memory
- * @nid: The node ID the range resides on
- * @start_pfn: The start PFN of the available physical memory
- * @end_pfn: The end PFN of the available physical memory
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the nodes
+ * are shifted by 256MiB, it would indicate 256MiB.  Note that if only the
+ * last node is shifted, 1GiB is enough and this function will indicate so.
   *
- * These ranges are stored in an early_node_map[] and later used by
- * free_area_init_nodes() to calculate zone sizes and holes. If the
- * range spans a memory hole, it is up to the architecture to ensure
- * the memory is not freed by the bootmem allocator. If possible
- * the range being registered will be merged with existing ranges.
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
   */
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
-                                               unsigned long end_pfn)
+unsigned long __init node_map_pfn_alignment(void)
  {
-       int i;
-
-       mminit_dprintk(MMINIT_TRACE, "memory_register",
-                       "Entering add_active_range(%d, %#lx, %#lx) "
-                       "%d entries of %d used\n",
-                       nid, start_pfn, end_pfn,
-                       nr_nodemap_entries, MAX_ACTIVE_REGIONS);
-
-       mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
+       unsigned long accl_mask = 0, last_end = 0;
+       unsigned long start, end, mask;
+       int last_nid = -1;      /* negative until a range has been seen */
+       int i, nid;
  
-       /* Merge with existing active regions if possible */
-       for (i = 0; i < nr_nodemap_entries; i++) {
-               if (early_node_map[i].nid != nid)
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
+               if (!start || last_nid < 0 || last_nid == nid) {
+                       last_nid = nid;
+                       last_end = end;
                         continue;
-
-               /* Skip if an existing region covers this new one */
-               if (start_pfn >= early_node_map[i].start_pfn &&
-                               end_pfn <= early_node_map[i].end_pfn)
-                       return;
-
-               /* Merge forward if suitable */
-               if (start_pfn <= early_node_map[i].end_pfn &&
-                               end_pfn > early_node_map[i].end_pfn) {
-                       early_node_map[i].end_pfn = end_pfn;
-                       return;
                 }
  
-               /* Merge backward if suitable */
-               if (start_pfn < early_node_map[i].start_pfn &&
-                               end_pfn >= early_node_map[i].start_pfn) {
-                       early_node_map[i].start_pfn = start_pfn;
-                       return;
-               }
-       }
+               /*
+                * Start with the finest mask that pin-points the start pfn
+                * (1UL: its lowest set bit may exceed bit 30), then coarsen it
+                * while the result still separates this node from the last.
+                */
+               mask = ~((1UL << __ffs(start)) - 1);
+               while (mask && last_end <= (start & (mask << 1)))
+                       mask <<= 1;
  
-       /* Check that early_node_map is large enough */
-       if (i >= MAX_ACTIVE_REGIONS) {
-               printk(KERN_CRIT "More than %d memory regions, truncating\n",
-                                                       MAX_ACTIVE_REGIONS);
-               return;
+               /* accumulate all internode masks */
+               accl_mask |= mask;
         }
  
-       early_node_map[i].nid = nid;
-       early_node_map[i].start_pfn = start_pfn;
-       early_node_map[i].end_pfn = end_pfn;
-       nr_nodemap_entries = i + 1;
-}
-
-/**
- * remove_active_range - Shrink an existing registered range of PFNs
- * @nid: The node id the range is on that should be shrunk
- * @start_pfn: The new PFN of the range
- * @end_pfn: The new PFN of the range
- *
- * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept near the end physical page range that has already been
- * registered. This function allows an arch to shrink an existing registered
- * range.
- */
-void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
-                               unsigned long end_pfn)
-{
-       int i, j;
-       int removed = 0;
-
-       printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
-                         nid, start_pfn, end_pfn);
-
-       /* Find the old active region end and shrink */
-       for_each_active_range_index_in_nid(i, nid) {
-               if (early_node_map[i].start_pfn >= start_pfn &&
-                   early_node_map[i].end_pfn <= end_pfn) {
-                       /* clear it */
-                       early_node_map[i].start_pfn = 0;
-                       early_node_map[i].end_pfn = 0;
-                       removed = 1;
-                       continue;
-               }
-               if (early_node_map[i].start_pfn < start_pfn &&
-                   early_node_map[i].end_pfn > start_pfn) {
-                       unsigned long temp_end_pfn = early_node_map[i].end_pfn;
-                       early_node_map[i].end_pfn = start_pfn;
-                       if (temp_end_pfn > end_pfn)
-                               add_active_range(nid, end_pfn, temp_end_pfn);
-                       continue;
-               }
-               if (early_node_map[i].start_pfn >= start_pfn &&
-                   early_node_map[i].end_pfn > end_pfn &&
-                   early_node_map[i].start_pfn < end_pfn) {
-                       early_node_map[i].start_pfn = end_pfn;
-                       continue;
-               }
-       }
-
-       if (!removed)
-               return;
-
-       /* remove the blank ones */
-       for (i = nr_nodemap_entries - 1; i > 0; i--) {
-               if (early_node_map[i].nid != nid)
-                       continue;
-               if (early_node_map[i].end_pfn)
-                       continue;
-               /* we found it, get rid of it */
-               for (j = i; j < nr_nodemap_entries - 1; j++)
-                       memcpy(&early_node_map[j], &early_node_map[j+1],
-                               sizeof(early_node_map[j]));
-               j = nr_nodemap_entries - 1;
-               memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
-               nr_nodemap_entries--;
-       }
-}
-
-/**
- * remove_all_active_ranges - Remove all currently registered regions
- *
- * During discovery, it may be found that a table like SRAT is invalid
- * and an alternative discovery method must be used. This function removes
- * all currently registered regions.
- */
-void __init remove_all_active_ranges(void)
-{
-       memset(early_node_map, 0, sizeof(early_node_map));
-       nr_nodemap_entries = 0;
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
-       struct node_active_region *arange = (struct node_active_region *)a;
-       struct node_active_region *brange = (struct node_active_region *)b;
-
-       /* Done this way to avoid overflows */
-       if (arange->start_pfn > brange->start_pfn)
-               return 1;
-       if (arange->start_pfn < brange->start_pfn)
-               return -1;
-
-       return 0;
-}
-
-/* sort the node_map by start_pfn */
-void __init sort_node_map(void)
-{
-       sort(early_node_map, (size_t)nr_nodemap_entries,
-                       sizeof(struct node_active_region),
-                       cmp_node_active_region, NULL);
+       /* convert mask to number of pages */
+       return ~accl_mask + 1;
  }
  
  /* Find the lowest pfn for a node */
  static unsigned long __init find_min_pfn_for_node(int nid)
  {
-       int i;
         unsigned long min_pfn = ULONG_MAX;
+       unsigned long start_pfn;
+       int i;
  
-       /* Assuming a sorted map, the first range found has the starting pfn */
-       for_each_active_range_index_in_nid(i, nid)
-               min_pfn = min(min_pfn, early_node_map[i].start_pfn);
+       for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
+               min_pfn = min(min_pfn, start_pfn);
  
         if (min_pfn == ULONG_MAX) {
                 printk(KERN_WARNING
@@ -4572,15 +4563,16 @@ unsigned long __init find_min_pfn_with_active_regions(void)
   */
  static unsigned long __init early_calculate_totalpages(void)
  {
-       int i;
         unsigned long totalpages = 0;
+       unsigned long start_pfn, end_pfn;       /* filled in per range */
+       int i, nid;
+
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+               unsigned long pages = end_pfn - start_pfn; /* range size */
  
-       for (i = 0; i < nr_nodemap_entries; i++) {
-               unsigned long pages = early_node_map[i].end_pfn -
-                                               early_node_map[i].start_pfn;
                 totalpages += pages;
                 if (pages)
-                       node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
+                       node_set_state(nid, N_HIGH_MEMORY); /* non-empty */
         }
         return totalpages;
  }
@@ -4591,7 +4583,7 @@ static unsigned long __init early_calculate_totalpages(void)
   * memory. When they don't, some nodes will have more kernelcore than
   * others
   */
-static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+static void __init find_zone_movable_pfns_for_nodes(void)
  {
         int i, nid;
         unsigned long usable_startpfn;
@@ -4635,6 +4627,8 @@ restart:
         /* Spread kernelcore memory as evenly as possible throughout nodes */
         kernelcore_node = required_kernelcore / usable_nodes;
         for_each_node_state(nid, N_HIGH_MEMORY) {
+               unsigned long start_pfn, end_pfn;
+
                 /*
                  * Recalculate kernelcore_node if the division per node
                  * now exceeds what is necessary to satisfy the requested
@@ -4651,13 +4645,10 @@ restart:
                 kernelcore_remaining = kernelcore_node;
  
                 /* Go through each range of PFNs within this node */
-               for_each_active_range_index_in_nid(i, nid) {
-                       unsigned long start_pfn, end_pfn;
+               for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
                         unsigned long size_pages;
  
-                       start_pfn = max(early_node_map[i].start_pfn,
-                                               zone_movable_pfn[nid]);
-                       end_pfn = early_node_map[i].end_pfn;
+                       start_pfn = max(start_pfn, zone_movable_pfn[nid]);
                         if (start_pfn >= end_pfn)
                                 continue;
  
@@ -4738,8 +4729,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
  
         for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
                 struct zone *zone = &pgdat->node_zones[zone_type];
-               if (zone->present_pages)
+               if (zone->present_pages) {
                         node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+                       break;
+               }
         }
  #endif
  }
@@ -4759,11 +4752,8 @@ static void check_for_regular_memory(pg_data_t *pgdat)
   */
  void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  {
-       unsigned long nid;
-       int i;
-
-       /* Sort early_node_map as initialisation assumes it is sorted */
-       sort_node_map();
+       unsigned long start_pfn, end_pfn;
+       int i, nid;
  
         /* Record where the zone boundaries are */
         memset(arch_zone_lowest_possible_pfn, 0,
@@ -4785,7 +4775,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
  
         /* Find the PFNs that ZONE_MOVABLE begins at in each node */
         memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
-       find_zone_movable_pfns_for_nodes(zone_movable_pfn);
+       find_zone_movable_pfns_for_nodes();
  
         /* Print out the zone ranges */
         printk("Zone PFN ranges:\n");
@@ -4810,11 +4800,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
         }
  
         /* Print out the early_node_map[] */
-       printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
-       for (i = 0; i < nr_nodemap_entries; i++)
-               printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
-                                               early_node_map[i].start_pfn,
-                                               early_node_map[i].end_pfn);
+       printk("Early memory PFN ranges\n");
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+               printk("  %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
  
         /* Initialise every node */
         mminit_verify_pageflags_layout();
@@ -4867,7 +4855,7 @@ static int __init cmdline_parse_movablecore(char *p)
  early_param("kernelcore", cmdline_parse_kernelcore);
  early_param("movablecore", cmdline_parse_movablecore);
  
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
  
  /**
   * set_dma_reserve - set the specified number of pages reserved in the first zone
@@ -4897,6 +4885,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
         int cpu = (unsigned long)hcpu;
  
         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+               lru_add_drain_cpu(cpu);
                 drain_pages(cpu);
  
                 /*
@@ -4951,8 +4940,19 @@ static void calculate_totalreserve_pages(void)
                         if (max > zone->present_pages)
                                 max = zone->present_pages;
                         reserve_pages += max;
+                       /*
+                        * Lowmem reserves are not available to
+                        * GFP_HIGHUSER page cache allocations and
+                        * kswapd tries to balance zones to their high
+                        * watermark.  As a result, neither should be
+                        * regarded as dirtyable memory, to prevent a
+                        * situation where reclaim has to clean pages
+                        * in order to balance the zones.
+                        */
+                       zone->dirty_balance_reserve = max;
                 }
         }
+       dirty_balance_reserve = reserve_pages;
         totalreserve_pages = reserve_pages;
  }
  
@@ -5053,6 +5053,22 @@ void setup_per_zone_wmarks(void)
                 spin_unlock_irqrestore(&zone->lock, flags);
         }
  
+#ifdef CONFIG_XEN
+       for_each_populated_zone(zone) {
+               unsigned int cpu;
+
+               for_each_online_cpu(cpu) {
+                       unsigned long high;
+
+                       high = percpu_pagelist_fraction
+                              ? zone->present_pages / percpu_pagelist_fraction
+                              : 5 * zone_batchsize(zone);
+                       setup_pagelist_highmark(
+                               per_cpu_ptr(zone->pageset, cpu), high);
+               }
+       }
+#endif
+
         /* update totalreserve_pages */
         calculate_totalreserve_pages();
  }
@@ -5078,7 +5094,7 @@ void setup_per_zone_wmarks(void)
   *    1TB     101        10GB
   *   10TB     320        32GB
   */
-void calculate_zone_inactive_ratio(struct zone *zone)
+static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
  {
         unsigned int gb, ratio;
  
@@ -5092,7 +5108,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
         zone->inactive_ratio = ratio;
  }
  
-static void __init setup_per_zone_inactive_ratio(void)
+static void __meminit setup_per_zone_inactive_ratio(void)
  {
         struct zone *zone;
  
@@ -5124,7 +5140,7 @@ static void __init setup_per_zone_inactive_ratio(void)
   * 8192MB:     11584k
   * 16384MB:    16384k
   */
-static int __init init_per_zone_wmark_min(void)
+int __meminit init_per_zone_wmark_min(void)
  {
         unsigned long lowmem_kbytes;
  
@@ -5136,6 +5152,7 @@ static int __init init_per_zone_wmark_min(void)
         if (min_free_kbytes > 65536)
                 min_free_kbytes = 65536;
         setup_per_zone_wmarks();
+       refresh_zone_stat_thresholds();
         setup_per_zone_lowmem_reserve();
         setup_per_zone_inactive_ratio();
         return 0;
@@ -5221,7 +5238,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
         int ret;
  
         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-       if (!write || (ret == -EINVAL))
+       if (!write || (ret < 0))
                 return ret;
         for_each_populated_zone(zone) {
                 for_each_possible_cpu(cpu) {
@@ -5298,6 +5315,7 @@ void *__init alloc_large_system_hash(const char *tablename,
                 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
                 do_div(max, bucketsize);
         }
+       max = min(max, 0x80000000ULL);
  
         if (numentries > max)
                 numentries = max;
@@ -5475,7 +5493,25 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
  
  bool is_pageblock_removable_nolock(struct page *page)
  {
-       struct zone *zone = page_zone(page);
+       struct zone *zone;
+       unsigned long pfn;
+
+       /*
+        * We have to be careful here because we are iterating over memory
+        * sections, which are not zone aware, so the pfn may fall outside
+        * the zone while still being within the section.  The node must be
+        * checked first as well: if it is offline, its NODE_DATA will be
+        * NULL - see page_zone() - so only look up the zone afterwards.
+        */
+       if (!node_online(page_to_nid(page)))
+               return false;   /* page belongs to an offline node */
+
+       zone = page_zone(page);
+       pfn = page_to_pfn(page);
+       if (zone->zone_start_pfn > pfn ||
+                       zone->zone_start_pfn + zone->spanned_pages <= pfn)
+               return false;   /* pfn lies outside the zone's span */
+
         return __count_immobile_pages(zone, page, 0);
  }
  
@@ -5486,10 +5522,8 @@ int set_migratetype_isolate(struct page *page)
         struct memory_isolate_notify arg;
         int notifier_ret;
         int ret = -EBUSY;
-       int zone_idx;
  
         zone = page_zone(page);
-       zone_idx = zone_idx(zone);
  
         spin_lock_irqsave(&zone->lock, flags);