- Update to 3.4-rc7.
[linux-flexiantxendom0-3.2.10.git] mm/page_alloc.c
index 9404b38..e5a3966 100644
@@ -692,6 +692,13 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
        int i;
        int bad = 0;
 
+#ifdef CONFIG_XEN
+       if (PageForeign(page)) {
+               PageForeignDestructor(page, order);
+               return false;
+       }
+#endif
+
        trace_mm_page_free(page, order);
        kmemcheck_free_shadow(page, order);
 
@@ -718,6 +725,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
        unsigned long flags;
        int wasMlocked = __TestClearPageMlocked(page);
 
+#ifdef CONFIG_XEN
+       WARN_ON(PageForeign(page) && wasMlocked);
+#endif
        if (!free_pages_prepare(page, order))
                return;
 
@@ -1161,11 +1171,47 @@ void drain_local_pages(void *arg)
 }
 
 /*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * Note that this code is protected against sending an IPI to an offline
+ * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
+ * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
+ * nothing keeps CPUs from showing up after we populated the cpumask and
+ * before the call to on_each_cpu_mask().
  */
 void drain_all_pages(void)
 {
-       on_each_cpu(drain_local_pages, NULL, 1);
+       int cpu;
+       struct per_cpu_pageset *pcp;
+       struct zone *zone;
+
+       /*
+        * Allocate in the BSS so we won't require allocation in the
+        * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+        */
+       static cpumask_t cpus_with_pcps;
+
+       /*
+        * We don't care about racing with CPU hotplug events,
+        * as the offline notification will cause that CPU's
+        * pcp lists to be drained, and on_each_cpu_mask()
+        * disables preemption as part of its processing.
+        */
+       for_each_online_cpu(cpu) {
+               bool has_pcps = false;
+               for_each_populated_zone(zone) {
+                       pcp = per_cpu_ptr(zone->pageset, cpu);
+                       if (pcp->pcp.count) {
+                               has_pcps = true;
+                               break;
+                       }
+               }
+               if (has_pcps)
+                       cpumask_set_cpu(cpu, &cpus_with_pcps);
+               else
+                       cpumask_clear_cpu(cpu, &cpus_with_pcps);
+       }
+       on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
 }
 
 #ifdef CONFIG_HIBERNATION
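
The rewritten drain_all_pages() relies on on_each_cpu_mask() to IPI only the
CPUs that actually have per-cpu pages queued. As a point of reference, a
paraphrased sketch (not a verbatim copy) of that helper as introduced in
kernel/smp.c around this release:

	void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
				void *info, bool wait)
	{
		int cpu = get_cpu();	/* disables preemption */

		/* IPI every CPU in the mask except ourselves */
		smp_call_function_many(mask, func, info, wait);
		if (cpumask_test_cpu(cpu, mask)) {
			local_irq_disable();
			func(info);	/* run the callback locally as well */
			local_irq_enable();
		}
		put_cpu();
	}

The get_cpu()/put_cpu() pair is what the comment above means by "disables
preemption as part of its processing".
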
@@ -1216,6 +1262,9 @@ void free_hot_cold_page(struct page *page, int cold)
        int migratetype;
        int wasMlocked = __TestClearPageMlocked(page);
 
+#ifdef CONFIG_XEN
+       WARN_ON(PageForeign(page) && wasMlocked);
+#endif
        if (!free_pages_prepare(page, 0))
                return;
 
@@ -1874,7 +1923,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                va_end(args);
        }
 
-       pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+       if (!(gfp_mask & __GFP_WAIT)) {
+               pr_info("The following is only a harmless informational message.\n");
+               pr_info("Unless you get a _continuous_flood_ of these messages it means\n");
+               pr_info("everything is working fine. Allocations from irqs cannot be\n");
+               pr_info("perfectly reliable and the kernel is designed to handle that.\n");
+       }
+       pr_info("%s: page allocation failure. order:%d, mode:0x%x\n",
                current->comm, order, gfp_mask);
 
        dump_stack();
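
The informational wording targets callers that allocate without __GFP_WAIT and
already cope with failure. A purely hypothetical caller pattern of that kind
(function name and usage invented for illustration; needs <linux/skbuff.h>):

	/* Atomic-context allocation: GFP_ATOMIC has no __GFP_WAIT, so an
	 * occasional failure is expected and must be handled by the caller. */
	static struct sk_buff *rx_refill_skb(unsigned int len)
	{
		struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);

		if (!skb)
			return NULL;	/* drop and retry later, not a kernel bug */
		return skb;
	}
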
@@ -1968,7 +2023,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                        goto out;
        }
        /* Exhausted what can be done so it's blamo time */
-       out_of_memory(zonelist, gfp_mask, order, nodemask);
+       out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
        clear_zonelist_oom(zonelist, gfp_mask);
@@ -1981,14 +2036,20 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress,
-       bool sync_migration)
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
 {
        struct page *page;
 
-       if (!order || compaction_deferred(preferred_zone))
+       if (!order)
                return NULL;
 
+       if (compaction_deferred(preferred_zone, order)) {
+               *deferred_compaction = true;
+               return NULL;
+       }
+
        current->flags |= PF_MEMALLOC;
        *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                nodemask, sync_migration);
@@ -2006,6 +2067,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                if (page) {
                        preferred_zone->compact_considered = 0;
                        preferred_zone->compact_defer_shift = 0;
+                       if (order >= preferred_zone->compact_order_failed)
+                               preferred_zone->compact_order_failed = order + 1;
                        count_vm_event(COMPACTSUCCESS);
                        return page;
                }
@@ -2016,7 +2079,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                 * but not enough to satisfy watermarks.
                 */
                count_vm_event(COMPACTFAIL);
-               defer_compaction(preferred_zone);
+
+               /*
+                * As async compaction considers a subset of pageblocks, only
+                * defer if the failure was a sync compaction failure.
+                */
+               if (sync_migration)
+                       defer_compaction(preferred_zone, order);
 
                cond_resched();
        }
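
The deferral bookkeeping used here (compact_considered, compact_defer_shift,
compact_order_failed) implements an exponential backoff keyed by allocation
order. A paraphrased sketch of the 3.4-era logic in mm/compaction.c (details
may differ slightly from the actual tree):

	/* Do not skip compaction more than 1 << 6 == 64 times in a row */
	#define COMPACT_MAX_DEFER_SHIFT 6

	void defer_compaction(struct zone *zone, int order)
	{
		zone->compact_considered = 0;
		zone->compact_defer_shift++;

		if (order < zone->compact_order_failed)
			zone->compact_order_failed = order;

		if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
			zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
	}

	/* Returns true if compaction should be skipped this time around */
	bool compaction_deferred(struct zone *zone, int order)
	{
		unsigned long defer_limit = 1UL << zone->compact_defer_shift;

		if (order < zone->compact_order_failed)
			return false;

		if (++zone->compact_considered > defer_limit)
			zone->compact_considered = defer_limit;

		return zone->compact_considered < defer_limit;
	}

This is why the hunk above only calls defer_compaction() after a sync failure,
and why bumping compact_order_failed on success re-enables compaction for that
order and below.
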
@@ -2028,8 +2097,9 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
        nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-       int migratetype, unsigned long *did_some_progress,
-       bool sync_migration)
+       int migratetype, bool sync_migration,
+       bool *deferred_compaction,
+       unsigned long *did_some_progress)
 {
        return NULL;
 }
@@ -2179,6 +2249,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        unsigned long pages_reclaimed = 0;
        unsigned long did_some_progress;
        bool sync_migration = false;
+       bool deferred_compaction = false;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -2259,12 +2330,22 @@ rebalance:
                                        zonelist, high_zoneidx,
                                        nodemask,
                                        alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress,
-                                       sync_migration);
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
        if (page)
                goto got_pg;
        sync_migration = true;
 
+       /*
+        * If compaction is deferred for high-order allocations, it is because
+        * sync compaction recently failed. If this is the case and the caller
+        * has requested the system not be heavily disrupted, fail the
+        * allocation now instead of entering direct reclaim.
+        */
+       if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+               goto nopage;
+
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
                                        zonelist, high_zoneidx,
@@ -2282,6 +2363,10 @@ rebalance:
                if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
                        if (oom_killer_disabled)
                                goto nopage;
+                       /* Coredumps can quickly deplete all memory reserves */
+                       if ((current->flags & PF_DUMPCORE) &&
+                           !(gfp_mask & __GFP_NOFAIL))
+                               goto nopage;
                        page = __alloc_pages_may_oom(gfp_mask, order,
                                        zonelist, high_zoneidx,
                                        nodemask, preferred_zone,
@@ -2328,8 +2413,9 @@ rebalance:
                                        zonelist, high_zoneidx,
                                        nodemask,
                                        alloc_flags, preferred_zone,
-                                       migratetype, &did_some_progress,
-                                       sync_migration);
+                                       migratetype, sync_migration,
+                                       &deferred_compaction,
+                                       &did_some_progress);
                if (page)
                        goto got_pg;
        }
@@ -2353,8 +2439,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 {
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        struct zone *preferred_zone;
-       struct page *page;
+       struct page *page = NULL;
        int migratetype = allocflags_to_migratetype(gfp_mask);
+       unsigned int cpuset_mems_cookie;
 
        gfp_mask &= gfp_allowed_mask;
 
@@ -2373,15 +2460,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;
 
-       get_mems_allowed();
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+
        /* The preferred zone is used for statistics later */
        first_zones_zonelist(zonelist, high_zoneidx,
                                nodemask ? : &cpuset_current_mems_allowed,
                                &preferred_zone);
-       if (!preferred_zone) {
-               put_mems_allowed();
-               return NULL;
-       }
+       if (!preferred_zone)
+               goto out;
 
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2391,9 +2478,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
-       put_mems_allowed();
 
        trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+       /*
+        * When updating a task's mems_allowed, it is possible to race with
+        * parallel threads in such a way that an allocation can fail while
+        * the mask is being updated. If a page allocation is about to fail,
+        * check if the cpuset changed during allocation and if so, retry.
+        */
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+               goto retry_cpuset;
+
        return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
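
The cookie returned by get_mems_allowed() is a seqcount snapshot of the task's
cpuset mems_allowed, and put_mems_allowed() reports whether that mask stayed
stable across the allocation. A sketch of the cpuset.h helpers this hunk
assumes (paraphrased, not verbatim):

	static inline unsigned int get_mems_allowed(void)
	{
		return read_seqcount_begin(&current->mems_allowed_seq);
	}

	/* Returns true if mems_allowed was stable, false if the reader should retry */
	static inline bool put_mems_allowed(unsigned int seq)
	{
		return !read_seqcount_retry(&current->mems_allowed_seq, seq);
	}

Hence the retry_cpuset loop only repeats the allocation when it failed and the
mask was observed to change underneath it.
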
@@ -2607,13 +2704,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 bool skip_free_areas_node(unsigned int flags, int nid)
 {
        bool ret = false;
+       unsigned int cpuset_mems_cookie;
 
        if (!(flags & SHOW_MEM_FILTER_NODES))
                goto out;
 
-       get_mems_allowed();
-       ret = !node_isset(nid, cpuset_current_mems_allowed);
-       put_mems_allowed();
+       do {
+               cpuset_mems_cookie = get_mems_allowed();
+               ret = !node_isset(nid, cpuset_current_mems_allowed);
+       } while (!put_mems_allowed(cpuset_mems_cookie));
 out:
        return ret;
 }
@@ -3900,18 +3999,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
        }
 }
 
-int __init add_from_early_node_map(struct range *range, int az,
-                                  int nr_range, int nid)
-{
-       unsigned long start_pfn, end_pfn;
-       int i;
-
-       /* need to go over early_node_map to find out good range for node */
-       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
-               nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
-       return nr_range;
-}
-
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4237,7 +4324,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize, memmap_pages;
-               enum lru_list l;
+               enum lru_list lru;
 
                size = zone_spanned_pages_in_node(nid, j, zones_size);
                realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4287,8 +4374,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                zone->zone_pgdat = pgdat;
 
                zone_pcp_init(zone);
-               for_each_lru(l)
-                       INIT_LIST_HEAD(&zone->lruvec.lists[l]);
+               for_each_lru(lru)
+                       INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
                zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4496,7 +4583,7 @@ static unsigned long __init early_calculate_totalpages(void)
  * memory. When they don't, some nodes will have more kernelcore than
  * others
  */
-static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+static void __init find_zone_movable_pfns_for_nodes(void)
 {
        int i, nid;
        unsigned long usable_startpfn;
@@ -4688,7 +4775,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
        /* Find the PFNs that ZONE_MOVABLE begins at in each node */
        memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
-       find_zone_movable_pfns_for_nodes(zone_movable_pfn);
+       find_zone_movable_pfns_for_nodes();
 
        /* Print out the zone ranges */
        printk("Zone PFN ranges:\n");
@@ -4798,6 +4885,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
        int cpu = (unsigned long)hcpu;
 
        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+               lru_add_drain_cpu(cpu);
                drain_pages(cpu);
 
                /*
@@ -4965,6 +5053,22 @@ void setup_per_zone_wmarks(void)
                spin_unlock_irqrestore(&zone->lock, flags);
        }
 
+#ifdef CONFIG_XEN
+       for_each_populated_zone(zone) {
+               unsigned int cpu;
+
+               for_each_online_cpu(cpu) {
+                       unsigned long high;
+
+                       high = percpu_pagelist_fraction
+                              ? zone->present_pages / percpu_pagelist_fraction
+                              : 5 * zone_batchsize(zone);
+                       setup_pagelist_highmark(
+                               per_cpu_ptr(zone->pageset, cpu), high);
+               }
+       }
+#endif
+
        /* update totalreserve_pages */
        calculate_totalreserve_pages();
 }
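
As a worked example of the CONFIG_XEN high-mark formula above (numbers purely
illustrative): a zone with 262144 present pages and percpu_pagelist_fraction
set to 8 gets a per-cpu list high mark of 262144 / 8 = 32768 pages, while
leaving the fraction at 0 falls back to 5 * zone_batchsize(zone).
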
@@ -5134,7 +5238,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
        int ret;
 
        ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-       if (!write || (ret == -EINVAL))
+       if (!write || (ret < 0))
                return ret;
        for_each_populated_zone(zone) {
                for_each_possible_cpu(cpu) {
@@ -5211,6 +5315,7 @@ void *__init alloc_large_system_hash(const char *tablename,
                max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
                do_div(max, bucketsize);
        }
+       max = min(max, 0x80000000ULL);
 
        if (numentries > max)
                numentries = max;
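
For scale, 0x80000000ULL is 2^31 (2,147,483,648), so the added clamp keeps the
computed maximum number of hash-table entries at or below 2^31 on very large
memory systems, where the unclamped value could otherwise exceed the range that
callers' table-initialization loops can safely handle.
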
@@ -5388,7 +5493,25 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 
 bool is_pageblock_removable_nolock(struct page *page)
 {
-       struct zone *zone = page_zone(page);
+       struct zone *zone;
+       unsigned long pfn;
+
+       /*
+        * We have to be careful here because we are iterating over memory
+        * sections which are not zone aware so we might end up outside of
+        * the zone but still within the section.
+        * We also have to take care of the node. If the node is offline,
+        * its NODE_DATA will be NULL - see page_zone.
+        */
+       if (!node_online(page_to_nid(page)))
+               return false;
+
+       zone = page_zone(page);
+       pfn = page_to_pfn(page);
+       if (zone->zone_start_pfn > pfn ||
+                       zone->zone_start_pfn + zone->spanned_pages <= pfn)
+               return false;
+
        return __count_immobile_pages(zone, page, 0);
 }