int i;
int bad = 0;
+#ifdef CONFIG_XEN
+ if (PageForeign(page)) {
+ PageForeignDestructor(page, order);
+ return false;
+ }
+#endif
+
trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
unsigned long flags;
int wasMlocked = __TestClearPageMlocked(page);
+#ifdef CONFIG_XEN
+ WARN_ON(PageForeign(page) && wasMlocked);
+#endif
if (!free_pages_prepare(page, order))
return;
}
/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * Note that this code is protected against sending an IPI to an offline
+ * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
+ * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
+ * nothing keeps CPUs from showing up after we populated the cpumask and
+ * before the call to on_each_cpu_mask().
*/
void drain_all_pages(void)
{
- on_each_cpu(drain_local_pages, NULL, 1);
+ int cpu;
+ struct per_cpu_pageset *pcp;
+ struct zone *zone;
+
+ /*
+ * Allocate in the BSS so we won't require allocation in the
+ * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y.
+ */
+ static cpumask_t cpus_with_pcps;
+
+ /*
+ * We don't care about racing with CPU hotplug events,
+ * as the offline notification will cause the notified
+ * CPU to drain its own pcp lists, and on_each_cpu_mask()
+ * disables preemption as part of its processing.
+ */
+ for_each_online_cpu(cpu) {
+ bool has_pcps = false;
+ for_each_populated_zone(zone) {
+ pcp = per_cpu_ptr(zone->pageset, cpu);
+ if (pcp->pcp.count) {
+ has_pcps = true;
+ break;
+ }
+ }
+ if (has_pcps)
+ cpumask_set_cpu(cpu, &cpus_with_pcps);
+ else
+ cpumask_clear_cpu(cpu, &cpus_with_pcps);
+ }
+ on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
}
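
For reference, drain_all_pages() now depends on on_each_cpu_mask() meaning roughly "run the callback on every CPU set in the mask, including the local one, with preemption disabled around the local call". A minimal illustrative sketch of that assumption (not the actual kernel/smp.c implementation):

void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
		      void *info, bool wait)
{
	int cpu = get_cpu();	/* disable preemption while we work */

	/* IPI every remote CPU in the mask, optionally waiting for them. */
	smp_call_function_many(mask, func, info, wait);

	/* The local CPU is not IPIed; call the function directly. */
	if (cpumask_test_cpu(cpu, mask)) {
		local_irq_disable();
		func(info);
		local_irq_enable();
	}
	put_cpu();
}
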
#ifdef CONFIG_HIBERNATION
int migratetype;
int wasMlocked = __TestClearPageMlocked(page);
+#ifdef CONFIG_XEN
+ WARN_ON(PageForeign(page) && wasMlocked);
+#endif
if (!free_pages_prepare(page, 0))
return;
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+ if (!(gfp_mask & __GFP_WAIT)) {
+ pr_info("The following is only an harmless informational message.\n");
+ pr_info("Unless you get a _continuous_flood_ of these messages it means\n");
+ pr_info("everything is working fine. Allocations from irqs cannot be\n");
+ pr_info("perfectly reliable and the kernel is designed to handle that.\n");
+ }
+ pr_info("%s: page allocation failure. order:%d, mode:0x%x\n",
current->comm, order, gfp_mask);
dump_stack();
goto out;
}
/* Exhausted what can be done so it's blamo time */
- out_of_memory(zonelist, gfp_mask, order, nodemask);
+ out_of_memory(zonelist, gfp_mask, order, nodemask, false);
out:
clear_zonelist_oom(zonelist, gfp_mask);
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
struct page *page;
- if (!order || compaction_deferred(preferred_zone))
+ if (!order)
return NULL;
+ if (compaction_deferred(preferred_zone, order)) {
+ *deferred_compaction = true;
+ return NULL;
+ }
+
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration);
if (page) {
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
+ if (order >= preferred_zone->compact_order_failed)
+ preferred_zone->compact_order_failed = order + 1;
count_vm_event(COMPACTSUCCESS);
return page;
}
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
- defer_compaction(preferred_zone);
+
+ /*
+ * As async compaction considers a subset of pageblocks, only
+ * defer if the failure was a sync compaction failure.
+ */
+ if (sync_migration)
+ defer_compaction(preferred_zone, order);
cond_resched();
}
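
The compact_considered, compact_defer_shift and compact_order_failed fields touched above live in struct zone and are maintained by mm/compaction.c. A rough sketch of how the order-aware deferral helpers are assumed to use them (illustrative only, not the exact upstream code):

/* Record a sync compaction failure and back off exponentially. */
void defer_compaction(struct zone *zone, int order)
{
	zone->compact_considered = 0;
	zone->compact_defer_shift++;

	if (order < zone->compact_order_failed)
		zone->compact_order_failed = order;

	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

bool compaction_deferred(struct zone *zone, int order)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	/* Orders below the last failed order are still worth trying. */
	if (order < zone->compact_order_failed)
		return false;

	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;

	return zone->compact_considered < defer_limit;
}
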
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
return NULL;
}
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
+ bool deferred_compaction = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
sync_migration = true;
+ /*
+ * If compaction is deferred for high-order allocations, it is because
+ * sync compaction recently failed. If this is the case and the caller
+ * has requested the system not be heavily disrupted, fail the
+ * allocation now instead of entering direct reclaim.
+ */
+ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+ goto nopage;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_killer_disabled)
goto nopage;
+ /* Coredumps can quickly deplete all memory reserves */
+ if ((current->flags & PF_DUMPCORE) &&
+ !(gfp_mask & __GFP_NOFAIL))
+ goto nopage;
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
}
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
- struct page *page;
+ struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
+ unsigned int cpuset_mems_cookie;
gfp_mask &= gfp_allowed_mask;
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
- if (!preferred_zone) {
- put_mems_allowed();
- return NULL;
- }
+ if (!preferred_zone)
+ goto out;
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
- put_mems_allowed();
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+ /*
+ * When updating a task's mems_allowed, it is possible to race with
+ * parallel threads in such a way that an allocation can fail while
+ * the mask is being updated. If a page allocation is about to fail,
+ * check if the cpuset changed during allocation and if so, retry.
+ */
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
bool skip_free_areas_node(unsigned int flags, int nid)
{
bool ret = false;
+ unsigned int cpuset_mems_cookie;
if (!(flags & SHOW_MEM_FILTER_NODES))
goto out;
- get_mems_allowed();
- ret = !node_isset(nid, cpuset_current_mems_allowed);
- put_mems_allowed();
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+ } while (!put_mems_allowed(cpuset_mems_cookie));
out:
return ret;
}
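
Both this retry loop and the retry_cpuset path in __alloc_pages_nodemask() rely on get_mems_allowed()/put_mems_allowed() being a cookie-based pair. A minimal sketch of the assumed implementation, built on a per-task seqcount (mems_allowed_seq is the assumed field name) that the cpuset code bumps whenever it rewrites current->mems_allowed:

static inline unsigned int get_mems_allowed(void)
{
	/* Snapshot the sequence count before reading mems_allowed. */
	return read_seqcount_begin(&current->mems_allowed_seq);
}

/* Returns true if mems_allowed was stable since the matching get. */
static inline bool put_mems_allowed(unsigned int seq)
{
	return !read_seqcount_retry(&current->mems_allowed_seq, seq);
}
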
}
}
-int __init add_from_early_node_map(struct range *range, int az,
- int nr_range, int nid)
-{
- unsigned long start_pfn, end_pfn;
- int i;
-
- /* need to go over early_node_map to find out good range for node */
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
- nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
- return nr_range;
-}
-
/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
- enum lru_list l;
+ enum lru_list lru;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- for_each_lru(l)
- INIT_LIST_HEAD(&zone->lruvec.lists[l]);
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
* memory. When they don't, some nodes will have more kernelcore than
* others
*/
-static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+static void __init find_zone_movable_pfns_for_nodes(void)
{
int i, nid;
unsigned long usable_startpfn;
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
- find_zone_movable_pfns_for_nodes(zone_movable_pfn);
+ find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
printk("Zone PFN ranges:\n");
int cpu = (unsigned long)hcpu;
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ lru_add_drain_cpu(cpu);
drain_pages(cpu);
/*
spin_unlock_irqrestore(&zone->lock, flags);
}
+#ifdef CONFIG_XEN
+ for_each_populated_zone(zone) {
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ unsigned long high;
+
+ high = percpu_pagelist_fraction
+ ? zone->present_pages / percpu_pagelist_fraction
+ : 5 * zone_batchsize(zone);
+ setup_pagelist_highmark(
+ per_cpu_ptr(zone->pageset, cpu), high);
+ }
+ }
+#endif
+
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
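
For context, setup_pagelist_highmark() used in the Xen hunk above is the existing pcp-tuning helper; it is assumed to do no more than refresh pcp->high and derive a batch value from it, along the lines of this sketch:

static void setup_pagelist_highmark(struct per_cpu_pageset *p,
				    unsigned long high)
{
	struct per_cpu_pages *pcp = &p->pcp;

	pcp->high = high;
	/* Refill/free in chunks of roughly a quarter of the high mark. */
	pcp->batch = max(1UL, high / 4);
	if ((high / 4) > (PAGE_SHIFT * 8))
		pcp->batch = PAGE_SHIFT * 8;
}
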
int ret;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret == -EINVAL))
+ if (!write || (ret < 0))
return ret;
for_each_populated_zone(zone) {
for_each_possible_cpu(cpu) {
max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
do_div(max, bucketsize);
}
+ max = min(max, 0x80000000ULL);
if (numentries > max)
numentries = max;
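
As a worked example of when the new clamp matters (numbers are illustrative): with the existing rule of one sixteenth of memory divided by the bucket size, a 16 GiB machine with 8-byte buckets yields max = 2^34 / 16 / 8 = 2^27 entries, far below the 0x80000000 (2^31) cap. The cap only takes effect above roughly 256 GiB of memory for that bucket size, keeping numentries within 31 bits.
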
bool is_pageblock_removable_nolock(struct page *page)
{
- struct zone *zone = page_zone(page);
+ struct zone *zone;
+ unsigned long pfn;
+
+ /*
+ * We have to be careful here because we are iterating over memory
+ * sections which are not zone aware, so we might end up outside of
+ * the zone but still within the section.
+ * We also have to take care of the node: if the node is offline,
+ * its NODE_DATA will be NULL - see page_zone().
+ */
+ if (!node_online(page_to_nid(page)))
+ return false;
+
+ zone = page_zone(page);
+ pfn = page_to_pfn(page);
+ if (zone->zone_start_pfn > pfn ||
+ zone->zone_start_pfn + zone->spanned_pages <= pfn)
+ return false;
+
return __count_immobile_pages(zone, page, 0);
}