#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
- if (!down_read_trylock(&shrinker_rwsem))
- return 1; /* Assume we'll be able to shrink next time */
+ if (!down_read_trylock(&shrinker_rwsem)) {
+ /* Assume we'll be able to shrink next time */
+ ret = 1;
+ goto out;
+ }
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
+out:
+ cond_resched();
return ret;
}
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
- nr_taken++;
+ /* Account for cursor_page, the page that was actually isolated */
+ nr_taken += hpage_nr_pages(cursor_page);
nr_lumpy_taken++;
if (PageDirty(cursor_page))
nr_lumpy_dirty++;
struct page *page;
list_for_each_entry(page, page_list, lru) {
+ int numpages = hpage_nr_pages(page);
lru = page_lru_base_type(page);
if (PageActive(page)) {
lru += LRU_ACTIVE;
ClearPageActive(page);
- nr_active++;
+ nr_active += numpages;
}
if (count)
- count[lru]++;
+ count[lru] += numpages;
}
return nr_active;
add_page_to_lru_list(zone, page, lru);
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
+ int numpages = hpage_nr_pages(page);
+ reclaim_stat->recent_rotated[file] += numpages;
}
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
- pgmoved++;
+ pgmoved += hpage_nr_pages(page);
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
}
if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
- nr_rotated++;
+ nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
return false;
- /*
- * If we failed to reclaim and have scanned the full list, stop.
- * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
- * faster but obviously would be less likely to succeed
- * allocation. If this is desirable, use GFP_REPEAT to decide
- * if both reclaimed and scanned should be checked or just
- * reclaimed
- */
- if (!nr_reclaimed && !nr_scanned)
- return false;
+ /* Consider stopping depending on scan and reclaim activity */
+ if (sc->gfp_mask & __GFP_REPEAT) {
+ /*
+ * For __GFP_REPEAT allocations, stop reclaiming if the
+ * full LRU list has been scanned and we are still failing
+ * to reclaim pages. This full LRU scan is potentially
+ * expensive but a __GFP_REPEAT caller really wants to succeed
+ */
+ if (!nr_reclaimed && !nr_scanned)
+ return false;
+ } else {
+ /*
+ * For non-__GFP_REPEAT allocations which can presumably
+ * fail without consequence, stop if we failed to reclaim
+ * any pages from the last SWAP_CLUSTER_MAX number of
+ * pages that were scanned. This will return to the
+ * caller faster at the risk that reclaim/compaction and
+ * the resulting allocation attempt fail
+ */
+ if (!nr_reclaimed)
+ return false;
+ }
/*
* If we have not reclaimed enough pages for compaction and the
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list l;
- unsigned long nr_reclaimed;
+ unsigned long nr_reclaimed, nr_scanned;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
- unsigned long nr_scanned = sc->nr_scanned;
restart:
nr_reclaimed = 0;
+ nr_scanned = sc->nr_scanned;
get_scan_count(zone, sc, nr, priority);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
-/*
- * As hibernation is going on, kswapd is freezed so that it can't mark
- * the zone into all_unreclaimable. It can't handle OOM during hibernation.
- * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
- */
+/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
- bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (zone_reclaimable(zone)) {
- all_unreclaimable = false;
- break;
- }
+ if (!zone->all_unreclaimable)
+ return false;
}
- return all_unreclaimable;
+ return true;
}
/*
struct zone *preferred_zone;
first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
- NULL, &preferred_zone);
+ &cpuset_current_mems_allowed,
+ &preferred_zone);
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
}
}
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
+ /*
+ * While hibernation is in progress, kswapd is frozen and so cannot
+ * mark a zone as all_unreclaimable. Bypass the all_unreclaimable
+ * check in that case.
+ */
+ if (oom_killer_disabled)
+ return 0;
+
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
}
/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+ int classzone_idx)
{
int i;
unsigned long balanced = 0;
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
- return 1;
+ return true;
/* Check the watermark levels */
- for (i = 0; i < pgdat->nr_zones; i++) {
+ for (i = 0; i <= classzone_idx; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable)
+ /*
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well if kswapd
+ * is to sleep
+ */
+ if (zone->all_unreclaimable) {
+ balanced += zone->present_pages;
continue;
+ }
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
- 0, 0))
+ i, 0))
all_zones_ok = false;
else
balanced += zone->present_pages;
* must be balanced
*/
if (order)
- return pgdat_balanced(pgdat, balanced, 0);
+ return !pgdat_balanced(pgdat, balanced, classzone_idx);
else
return !all_zones_ok;
}
* of pages is balanced across the zones.
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
- int classzone_idx)
+ int *classzone_idx)
{
int all_zones_ok;
unsigned long balanced;
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
+ unsigned long balance_gap;
if (!populated_zone(zone))
continue;
mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
/*
- * We put equal pressure on every zone, unless one
- * zone has way too many pages free already.
+ * We put equal pressure on every zone, unless
+ * one zone has way too many pages free
+ * already. The "too many pages" is defined
+ * as the high wmark plus a "gap" where the
+ * gap is either the low watermark or 1%
+ * of the zone, whichever is smaller.
*/
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages +
+ KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
if (!zone_watermark_ok_safe(zone, order,
- 8*high_wmark_pages(zone), end_zone, 0))
+ high_wmark_pages(zone) + balance_gap,
+ end_zone, 0)) {
shrink_zone(priority, zone, &sc);
- reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
- continue;
- if (nr_slab == 0 && !zone_reclaimable(zone))
- zone->all_unreclaimable = 1;
+
+ reclaim_state->reclaimed_slab = 0;
+ nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+ lru_pages);
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_scanned += sc.nr_scanned;
+
+ if (nr_slab == 0 && !zone_reclaimable(zone))
+ zone->all_unreclaimable = 1;
+ }
+
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- /*
- * Compact the zone for higher orders to reduce
- * latencies for higher-order allocations that
- * would ordinarily call try_to_compact_pages()
- */
- if (sc.order > PAGE_ALLOC_COSTLY_ORDER)
- compact_zone_order(zone, sc.order, sc.gfp_mask,
- false);
+ if (zone->all_unreclaimable) {
+ if (end_zone && end_zone == i)
+ end_zone--;
+ continue;
+ }
if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
* spectulatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
- if (i <= classzone_idx)
+ if (i <= *classzone_idx)
balanced += zone->present_pages;
}
}
- if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx)))
+ if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
* high-order: Balanced zones must make up at least 25% of the node
* for the node to be balanced
*/
- if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx)))) {
+ if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
cond_resched();
try_to_freeze();
* if another caller entered the allocator slow path while kswapd
* was awake, order will remain at the higher level
*/
+ *classzone_idx = end_zone;
return order;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (!sleeping_prematurely(pgdat, order, remaining)) {
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
*/
static int kswapd(void *p)
{
- unsigned long order;
- int classzone_idx;
+ unsigned long order, new_order;
+ int classzone_idx, new_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- order = 0;
- classzone_idx = MAX_NR_ZONES - 1;
+ order = new_order = 0;
+ classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
for ( ; ; ) {
- unsigned long new_order;
- int new_classzone_idx;
int ret;
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ /*
+ * If the last balance_pgdat was unsuccessful it's unlikely a
+ * new request of a similar or harder type will succeed soon
+ * so consider going to sleep on the basis we reclaimed at
+ */
+ if (classzone_idx >= new_classzone_idx && order == new_order) {
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
+ }
+
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, order);
+ kswapd_try_to_sleep(pgdat, order, classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
}
ret = try_to_freeze();
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- order = balance_pgdat(pgdat, order, classzone_idx);
+ order = balance_pgdat(pgdat, order, &classzone_idx);
}
}
return 0;