usb: gadget: pch_udc: Reduce redundant interrupt

[linux-flexiantxendom0.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 7037cc8..cb33d9c 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,8 @@
  #include <linux/memcontrol.h>
  #include <linux/delayacct.h>
  #include <linux/sysctl.h>
+#include <linux/oom.h>
+#include <linux/prefetch.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@ -93,8 +95,6 @@ struct scan_control {
         /* Can pages be swapped as part of reclaim? */
         int may_swap;
  
-       int swappiness;
-
         int order;
  
         /*
@@ -171,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
                                 struct scan_control *sc, enum lru_list lru)
  {
         if (!scanning_global_lru(sc))
-               return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
+               return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
+                               zone_to_nid(zone), zone_idx(zone), BIT(lru));
  
         return zone_page_state(zone, NR_LRU_BASE + lru);
  }
@@ -182,7 +183,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
   */
  void register_shrinker(struct shrinker *shrinker)
  {
-       shrinker->nr = 0;
+       atomic_long_set(&shrinker->nr_in_batch, 0);
         down_write(&shrinker_rwsem);
         list_add_tail(&shrinker->list, &shrinker_list);
         up_write(&shrinker_rwsem);
@@ -200,6 +201,14 @@ void unregister_shrinker(struct shrinker *shrinker)
  }
  EXPORT_SYMBOL(unregister_shrinker);
  
+static inline int do_shrinker_shrink(struct shrinker *shrinker,
+                                    struct shrink_control *sc,
+                                    unsigned long nr_to_scan)
+{
+       sc->nr_to_scan = nr_to_scan;
+       return (*shrinker->shrink)(shrinker, sc);
+}
+
  #define SHRINK_BATCH 128
  /*
   * Call the shrink functions to age shrinkable caches
@@ -220,67 +229,114 @@ EXPORT_SYMBOL(unregister_shrinker);
   *
   * Returns the number of slab objects which we shrunk.
   */
-unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-                       unsigned long lru_pages)
+unsigned long shrink_slab(struct shrink_control *shrink,
+                         unsigned long nr_pages_scanned,
+                         unsigned long lru_pages)
  {
         struct shrinker *shrinker;
         unsigned long ret = 0;
  
-       if (scanned == 0)
-               scanned = SWAP_CLUSTER_MAX;
+       if (nr_pages_scanned == 0)
+               nr_pages_scanned = SWAP_CLUSTER_MAX;
  
-       if (!down_read_trylock(&shrinker_rwsem))
-               return 1;       /* Assume we'll be able to shrink next time */
+       if (!down_read_trylock(&shrinker_rwsem)) {
+               /* Assume we'll be able to shrink next time */
+               ret = 1;
+               goto out;
+       }
  
         list_for_each_entry(shrinker, &shrinker_list, list) {
                 unsigned long long delta;
-               unsigned long total_scan;
-               unsigned long max_pass;
+               long total_scan;
+               long max_pass;
+               int shrink_ret = 0;
+               long nr;
+               long new_nr;
+               long batch_size = shrinker->batch ? shrinker->batch
+                                                 : SHRINK_BATCH;
+
+               max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+               if (max_pass <= 0)
+                       continue;
  
-               max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
-               delta = (4 * scanned) / shrinker->seeks;
+               /*
+                * copy the current shrinker scan count into a local variable
+                * and zero it so that other concurrent shrinker invocations
+                * don't also do this scanning work.
+                */
+               nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
+
+               total_scan = nr;
+               delta = (4 * nr_pages_scanned) / shrinker->seeks;
                 delta *= max_pass;
                 do_div(delta, lru_pages + 1);
-               shrinker->nr += delta;
-               if (shrinker->nr < 0) {
+               total_scan += delta;
+               if (total_scan < 0) {
                         printk(KERN_ERR "shrink_slab: %pF negative objects to "
                                "delete nr=%ld\n",
-                              shrinker->shrink, shrinker->nr);
-                       shrinker->nr = max_pass;
+                              shrinker->shrink, total_scan);
+                       total_scan = max_pass;
                 }
  
                 /*
+                * We need to avoid excessive windup on filesystem shrinkers
+                * due to large numbers of GFP_NOFS allocations causing the
+                * shrinkers to return -1 all the time. This results in a large
+                * nr being built up so when a shrink that can do some work
+                * comes along it empties the entire cache due to nr >>>
+                * max_pass.  This is bad for sustaining a working set in
+                * memory.
+                *
+                * Hence only allow the shrinker to scan the entire cache when
+                * a large delta change is calculated directly.
+                */
+               if (delta < max_pass / 4)
+                       total_scan = min(total_scan, max_pass / 2);
+
+               /*
                  * Avoid risking looping forever due to too large nr value:
                  * never try to free more than twice the estimate number of
                  * freeable entries.
                  */
-               if (shrinker->nr > max_pass * 2)
-                       shrinker->nr = max_pass * 2;
+               if (total_scan > max_pass * 2)
+                       total_scan = max_pass * 2;
  
-               total_scan = shrinker->nr;
-               shrinker->nr = 0;
+               trace_mm_shrink_slab_start(shrinker, shrink, nr,
+                                       nr_pages_scanned, lru_pages,
+                                       max_pass, delta, total_scan);
  
-               while (total_scan >= SHRINK_BATCH) {
-                       long this_scan = SHRINK_BATCH;
-                       int shrink_ret;
+               while (total_scan >= batch_size) {
                         int nr_before;
  
-                       nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
-                       shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
-                                                               gfp_mask);
+                       nr_before = do_shrinker_shrink(shrinker, shrink, 0);
+                       shrink_ret = do_shrinker_shrink(shrinker, shrink,
+                                                       batch_size);
                         if (shrink_ret == -1)
                                 break;
                         if (shrink_ret < nr_before)
                                 ret += nr_before - shrink_ret;
-                       count_vm_events(SLABS_SCANNED, this_scan);
-                       total_scan -= this_scan;
+                       count_vm_events(SLABS_SCANNED, batch_size);
+                       total_scan -= batch_size;
  
                         cond_resched();
                 }
  
-               shrinker->nr += total_scan;
+               /*
+                * move the unused scan count back into the shrinker in a
+                * manner that handles concurrent updates. If we exhausted the
+                * scan, there is no need to do an update.
+                */
+               if (total_scan > 0)
+                       new_nr = atomic_long_add_return(total_scan,
+                                       &shrinker->nr_in_batch);
+               else
+                       new_nr = atomic_long_read(&shrinker->nr_in_batch);
+
+               trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
         }
         up_read(&shrinker_rwsem);
+out:
+       cond_resched();
         return ret;
  }
  
@@ -358,7 +414,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
  static void handle_write_error(struct address_space *mapping,
                                 struct page *page, int error)
  {
-       lock_page_nosync(page);
+       lock_page(page);
         if (page_mapping(page) == mapping)
                 mapping_set_error(mapping, error);
         unlock_page(page);
@@ -439,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                         return PAGE_ACTIVATE;
                 }
  
-               /*
-                * Wait on writeback if requested to. This happens when
-                * direct reclaiming a large contiguous area and the
-                * first attempt to free a range of pages fails.
-                */
-               if (PageWriteback(page) &&
-                   (sc->reclaim_mode & RECLAIM_MODE_SYNC))
-                       wait_on_page_writeback(page);
-
                 if (!PageWriteback(page)) {
                         /* synchronous write or broken a_ops? */
                         ClearPageReclaim(page);
@@ -514,7 +561,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
  
                 freepage = mapping->a_ops->freepage;
  
-               __remove_from_page_cache(page);
+               __delete_from_page_cache(page);
                 spin_unlock_irq(&mapping->tree_lock);
                 mem_cgroup_uncharge_cache_page(page);
  
@@ -586,13 +633,14 @@ redo:
                 lru = LRU_UNEVICTABLE;
                 add_page_to_unevictable_list(page);
                 /*
-                * When racing with an mlock clearing (page is
-                * unlocked), make sure that if the other thread does
-                * not observe our setting of PG_lru and fails
-                * isolation, we see PG_mlocked cleared below and move
+                * When racing with an mlock or AS_UNEVICTABLE clearing
+                * (page is unlocked) make sure that if the other thread
+                * does not observe our setting of PG_lru and fails
+                * isolation/check_move_unevictable_pages,
+                * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
                  * the page back to the evictable list.
                  *
-                * The other side is TestClearPageMlocked().
+                * The other side is TestClearPageMlocked() or shmem_lock().
                  */
                 smp_mb();
         }
@@ -703,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
   */
  static unsigned long shrink_page_list(struct list_head *page_list,
                                       struct zone *zone,
-                                     struct scan_control *sc)
+                                     struct scan_control *sc,
+                                     int priority,
+                                     unsigned long *ret_nr_dirty,
+                                     unsigned long *ret_nr_writeback)
  {
         LIST_HEAD(ret_pages);
         LIST_HEAD(free_pages);
@@ -711,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
         unsigned long nr_dirty = 0;
         unsigned long nr_congested = 0;
         unsigned long nr_reclaimed = 0;
+       unsigned long nr_writeback = 0;
  
         cond_resched();
  
@@ -747,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
  
                 if (PageWriteback(page)) {
+                       nr_writeback++;
                         /*
-                        * Synchronous reclaim is performed in two passes,
-                        * first an asynchronous pass over the list to
-                        * start parallel writeback, and a second synchronous
-                        * pass to wait for the IO to complete.  Wait here
-                        * for any page for which writeback has already
-                        * started.
+                        * Synchronous reclaim cannot queue pages for
+                        * writeback due to the possibility of stack overflow
+                        * but if it encounters a page under writeback, wait
+                        * for the IO to complete.
                          */
                         if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
                             may_enter_fs)
@@ -809,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 if (PageDirty(page)) {
                         nr_dirty++;
  
+                       /*
+                        * Only kswapd can writeback filesystem pages to
+                        * avoid risk of stack overflow but do not writeback
+                        * unless under significant pressure.
+                        */
+                       if (page_is_file_cache(page) &&
+                                       (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+                               /*
+                                * Immediately reclaim when written back.
+                                * Similar in principle to deactivate_page()
+                                * except we already have the page isolated
+                                * and know it's dirty
+                                */
+                               inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+                               SetPageReclaim(page);
+
+                               goto keep_locked;
+                       }
+
                         if (references == PAGEREF_RECLAIM_CLEAN)
                                 goto keep_locked;
                         if (!may_enter_fs)
@@ -936,13 +1006,15 @@ keep_lumpy:
          * back off and wait for congestion to clear because further reclaim
          * will encounter the same problem
          */
-       if (nr_dirty == nr_congested && nr_dirty != 0)
+       if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
                 zone_set_flag(zone, ZONE_CONGESTED);
  
         free_page_list(&free_pages);
  
         list_splice(&ret_pages, page_list);
         count_vm_events(PGACTIVATE, pgactivate);
+       *ret_nr_dirty += nr_dirty;
+       *ret_nr_writeback += nr_writeback;
         return nr_reclaimed;
  }
  
@@ -956,23 +1028,27 @@ keep_lumpy:
   *
   * returns 0 on success, -ve errno on failure.
   */
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
  {
+       bool all_lru_mode;
         int ret = -EINVAL;
  
         /* Only take pages on the LRU. */
         if (!PageLRU(page))
                 return ret;
  
+       all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+               (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
         /*
          * When checking the active state, we need to be sure we are
          * dealing with comparible boolean values.  Take the logical not
          * of each.
          */
-       if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+       if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
                 return ret;
  
-       if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+       if (!all_lru_mode && !!page_is_file_cache(page) != file)
                 return ret;
  
         /*
@@ -985,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)
  
         ret = -EBUSY;
  
+       if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
+               return ret;
+
+       if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+               return ret;
+
         if (likely(get_page_unless_zero(page))) {
                 /*
                  * Be careful not to clear PageLRU until after we're
@@ -1020,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
   */
  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 struct list_head *src, struct list_head *dst,
-               unsigned long *scanned, int order, int mode, int file)
+               unsigned long *scanned, int order, isolate_mode_t mode,
+               int file)
  {
         unsigned long nr_taken = 0;
         unsigned long nr_lumpy_taken = 0;
@@ -1044,7 +1127,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 case 0:
                         list_move(&page->lru, dst);
                         mem_cgroup_del_lru(page);
-                       nr_taken++;
+                       nr_taken += hpage_nr_pages(page);
                         break;
  
                 case -EBUSY:
@@ -1065,7 +1148,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                  * surrounding the tag page.  Only take those pages of
                  * the same active state as that tag page.  We may safely
                  * round the target page pfn down to the requested order
-                * as the mem_map is guarenteed valid out to MAX_ORDER,
+                * as the mem_map is guaranteed valid out to MAX_ORDER,
                  * where that page is in a different zone we will detect
                  * it from its zone id and abort this block scan.
                  */
@@ -1102,14 +1185,26 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                         if (__isolate_lru_page(cursor_page, mode, file) == 0) {
                                 list_move(&cursor_page->lru, dst);
                                 mem_cgroup_del_lru(cursor_page);
-                               nr_taken++;
+                               nr_taken += hpage_nr_pages(page);
                                 nr_lumpy_taken++;
                                 if (PageDirty(cursor_page))
                                         nr_lumpy_dirty++;
                                 scan++;
                         } else {
-                               /* the page is freed already. */
-                               if (!page_count(cursor_page))
+                               /*
+                                * Check if the page is freed already.
+                                *
+                                * We can't use page_count() as that
+                                * requires compound_head and we don't
+                                * have a pin on the page here. If a
+                                * page is tail, we may or may not
+                                * have isolated the head, so assume
+                                * it's not free; it'd be tricky to
+                                * track the head status without a
+                                * page pin.
+                                */
+                               if (!PageTail(cursor_page) &&
+                                   !atomic_read(&cursor_page->_count))
                                         continue;
                                 break;
                         }
@@ -1133,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  static unsigned long isolate_pages_global(unsigned long nr,
                                         struct list_head *dst,
                                         unsigned long *scanned, int order,
-                                       int mode, struct zone *z,
-                                       int active, int file)
+                                       isolate_mode_t mode,
+                                       struct zone *z, int active, int file)
  {
         int lru = LRU_BASE;
         if (active)
@@ -1157,14 +1252,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
         struct page *page;
  
         list_for_each_entry(page, page_list, lru) {
+               int numpages = hpage_nr_pages(page);
                 lru = page_lru_base_type(page);
                 if (PageActive(page)) {
                         lru += LRU_ACTIVE;
                         ClearPageActive(page);
-                       nr_active++;
+                       nr_active += numpages;
                 }
                 if (count)
-                       count[lru]++;
+                       count[lru] += numpages;
         }
  
         return nr_active;
@@ -1199,13 +1295,16 @@ int isolate_lru_page(struct page *page)
  {
         int ret = -EBUSY;
  
+       VM_BUG_ON(!page_count(page));
+
         if (PageLRU(page)) {
                 struct zone *zone = page_zone(page);
  
                 spin_lock_irq(&zone->lru_lock);
-               if (PageLRU(page) && get_page_unless_zero(page)) {
+               if (PageLRU(page)) {
                         int lru = page_lru(page);
                         ret = 0;
+                       get_page(page);
                         ClearPageLRU(page);
  
                         del_page_from_lru_list(zone, page, lru);
@@ -1274,7 +1373,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
                 add_page_to_lru_list(zone, page, lru);
                 if (is_active_lru(lru)) {
                         int file = is_file_lru(lru);
-                       reclaim_stat->recent_rotated[file]++;
+                       int numpages = hpage_nr_pages(page);
+                       reclaim_stat->recent_rotated[file] += numpages;
                 }
                 if (!pagevec_add(&pvec, page)) {
                         spin_unlock_irq(&zone->lru_lock);
@@ -1321,7 +1421,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
  }
  
  /*
- * Returns true if the caller should wait to clean dirty/writeback pages.
+ * Returns true if a direct reclaim should wait on pages under writeback.
   *
   * If we are direct reclaiming for contiguous pages and we do not reclaim
   * everything in the list, try again and wait for writeback IO to complete.
@@ -1343,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
         if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
                 return false;
  
-       /* If we have relaimed everything on the isolated list, no stall */
+       /* If we have reclaimed everything on the isolated list, no stall */
         if (nr_freed == nr_taken)
                 return false;
  
@@ -1375,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
         unsigned long nr_taken;
         unsigned long nr_anon;
         unsigned long nr_file;
+       unsigned long nr_dirty = 0;
+       unsigned long nr_writeback = 0;
+       isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
  
         while (unlikely(too_many_isolated(zone, file, sc))) {
                 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1385,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
         }
  
         set_reclaim_mode(priority, sc, false);
+       if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+               reclaim_mode |= ISOLATE_ACTIVE;
+
         lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
         spin_lock_irq(&zone->lru_lock);
  
         if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, 0, file);
+               nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
                 zone->pages_scanned += nr_scanned;
                 if (current_is_kswapd())
                         __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1402,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                         __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                                nr_scanned);
         } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, sc->mem_cgroup,
-                       0, file);
+               nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone,
+                       sc->mem_cgroup, 0, file);
                 /*
                  * mem_cgroup_isolate_pages() keeps track of
                  * scanned pages on its own.
@@ -1423,12 +1529,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
  
         spin_unlock_irq(&zone->lru_lock);
  
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+       nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+                                               &nr_dirty, &nr_writeback);
  
         /* Check if we should syncronously wait for writeback */
         if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                 set_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+               nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+                                       priority, &nr_dirty, &nr_writeback);
         }
  
         local_irq_disable();
@@ -1438,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
  
         putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
  
+       /*
+        * If reclaim is isolating dirty pages under writeback, it implies
+        * that the long-lived page allocation rate is exceeding the page
+        * laundering rate. Either the global limits are not being effective
+        * at throttling processes due to the page distribution throughout
+        * zones or there is heavy usage of a slow backing device. The
+        * only option is to throttle from reclaim context which is not ideal
+        * as there is no guarantee the dirtying process is throttled in the
+        * same way balance_dirty_pages() manages.
+        *
+        * This scales the number of dirty pages that must be under writeback
+        * before throttling depending on priority. It is a simple backoff
+        * function that has the most effect in the range DEF_PRIORITY to
+        * DEF_PRIORITY-2, which is the priority at which reclaim is
+        * considered to be in trouble.
+        *
+        * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
+        * DEF_PRIORITY-1  50% must be PageWriteback
+        * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
+        * ...
+        * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+        *                     isolated page is PageWriteback
+        */
+       if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+               wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
         trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
                 zone_idx(zone),
                 nr_scanned, nr_reclaimed,
@@ -1482,7 +1616,7 @@ static void move_active_pages_to_lru(struct zone *zone,
  
                 list_move(&page->lru, &zone->lru[lru].list);
                 mem_cgroup_add_lru_list(page, lru);
-               pgmoved++;
+               pgmoved += hpage_nr_pages(page);
  
                 if (!pagevec_add(&pvec, page) || list_empty(list)) {
                         spin_unlock_irq(&zone->lru_lock);
@@ -1509,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         struct page *page;
         struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
         unsigned long nr_rotated = 0;
+       isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
  
         lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
         spin_lock_irq(&zone->lru_lock);
         if (scanning_global_lru(sc)) {
                 nr_taken = isolate_pages_global(nr_pages, &l_hold,
                                                 &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                 1, file);
                 zone->pages_scanned += pgscanned;
         } else {
                 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
                                                 &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                 sc->mem_cgroup, 1, file);
                 /*
                  * mem_cgroup_isolate_pages() keeps track of
@@ -1550,7 +1691,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                 }
  
                 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
-                       nr_rotated++;
+                       nr_rotated += hpage_nr_pages(page);
                         /*
                          * Identify referenced, file-backed active pages and
                          * give them one more trip around the active list. So
@@ -1626,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
         if (scanning_global_lru(sc))
                 low = inactive_anon_is_low_global(zone);
         else
-               low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
+               low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
         return low;
  }
  #else
@@ -1669,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
         if (scanning_global_lru(sc))
                 low = inactive_file_is_low_global(zone);
         else
-               low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+               low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
         return low;
  }
  
@@ -1696,24 +1837,11 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
         return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
  }
  
-/*
- * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
- * until we collected @swap_cluster_max pages to scan.
- */
-static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
-                                      unsigned long *nr_saved_scan)
+static int vmscan_swappiness(struct scan_control *sc)
  {
-       unsigned long nr;
-
-       *nr_saved_scan += nr_to_scan;
-       nr = *nr_saved_scan;
-
-       if (nr >= SWAP_CLUSTER_MAX)
-               *nr_saved_scan = 0;
-       else
-               nr = 0;
-
-       return nr;
+       if (scanning_global_lru(sc))
+               return vm_swappiness;
+       return mem_cgroup_swappiness(sc->mem_cgroup);
  }
  
  /*
@@ -1734,6 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         u64 fraction[2], denominator;
         enum lru_list l;
         int noswap = 0;
+       bool force_scan = false;
+
+       /*
+        * If the zone or memcg is small, nr[l] can be 0.  This
+        * results in no scanning on this priority and a potential
+        * priority drop.  Global direct reclaim can go to the next
+        * zone and tends to have no problems. Global kswapd is for
+        * zone balancing and it needs to scan a minimum amount. When
+        * reclaiming for a memcg, a priority drop can cause high
+        * latencies, so it's better to scan a minimum amount there as
+        * well.
+        */
+       if (scanning_global_lru(sc) && current_is_kswapd())
+               force_scan = true;
+       if (!scanning_global_lru(sc))
+               force_scan = true;
  
         /* If we have no swap space, do not bother scanning anon pages. */
         if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1765,8 +1909,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
          * With swappiness at 100, anonymous and file have the same priority.
          * This scanning priority is essentially the inverse of IO cost.
          */
-       anon_prio = sc->swappiness;
-       file_prio = 200 - sc->swappiness;
+       anon_prio = vmscan_swappiness(sc);
+       file_prio = 200 - vmscan_swappiness(sc);
  
         /*
          * OK, so we have swap space and a fair amount of page cache
@@ -1813,10 +1957,11 @@ out:
                 scan = zone_nr_lru_pages(zone, sc, l);
                 if (priority || noswap) {
                         scan >>= priority;
+                       if (!scan && force_scan)
+                               scan = SWAP_CLUSTER_MAX;
                         scan = div64_u64(scan * fraction[file], denominator);
                 }
-               nr[l] = nr_scan_try_batch(scan,
-                                         &reclaim_stat->nr_saved_scan[l]);
+               nr[l] = scan;
         }
  }
  
@@ -1839,16 +1984,28 @@ static inline bool should_continue_reclaim(struct zone *zone,
         if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
                 return false;
  
-       /*
-        * If we failed to reclaim and have scanned the full list, stop.
-        * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
-        *       faster but obviously would be less likely to succeed
-        *       allocation. If this is desirable, use GFP_REPEAT to decide
-        *       if both reclaimed and scanned should be checked or just
-        *       reclaimed
-        */
-       if (!nr_reclaimed && !nr_scanned)
-               return false;
+       /* Consider stopping depending on scan and reclaim activity */
+       if (sc->gfp_mask & __GFP_REPEAT) {
+               /*
+                * For __GFP_REPEAT allocations, stop reclaiming if the
+                * full LRU list has been scanned and we are still failing
+                * to reclaim pages. This full LRU scan is potentially
+                * expensive but a __GFP_REPEAT caller really wants to succeed
+                */
+               if (!nr_reclaimed && !nr_scanned)
+                       return false;
+       } else {
+               /*
+                * For non-__GFP_REPEAT allocations which can presumably
+                * fail without consequence, stop if we failed to reclaim
+                * any pages from the last SWAP_CLUSTER_MAX number of
+                * pages that were scanned. This will return to the
+                * caller faster at the risk that reclaim/compaction and
+                * the resulting allocation attempt fail
+                */
+               if (!nr_reclaimed)
+                       return false;
+       }
  
         /*
          * If we have not reclaimed enough pages for compaction and the
@@ -1880,14 +2037,16 @@ static void shrink_zone(int priority, struct zone *zone,
         unsigned long nr[NR_LRU_LISTS];
         unsigned long nr_to_scan;
         enum lru_list l;
-       unsigned long nr_reclaimed;
+       unsigned long nr_reclaimed, nr_scanned;
         unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-       unsigned long nr_scanned = sc->nr_scanned;
+       struct blk_plug plug;
  
  restart:
         nr_reclaimed = 0;
+       nr_scanned = sc->nr_scanned;
         get_scan_count(zone, sc, nr, priority);
  
+       blk_start_plug(&plug);
         while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                         nr[LRU_INACTIVE_FILE]) {
                 for_each_evictable_lru(l) {
@@ -1911,6 +2070,7 @@ restart:
                 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                         break;
         }
+       blk_finish_plug(&plug);
         sc->nr_reclaimed += nr_reclaimed;
  
         /*
@@ -1943,12 +2103,19 @@ restart:
   *
   * If a zone is deemed to be full of pinned pages then just give it a light
   * scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is either ready to begin or deferred.
+ * This indicates to the caller that it should retry the allocation or fail.
   */
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
                                         struct scan_control *sc)
  {
         struct zoneref *z;
         struct zone *zone;
+       unsigned long nr_soft_reclaimed;
+       unsigned long nr_soft_scanned;
+       bool should_abort_reclaim = false;
  
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                         gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1963,10 +2130,42 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
                                 continue;
                         if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                 continue;       /* Let kswapd poll it */
+                       if (COMPACTION_BUILD) {
+                               /*
+                                * If we already have plenty of memory free for
+                                * compaction in this zone, don't free any more.
+                                * Even though compaction is invoked for any
+                                * non-zero order, only frequent costly order
+                                * reclamation is disruptive enough to become a
+                                * noticeable problem, like transparent huge page
+                                * allocations.
+                                */
+                               if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+                                       (compaction_suitable(zone, sc->order) ||
+                                        compaction_deferred(zone))) {
+                                       should_abort_reclaim = true;
+                                       continue;
+                               }
+                       }
+                       /*
+                        * This steals pages from memory cgroups over softlimit
+                        * and returns the number of reclaimed pages and
+                        * scanned pages. This works for global memory pressure
+                        * and balancing, not for a memcg's limit.
+                        */
+                       nr_soft_scanned = 0;
+                       nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+                                               sc->order, sc->gfp_mask,
+                                               &nr_soft_scanned);
+                       sc->nr_reclaimed += nr_soft_reclaimed;
+                       sc->nr_scanned += nr_soft_scanned;
+                       /* need some check to avoid more shrink_zone() */
                 }
  
                 shrink_zone(priority, zone, sc);
         }
+
+       return should_abort_reclaim;
  }
  
  static bool zone_reclaimable(struct zone *zone)
@@ -1974,17 +2173,12 @@ static bool zone_reclaimable(struct zone *zone)
         return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
  }
  
-/*
- * As hibernation is going on, kswapd is freezed so that it can't mark
- * the zone into all_unreclaimable. It can't handle OOM during hibernation.
- * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
- */
+/* All zones in zonelist are unreclaimable? */
  static bool all_unreclaimable(struct zonelist *zonelist,
                 struct scan_control *sc)
  {
         struct zoneref *z;
         struct zone *zone;
-       bool all_unreclaimable = true;
  
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
                         gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1992,13 +2186,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
                         continue;
                 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                         continue;
-               if (zone_reclaimable(zone)) {
-                       all_unreclaimable = false;
-                       break;
-               }
+               if (!zone->all_unreclaimable)
+                       return false;
         }
  
-       return all_unreclaimable;
+       return true;
  }
  
  /*
@@ -2018,7 +2210,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
   *             else, the number of pages reclaimed
   */
  static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
-                                       struct scan_control *sc)
+                                       struct scan_control *sc,
+                                       struct shrink_control *shrink)
  {
         int priority;
         unsigned long total_scanned = 0;
@@ -2036,8 +2229,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
         for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                 sc->nr_scanned = 0;
                 if (!priority)
-                       disable_swap_token();
-               shrink_zones(priority, zonelist, sc);
+                       disable_swap_token(sc->mem_cgroup);
+               if (shrink_zones(priority, zonelist, sc))
+                       break;
+
                 /*
                  * Don't shrink slabs when reclaiming memory from
                  * over limit cgroups
@@ -2052,7 +2247,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                 lru_pages += zone_reclaimable_pages(zone);
                         }
  
-                       shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+                       shrink_slab(shrink, sc->nr_scanned, lru_pages);
                         if (reclaim_state) {
                                 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
                                 reclaim_state->reclaimed_slab = 0;
@@ -2071,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                  */
                 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
                 if (total_scanned > writeback_threshold) {
-                       wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+                       wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+                                               WB_REASON_TRY_TO_FREE_PAGES);
                         sc->may_writepage = 1;
                 }
  
@@ -2081,7 +2277,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                         struct zone *preferred_zone;
  
                         first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
-                                                       NULL, &preferred_zone);
+                                               &cpuset_current_mems_allowed,
+                                               &preferred_zone);
                         wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
                 }
         }
@@ -2093,6 +2290,14 @@ out:
         if (sc->nr_reclaimed)
                 return sc->nr_reclaimed;
  
+       /*
+        * As hibernation is going on, kswapd is frozen so that it can't mark
+        * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
+        * check.
+        */
+       if (oom_killer_disabled)
+               return 0;
+
         /* top priority shrink_zones still had more to do? don't OOM, then */
         if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
                 return 1;
@@ -2110,17 +2315,19 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
                 .may_unmap = 1,
                 .may_swap = 1,
-               .swappiness = vm_swappiness,
                 .order = order,
                 .mem_cgroup = NULL,
                 .nodemask = nodemask,
         };
+       struct shrink_control shrink = {
+               .gfp_mask = sc.gfp_mask,
+       };
  
         trace_mm_vmscan_direct_reclaim_begin(order,
                                 sc.may_writepage,
                                 gfp_mask);
  
-       nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+       nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
  
         trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
  
@@ -2131,18 +2338,19 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
  
  unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                                                 gfp_t gfp_mask, bool noswap,
-                                               unsigned int swappiness,
-                                               struct zone *zone)
+                                               struct zone *zone,
+                                               unsigned long *nr_scanned)
  {
         struct scan_control sc = {
+               .nr_scanned = 0,
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
                 .may_writepage = !laptop_mode,
                 .may_unmap = 1,
                 .may_swap = !noswap,
-               .swappiness = swappiness,
                 .order = 0,
                 .mem_cgroup = mem,
         };
+
         sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                         (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
  
@@ -2161,36 +2369,46 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
  
         trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
  
+       *nr_scanned = sc.nr_scanned;
         return sc.nr_reclaimed;
  }
  
  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                                            gfp_t gfp_mask,
-                                          bool noswap,
-                                          unsigned int swappiness)
+                                          bool noswap)
  {
         struct zonelist *zonelist;
         unsigned long nr_reclaimed;
+       int nid;
         struct scan_control sc = {
                 .may_writepage = !laptop_mode,
                 .may_unmap = 1,
                 .may_swap = !noswap,
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
-               .swappiness = swappiness,
                 .order = 0,
                 .mem_cgroup = mem_cont,
                 .nodemask = NULL, /* we don't care the placement */
+               .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+                               (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+       };
+       struct shrink_control shrink = {
+               .gfp_mask = sc.gfp_mask,
         };
  
-       sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
-                       (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
-       zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+       /*
+        * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+        * take care of from where we get pages. So the node where we start the
+        * scan does not need to be the current node.
+        */
+       nid = mem_cgroup_select_victim_node(mem_cont);
+
+       zonelist = NODE_DATA(nid)->node_zonelists;
  
         trace_mm_vmscan_memcg_reclaim_begin(0,
                                             sc.may_writepage,
                                             sc.gfp_mask);
  
-       nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+       nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
  
         trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
  
@@ -2198,38 +2416,88 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
  }
  #endif
  
+/*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+ * by the callers classzone_idx are added to balanced_pages. The total of
+ * balanced pages must be at least 25% of the zones allowed by classzone_idx
+ * for the node to be considered balanced. Forcing all zones to be balanced
+ * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * The choice of 25% is due to
+ *   o a 16M DMA zone that is balanced will not balance a zone on any
+ *     reasonable sized machine
+ *   o On all other machines, the top zone must be at least a reasonable
+ *     percentage of the middle zones. For example, on 32-bit x86, highmem
+ *     would need to be at least 256M for it to balance a whole node.
+ *     Similarly, on x86-64 the Normal zone would need to be at least 1G
+ *     to balance a node on its own. These seemed like reasonable ratios.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
+                                               int classzone_idx)
+{
+       unsigned long present_pages = 0;
+       int i;
+
+       for (i = 0; i <= classzone_idx; i++)
+               present_pages += pgdat->node_zones[i].present_pages;
+
+       /* A special case here: if zone has no page, we think it's balanced */
+       return balanced_pages >= (present_pages >> 2);
+}
+
  /* is kswapd sleeping prematurely? */
-static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+                                       int classzone_idx)
  {
         int i;
+       unsigned long balanced = 0;
+       bool all_zones_ok = true;
  
         /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
         if (remaining)
-               return 1;
+               return true;
  
-       /* If after HZ/10, a zone is below the high mark, it's premature */
-       for (i = 0; i < pgdat->nr_zones; i++) {
+       /* Check the watermark levels */
+       for (i = 0; i <= classzone_idx; i++) {
                 struct zone *zone = pgdat->node_zones + i;
  
                 if (!populated_zone(zone))
                         continue;
  
-               if (zone->all_unreclaimable)
+               /*
+                * balance_pgdat() skips over all_unreclaimable after
+                * DEF_PRIORITY. Effectively, it considers them balanced so
+                * they must be considered balanced here as well if kswapd
+                * is to sleep
+                */
+               if (zone->all_unreclaimable) {
+                       balanced += zone->present_pages;
                         continue;
+               }
  
                 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-                                                               0, 0))
-                       return 1;
+                                                       i, 0))
+                       all_zones_ok = false;
+               else
+                       balanced += zone->present_pages;
         }
  
-       return 0;
+       /*
+        * For high-order requests, the balanced zones must contain at least
+        * 25% of the node's pages for kswapd to sleep. For order-0, all zones
+        * must be balanced
+        */
+       if (order)
+               return !pgdat_balanced(pgdat, balanced, classzone_idx);
+       else
+               return !all_zones_ok;
  }
  
  /*
   * For kswapd, balance_pgdat() will work across all this node's zones until
   * they are all at high_wmark_pages(zone).
   *
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
   *
   * There is special handling here for zones which are full of pinned pages.
   * This can happen if the pages are all mlocked, or if they are all used by
@@ -2246,13 +2514,18 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
   * interoperates with the page allocator fallback scheme to ensure that aging
   * of pages is balanced across the zones.
   */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
+                                                       int *classzone_idx)
  {
         int all_zones_ok;
+       unsigned long balanced;
         int priority;
         int i;
+       int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
         unsigned long total_scanned;
         struct reclaim_state *reclaim_state = current->reclaim_state;
+       unsigned long nr_soft_reclaimed;
+       unsigned long nr_soft_scanned;
         struct scan_control sc = {
                 .gfp_mask = GFP_KERNEL,
                 .may_unmap = 1,
@@ -2262,10 +2535,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                  * we want to put equal scanning pressure on each zone.
                  */
                 .nr_to_reclaim = ULONG_MAX,
-               .swappiness = vm_swappiness,
                 .order = order,
                 .mem_cgroup = NULL,
         };
+       struct shrink_control shrink = {
+               .gfp_mask = sc.gfp_mask,
+       };
  loop_again:
         total_scanned = 0;
         sc.nr_reclaimed = 0;
@@ -2273,15 +2548,15 @@ loop_again:
         count_vm_event(PAGEOUTRUN);
  
         for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-               int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
                 unsigned long lru_pages = 0;
                 int has_under_min_watermark_zone = 0;
  
                 /* The swap token gets in the way of swapout... */
                 if (!priority)
-                       disable_swap_token();
+                       disable_swap_token(NULL);
  
                 all_zones_ok = 1;
+               balanced = 0;
  
                 /*
                  * Scan in the highmem->dma direction for the highest
@@ -2308,6 +2583,9 @@ loop_again:
                                         high_wmark_pages(zone), 0, 0)) {
                                 end_zone = i;
                                 break;
+                       } else {
+                               /* If balanced, clear the congested flag */
+                               zone_clear_flag(zone, ZONE_CONGESTED);
                         }
                 }
                 if (i < 0)
@@ -2331,6 +2609,7 @@ loop_again:
                 for (i = 0; i <= end_zone; i++) {
                         struct zone *zone = pgdat->node_zones + i;
                         int nr_slab;
+                       unsigned long balance_gap;
  
                         if (!populated_zone(zone))
                                 continue;
@@ -2340,28 +2619,42 @@ loop_again:
  
                         sc.nr_scanned = 0;
  
+                       nr_soft_scanned = 0;
                         /*
                          * Call soft limit reclaim before calling shrink_zone.
-                        * For now we ignore the return value
                          */
-                       mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+                       nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+                                                       order, sc.gfp_mask,
+                                                       &nr_soft_scanned);
+                       sc.nr_reclaimed += nr_soft_reclaimed;
+                       total_scanned += nr_soft_scanned;
  
                         /*
-                        * We put equal pressure on every zone, unless one
-                        * zone has way too many pages free already.
+                        * We put equal pressure on every zone, unless
+                        * one zone has way too many pages free
+                        * already. The "too many pages" is defined
+                        * as the high wmark plus a "gap" where the
+                        * gap is either the low watermark or 1%
+                        * of the zone, whichever is smaller.
                          */
+                       balance_gap = min(low_wmark_pages(zone),
+                               (zone->present_pages +
+                                       KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+                               KSWAPD_ZONE_BALANCE_GAP_RATIO);
                         if (!zone_watermark_ok_safe(zone, order,
-                                       8*high_wmark_pages(zone), end_zone, 0))
+                                       high_wmark_pages(zone) + balance_gap,
+                                       end_zone, 0)) {
                                 shrink_zone(priority, zone, &sc);
-                       reclaim_state->reclaimed_slab = 0;
-                       nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
-                                               lru_pages);
-                       sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-                       total_scanned += sc.nr_scanned;
-                       if (zone->all_unreclaimable)
-                               continue;
-                       if (nr_slab == 0 && !zone_reclaimable(zone))
-                               zone->all_unreclaimable = 1;
+
+                               reclaim_state->reclaimed_slab = 0;
+                               nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+                               sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+                               total_scanned += sc.nr_scanned;
+
+                               if (nr_slab == 0 && !zone_reclaimable(zone))
+                                       zone->all_unreclaimable = 1;
+                       }
+
                         /*
                          * If we've done a decent amount of scanning and
                          * the reclaim ratio is low, start doing writepage
@@ -2371,14 +2664,11 @@ loop_again:
                             total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                 sc.may_writepage = 1;
  
-                       /*
-                        * Compact the zone for higher orders to reduce
-                        * latencies for higher-order allocations that
-                        * would ordinarily call try_to_compact_pages()
-                        */
-                       if (sc.order > PAGE_ALLOC_COSTLY_ORDER)
-                               compact_zone_order(zone, sc.order, sc.gfp_mask,
-                                                       false);
+                       if (zone->all_unreclaimable) {
+                               if (end_zone && end_zone == i)
+                                       end_zone--;
+                               continue;
+                       }
  
                         if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone), end_zone, 0)) {
@@ -2400,10 +2690,12 @@ loop_again:
                                  * spectulatively avoid congestion waits
                                  */
                                 zone_clear_flag(zone, ZONE_CONGESTED);
+                               if (i <= *classzone_idx)
+                                       balanced += zone->present_pages;
                         }
  
                 }
-               if (all_zones_ok)
+               if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
                         break;          /* kswapd: all done */
                 /*
                  * OK, kswapd is getting into trouble.  Take a nap, then take
@@ -2426,7 +2718,13 @@ loop_again:
                         break;
         }
  out:
-       if (!all_zones_ok) {
+
+       /*
+        * order-0: All zones must meet high watermark for a balanced node
+        * high-order: Balanced zones must make up at least 25% of the node
+        *             for the node to be balanced
+        */
+       if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
                 cond_resched();
  
                 try_to_freeze();
@@ -2451,10 +2749,49 @@ out:
                 goto loop_again;
         }
  
-       return sc.nr_reclaimed;
+       /*
+        * If kswapd was reclaiming at a higher order, it has the option of
+        * sleeping without all zones being balanced. Before it does, it must
+        * ensure that the watermarks for order-0 on *all* zones are met and
+        * that the congestion flags are cleared. The congestion flag must
+        * be cleared as kswapd is the only mechanism that clears the flag
+        * and it is potentially going to sleep here.
+        */
+       if (order) {
+               for (i = 0; i <= end_zone; i++) {
+                       struct zone *zone = pgdat->node_zones + i;
+
+                       if (!populated_zone(zone))
+                               continue;
+
+                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+                               continue;
+
+                       /* Confirm the zone is balanced for order-0 */
+                       if (!zone_watermark_ok(zone, 0,
+                                       high_wmark_pages(zone), 0, 0)) {
+                               order = sc.order = 0;
+                               goto loop_again;
+                       }
+
+                       /* If balanced, clear the congested flag */
+                       zone_clear_flag(zone, ZONE_CONGESTED);
+                       if (i <= *classzone_idx)
+                               balanced += zone->present_pages;
+               }
+       }
+
+       /*
+        * Return the order we were reclaiming at so sleeping_prematurely()
+        * makes a decision on the order we were last reclaiming at. However,
+        * if another caller entered the allocator slow path while kswapd
+        * was awake, order will remain at the higher level
+        */
+       *classzone_idx = end_zone;
+       return order;
  }
  
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  {
         long remaining = 0;
         DEFINE_WAIT(wait);
@@ -2465,7 +2802,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order)
         prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
  
         /* Try to sleep for a short interval */
-       if (!sleeping_prematurely(pgdat, order, remaining)) {
+       if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
                 remaining = schedule_timeout(HZ/10);
                 finish_wait(&pgdat->kswapd_wait, &wait);
                 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2475,7 +2812,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order)
          * After a short sleep, check if it was a premature sleep. If not, then
          * go fully to sleep until explicitly woken up.
          */
-       if (!sleeping_prematurely(pgdat, order, remaining)) {
+       if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
                 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
  
                 /*
@@ -2513,7 +2850,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order)
   */
  static int kswapd(void *p)
  {
-       unsigned long order;
+       unsigned long order, new_order;
+       unsigned balanced_order;
+       int classzone_idx, new_classzone_idx;
+       int balanced_classzone_idx;
         pg_data_t *pgdat = (pg_data_t*)p;
         struct task_struct *tsk = current;
  
@@ -2543,22 +2883,42 @@ static int kswapd(void *p)
         tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
         set_freezable();
  
-       order = 0;
+       order = new_order = 0;
+       balanced_order = 0;
+       classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+       balanced_classzone_idx = classzone_idx;
         for ( ; ; ) {
-               unsigned long new_order;
                 int ret;
  
-               new_order = pgdat->kswapd_max_order;
-               pgdat->kswapd_max_order = 0;
-               if (order < new_order) {
+               /*
+                * If the last balance_pgdat was unsuccessful it's unlikely a
+                * new request of a similar or harder type will succeed soon
+                * so consider going to sleep on the basis we reclaimed at
+                */
+               if (balanced_classzone_idx >= new_classzone_idx &&
+                                       balanced_order == new_order) {
+                       new_order = pgdat->kswapd_max_order;
+                       new_classzone_idx = pgdat->classzone_idx;
+                       pgdat->kswapd_max_order =  0;
+                       pgdat->classzone_idx = pgdat->nr_zones - 1;
+               }
+
+               if (order < new_order || classzone_idx > new_classzone_idx) {
                         /*
                          * Don't sleep if someone wants a larger 'order'
-                        * allocation
+                        * allocation or has tighter zone constraints
                          */
                         order = new_order;
+                       classzone_idx = new_classzone_idx;
                 } else {
-                       kswapd_try_to_sleep(pgdat, order);
+                       kswapd_try_to_sleep(pgdat, balanced_order,
+                                               balanced_classzone_idx);
                         order = pgdat->kswapd_max_order;
+                       classzone_idx = pgdat->classzone_idx;
+                       new_order = order;
+                       new_classzone_idx = classzone_idx;
+                       pgdat->kswapd_max_order = 0;
+                       pgdat->classzone_idx = pgdat->nr_zones - 1;
                 }
  
                 ret = try_to_freeze();
@@ -2571,7 +2931,9 @@ static int kswapd(void *p)
                  */
                 if (!ret) {
                         trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       balance_pgdat(pgdat, order);
+                       balanced_classzone_idx = classzone_idx;
+                       balanced_order = balance_pgdat(pgdat, order,
+                                               &balanced_classzone_idx);
                 }
         }
         return 0;
@@ -2580,7 +2942,7 @@ static int kswapd(void *p)
  /*
   * A zone is low on free memory, so wake its kswapd task to service it.
   */
-void wakeup_kswapd(struct zone *zone, int order)
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
  {
         pg_data_t *pgdat;
  
@@ -2590,8 +2952,10 @@ void wakeup_kswapd(struct zone *zone, int order)
         if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                 return;
         pgdat = zone->zone_pgdat;
-       if (pgdat->kswapd_max_order < order)
+       if (pgdat->kswapd_max_order < order) {
                 pgdat->kswapd_max_order = order;
+               pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
+       }
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
         if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
@@ -2655,10 +3019,12 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
                 .may_writepage = 1,
                 .nr_to_reclaim = nr_to_reclaim,
                 .hibernation_mode = 1,
-               .swappiness = vm_swappiness,
                 .order = 0,
         };
-       struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+       struct shrink_control shrink = {
+               .gfp_mask = sc.gfp_mask,
+       };
+       struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
         struct task_struct *p = current;
         unsigned long nr_reclaimed;
  
@@ -2667,7 +3033,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
  
-       nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+       nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
  
         p->reclaim_state = NULL;
         lockdep_clear_current_reclaim_state();
@@ -2839,9 +3205,11 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 .nr_to_reclaim = max_t(unsigned long, nr_pages,
                                        SWAP_CLUSTER_MAX),
                 .gfp_mask = gfp_mask,
-               .swappiness = vm_swappiness,
                 .order = order,
         };
+       struct shrink_control shrink = {
+               .gfp_mask = sc.gfp_mask,
+       };
         unsigned long nr_slab_pages0, nr_slab_pages1;
  
         cond_resched();
@@ -2883,7 +3251,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                         unsigned long lru_pages = zone_reclaimable_pages(zone);
  
                         /* No reclaimable slab or very low memory pressure */
-                       if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+                       if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
                                 break;
  
                         /* Freed enough memory */
@@ -2985,158 +3353,66 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
         return 1;
  }
  
+#ifdef CONFIG_SHMEM
  /**
- * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
- * @page: page to check evictability and move to appropriate lru list
- * @zone: zone page is in
+ * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
+ * @pages:     array of pages to check
+ * @nr_pages:  number of pages to check
   *
- * Checks a page for evictability and moves the page to the appropriate
- * zone lru list.
+ * Checks pages for evictability and moves them to the appropriate lru list.
   *
- * Restrictions: zone->lru_lock must be held, page must be on LRU and must
- * have PageUnevictable set.
+ * This function is only used for SysV IPC SHM_UNLOCK.
   */
-static void check_move_unevictable_page(struct page *page, struct zone *zone)
+void check_move_unevictable_pages(struct page **pages, int nr_pages)
  {
-       VM_BUG_ON(PageActive(page));
-
-retry:
-       ClearPageUnevictable(page);
-       if (page_evictable(page, NULL)) {
-               enum lru_list l = page_lru_base_type(page);
+       struct zone *zone = NULL;
+       int pgscanned = 0;
+       int pgrescued = 0;
+       int i;
  
-               __dec_zone_state(zone, NR_UNEVICTABLE);
-               list_move(&page->lru, &zone->lru[l].list);
-               mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
-               __inc_zone_state(zone, NR_INACTIVE_ANON + l);
-               __count_vm_event(UNEVICTABLE_PGRESCUED);
-       } else {
-               /*
-                * rotate unevictable list
-                */
-               SetPageUnevictable(page);
-               list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
-               mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
-               if (page_evictable(page, NULL))
-                       goto retry;
-       }
-}
+       for (i = 0; i < nr_pages; i++) {
+               struct page *page = pages[i];
+               struct zone *pagezone;
  
-/**
- * scan_mapping_unevictable_pages - scan an address space for evictable pages
- * @mapping: struct address_space to scan for evictable pages
- *
- * Scan all pages in mapping.  Check unevictable pages for
- * evictability and move them to the appropriate zone lru list.
- */
-void scan_mapping_unevictable_pages(struct address_space *mapping)
-{
-       pgoff_t next = 0;
-       pgoff_t end   = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
-                        PAGE_CACHE_SHIFT;
-       struct zone *zone;
-       struct pagevec pvec;
+               pgscanned++;
+               pagezone = page_zone(page);
+               if (pagezone != zone) {
+                       if (zone)
+                               spin_unlock_irq(&zone->lru_lock);
+                       zone = pagezone;
+                       spin_lock_irq(&zone->lru_lock);
+               }
  
-       if (mapping->nrpages == 0)
-               return;
+               if (!PageLRU(page) || !PageUnevictable(page))
+                       continue;
  
-       pagevec_init(&pvec, 0);
-       while (next < end &&
-               pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
-               int i;
-               int pg_scanned = 0;
-
-               zone = NULL;
-
-               for (i = 0; i < pagevec_count(&pvec); i++) {
-                       struct page *page = pvec.pages[i];
-                       pgoff_t page_index = page->index;
-                       struct zone *pagezone = page_zone(page);
-
-                       pg_scanned++;
-                       if (page_index > next)
-                               next = page_index;
-                       next++;
-
-                       if (pagezone != zone) {
-                               if (zone)
-                                       spin_unlock_irq(&zone->lru_lock);
-                               zone = pagezone;
-                               spin_lock_irq(&zone->lru_lock);
-                       }
+               if (page_evictable(page, NULL)) {
+                       enum lru_list lru = page_lru_base_type(page);
  
-                       if (PageLRU(page) && PageUnevictable(page))
-                               check_move_unevictable_page(page, zone);
+                       VM_BUG_ON(PageActive(page));
+                       ClearPageUnevictable(page);
+                       __dec_zone_state(zone, NR_UNEVICTABLE);
+                       list_move(&page->lru, &zone->lru[lru].list);
+                       mem_cgroup_move_lists(page, LRU_UNEVICTABLE, lru);
+                       __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+                       pgrescued++;
                 }
-               if (zone)
-                       spin_unlock_irq(&zone->lru_lock);
-               pagevec_release(&pvec);
-
-               count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
         }
  
-}
-
-/**
- * scan_zone_unevictable_pages - check unevictable list for evictable pages
- * @zone - zone of which to scan the unevictable list
- *
- * Scan @zone's unevictable LRU lists to check for pages that have become
- * evictable.  Move those that have to @zone's inactive list where they
- * become candidates for reclaim, unless shrink_inactive_zone() decides
- * to reactivate them.  Pages that are still unevictable are rotated
- * back onto @zone's unevictable list.
- */
-#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
-{
-       struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
-       unsigned long scan;
-       unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
-
-       while (nr_to_scan > 0) {
-               unsigned long batch_size = min(nr_to_scan,
-                                               SCAN_UNEVICTABLE_BATCH_SIZE);
-
-               spin_lock_irq(&zone->lru_lock);
-               for (scan = 0;  scan < batch_size; scan++) {
-                       struct page *page = lru_to_page(l_unevictable);
-
-                       if (!trylock_page(page))
-                               continue;
-
-                       prefetchw_prev_lru_page(page, l_unevictable, flags);
-
-                       if (likely(PageLRU(page) && PageUnevictable(page)))
-                               check_move_unevictable_page(page, zone);
-
-                       unlock_page(page);
-               }
+       if (zone) {
+               __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+               __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
                 spin_unlock_irq(&zone->lru_lock);
-
-               nr_to_scan -= batch_size;
         }
  }
+#endif /* CONFIG_SHMEM */
  
-
-/**
- * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
- *
- * A really big hammer:  scan all zones' unevictable LRU lists to check for
- * pages that have become evictable.  Move those back to the zones'
- * inactive list where they become candidates for reclaim.
- * This occurs when, e.g., we have unswappable pages on the unevictable lists,
- * and we add swap to the system.  As such, it runs in the context of a task
- * that has possibly/probably made some previously unevictable pages
- * evictable.
- */
-static void scan_all_zones_unevictable_pages(void)
+static void warn_scan_unevictable_pages(void)
  {
-       struct zone *zone;
-
-       for_each_zone(zone) {
-               scan_zone_unevictable_pages(zone);
-       }
+       printk_once(KERN_WARNING
+                   "The scan_unevictable_pages sysctl/node-interface has been "
+                   "disabled for lack of a legitimate use case.  If you have "
+                   "one, please send an email to linux-mm@kvack.org.\n");
  }
  
  /*
@@ -3149,11 +3425,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
                            void __user *buffer,
                            size_t *length, loff_t *ppos)
  {
+       warn_scan_unevictable_pages();
         proc_doulongvec_minmax(table, write, buffer, length, ppos);
-
-       if (write && *(unsigned long *)table->data)
-               scan_all_zones_unevictable_pages();
-
         scan_unevictable_pages = 0;
         return 0;
  }
@@ -3168,6 +3441,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
                                           struct sysdev_attribute *attr,
                                           char *buf)
  {
+       warn_scan_unevictable_pages();
         return sprintf(buf, "0\n");     /* always zero; should fit... */
  }
  
@@ -3175,19 +3449,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
                                            struct sysdev_attribute *attr,
                                         const char *buf, size_t count)
  {
-       struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
-       struct zone *zone;
-       unsigned long res;
-       unsigned long req = strict_strtoul(buf, 10, &res);
-
-       if (!req)
-               return 1;       /* zero is no-op */
-
-       for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-               if (!populated_zone(zone))
-                       continue;
-               scan_zone_unevictable_pages(zone);
-       }
+       warn_scan_unevictable_pages();
         return 1;
  }