Update to 3.4-final.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 91cee9d..33dc256 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -26,7 +26,6 @@
 #include <linux/buffer_head.h> /* for try_to_release_page(),
                                        buffer_heads_over_limit */
 #include <linux/mm_inline.h>
-#include <linux/pagevec.h>
 #include <linux/backing-dev.h>
 #include <linux/rmap.h>
 #include <linux/topology.h>
@@ -103,8 +102,11 @@ struct scan_control {
         */
        reclaim_mode_t reclaim_mode;
 
-       /* Which cgroup do we reclaim from */
-       struct mem_cgroup *mem_cgroup;
+       /*
+        * The memory cgroup that hit its limit and as a result is the
+        * primary target of this reclaim invocation.
+        */
+       struct mem_cgroup *target_mem_cgroup;
 
        /*
         * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -113,6 +115,11 @@ struct scan_control {
        nodemask_t      *nodemask;
 };
 
+struct mem_cgroup_zone {
+       struct mem_cgroup *mem_cgroup;
+       struct zone *zone;
+};
+
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 #ifdef ARCH_HAS_PREFETCH
@@ -153,28 +160,45 @@ static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
-#define scanning_global_lru(sc)        (!(sc)->mem_cgroup)
+static bool global_reclaim(struct scan_control *sc)
+{
+       return !sc->target_mem_cgroup;
+}
+
+static bool scanning_global_lru(struct mem_cgroup_zone *mz)
+{
+       return !mz->mem_cgroup;
+}
 #else
-#define scanning_global_lru(sc)        (1)
+static bool global_reclaim(struct scan_control *sc)
+{
+       return true;
+}
+
+static bool scanning_global_lru(struct mem_cgroup_zone *mz)
+{
+       return true;
+}
 #endif
 
-static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
-                                                 struct scan_control *sc)
+static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
 {
-       if (!scanning_global_lru(sc))
-               return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
 
-       return &zone->reclaim_stat;
+       return &mz->zone->reclaim_stat;
 }
 
-static unsigned long zone_nr_lru_pages(struct zone *zone,
-                               struct scan_control *sc, enum lru_list lru)
+static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
+                                      enum lru_list lru)
 {
-       if (!scanning_global_lru(sc))
-               return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
-                               zone_to_nid(zone), zone_idx(zone), BIT(lru));
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
+                                                   zone_to_nid(mz->zone),
+                                                   zone_idx(mz->zone),
+                                                   BIT(lru));
 
-       return zone_page_state(zone, NR_LRU_BASE + lru);
+       return zone_page_state(mz->zone, NR_LRU_BASE + lru);
 }
 
 
@@ -183,7 +207,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
  */
 void register_shrinker(struct shrinker *shrinker)
 {
-       shrinker->nr = 0;
+       atomic_long_set(&shrinker->nr_in_batch, 0);
        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
        up_write(&shrinker_rwsem);
@@ -247,25 +271,26 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 
        list_for_each_entry(shrinker, &shrinker_list, list) {
                unsigned long long delta;
-               unsigned long total_scan;
-               unsigned long max_pass;
+               long total_scan;
+               long max_pass;
                int shrink_ret = 0;
                long nr;
                long new_nr;
                long batch_size = shrinker->batch ? shrinker->batch
                                                  : SHRINK_BATCH;
 
+               max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+               if (max_pass <= 0)
+                       continue;
+
                /*
                 * copy the current shrinker scan count into a local variable
                 * and zero it so that other concurrent shrinker invocations
                 * don't also do this scanning work.
                 */
-               do {
-                       nr = shrinker->nr;
-               } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+               nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
 
                total_scan = nr;
-               max_pass = do_shrinker_shrink(shrinker, shrink, 0);
                delta = (4 * nr_pages_scanned) / shrinker->seeks;
                delta *= max_pass;
                do_div(delta, lru_pages + 1);
@@ -325,12 +350,11 @@ unsigned long shrink_slab(struct shrink_control *shrink,
                 * manner that handles concurrent updates. If we exhausted the
                 * scan, there is no need to do an update.
                 */
-               do {
-                       nr = shrinker->nr;
-                       new_nr = total_scan + nr;
-                       if (total_scan <= 0)
-                               break;
-               } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+               if (total_scan > 0)
+                       new_nr = atomic_long_add_return(total_scan,
+                                       &shrinker->nr_in_batch);
+               else
+                       new_nr = atomic_long_read(&shrinker->nr_in_batch);
 
                trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
        }
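
The two shrink_slab() hunks above replace the open-coded cmpxchg() retry loops with a single atomic exchange: one caller claims the entire deferred count, does its scanning, and adds back only what it did not consume. A minimal, self-contained sketch of that hand-off pattern (C11 atomics with illustrative names, not the kernel API):

#include <stdatomic.h>

static atomic_long deferred_work;   /* scan count left over by earlier passes */

/* Claim the whole deferred count so concurrent callers do not rescan it. */
static long claim_deferred(void)
{
        return atomic_exchange(&deferred_work, 0);
}

/* Hand back only the portion this pass did not consume. */
static long return_unscanned(long total_scan)
{
        if (total_scan > 0)
                return atomic_fetch_add(&deferred_work, total_scan) + total_scan;
        return atomic_load(&deferred_work);
}

Returning the updated count mirrors the new_nr value that shrink_slab() passes to its tracepoint after the update.
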
@@ -495,15 +519,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                        return PAGE_ACTIVATE;
                }
 
-               /*
-                * Wait on writeback if requested to. This happens when
-                * direct reclaiming a large contiguous area and the
-                * first attempt to free a range of pages fails.
-                */
-               if (PageWriteback(page) &&
-                   (sc->reclaim_mode & RECLAIM_MODE_SYNC))
-                       wait_on_page_writeback(page);
-
                if (!PageWriteback(page)) {
                        /* synchronous write or broken a_ops? */
                        ClearPageReclaim(page);
@@ -642,13 +657,14 @@ redo:
                lru = LRU_UNEVICTABLE;
                add_page_to_unevictable_list(page);
                /*
-                * When racing with an mlock clearing (page is
-                * unlocked), make sure that if the other thread does
-                * not observe our setting of PG_lru and fails
-                * isolation, we see PG_mlocked cleared below and move
+                * When racing with an mlock or AS_UNEVICTABLE clearing
+                * (page is unlocked) make sure that if the other thread
+                * does not observe our setting of PG_lru and fails
+                * isolation/check_move_unevictable_pages,
+                * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
                 * the page back to the evictable list.
                 *
-                * The other side is TestClearPageMlocked().
+                * The other side is TestClearPageMlocked() or shmem_lock().
                 */
                smp_mb();
        }
@@ -685,12 +701,13 @@ enum page_references {
 };
 
 static enum page_references page_check_references(struct page *page,
+                                                 struct mem_cgroup_zone *mz,
                                                  struct scan_control *sc)
 {
        int referenced_ptes, referenced_page;
        unsigned long vm_flags;
 
-       referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
+       referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
        referenced_page = TestClearPageReferenced(page);
 
        /* Lumpy reclaim - ignore references */
@@ -723,7 +740,13 @@ static enum page_references page_check_references(struct page *page,
                 */
                SetPageReferenced(page);
 
-               if (referenced_page)
+               if (referenced_page || referenced_ptes > 1)
+                       return PAGEREF_ACTIVATE;
+
+               /*
+                * Activate file-backed executable pages after first usage.
+                */
+               if (vm_flags & VM_EXEC)
                        return PAGEREF_ACTIVATE;
 
                return PAGEREF_KEEP;
@@ -736,30 +759,15 @@ static enum page_references page_check_references(struct page *page,
        return PAGEREF_RECLAIM;
 }
 
-static noinline_for_stack void free_page_list(struct list_head *free_pages)
-{
-       struct pagevec freed_pvec;
-       struct page *page, *tmp;
-
-       pagevec_init(&freed_pvec, 1);
-
-       list_for_each_entry_safe(page, tmp, free_pages, lru) {
-               list_del(&page->lru);
-               if (!pagevec_add(&freed_pvec, page)) {
-                       __pagevec_free(&freed_pvec);
-                       pagevec_reinit(&freed_pvec);
-               }
-       }
-
-       pagevec_free(&freed_pvec);
-}
-
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
-                                     struct zone *zone,
-                                     struct scan_control *sc)
+                                     struct mem_cgroup_zone *mz,
+                                     struct scan_control *sc,
+                                     int priority,
+                                     unsigned long *ret_nr_dirty,
+                                     unsigned long *ret_nr_writeback)
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
@@ -767,6 +775,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
        unsigned long nr_dirty = 0;
        unsigned long nr_congested = 0;
        unsigned long nr_reclaimed = 0;
+       unsigned long nr_writeback = 0;
 
        cond_resched();
 
@@ -785,7 +794,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        goto keep;
 
                VM_BUG_ON(PageActive(page));
-               VM_BUG_ON(page_zone(page) != zone);
+               VM_BUG_ON(page_zone(page) != mz->zone);
 
                sc->nr_scanned++;
 
@@ -803,13 +812,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
                if (PageWriteback(page)) {
+                       nr_writeback++;
                        /*
-                        * Synchronous reclaim is performed in two passes,
-                        * first an asynchronous pass over the list to
-                        * start parallel writeback, and a second synchronous
-                        * pass to wait for the IO to complete.  Wait here
-                        * for any page for which writeback has already
-                        * started.
+                        * Synchronous reclaim cannot queue pages for
+                        * writeback due to the possibility of stack overflow
+                        * but if it encounters a page under writeback, wait
+                        * for the IO to complete.
                         */
                        if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
                            may_enter_fs)
@@ -820,7 +828,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        }
                }
 
-               references = page_check_references(page, sc);
+               references = page_check_references(page, mz, sc);
                switch (references) {
                case PAGEREF_ACTIVATE:
                        goto activate_locked;
@@ -865,6 +873,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (PageDirty(page)) {
                        nr_dirty++;
 
+                       /*
+                        * Only kswapd can writeback filesystem pages to
+                        * avoid risk of stack overflow but do not writeback
+                        * unless under significant pressure.
+                        */
+                       if (page_is_file_cache(page) &&
+                                       (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+                               /*
+                                * Immediately reclaim when written back.
+                                * Similar in principle to deactivate_page()
+                                * except we already have the page isolated
+                                * and know it's dirty
+                                */
+                               inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+                               SetPageReclaim(page);
+
+                               goto keep_locked;
+                       }
+
                        if (references == PAGEREF_RECLAIM_CLEAN)
                                goto keep_locked;
                        if (!may_enter_fs)
@@ -992,13 +1019,15 @@ keep_lumpy:
         * back off and wait for congestion to clear because further reclaim
         * will encounter the same problem
         */
-       if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
-               zone_set_flag(zone, ZONE_CONGESTED);
+       if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
+               zone_set_flag(mz->zone, ZONE_CONGESTED);
 
-       free_page_list(&free_pages);
+       free_hot_cold_page_list(&free_pages, 1);
 
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
+       *ret_nr_dirty += nr_dirty;
+       *ret_nr_writeback += nr_writeback;
        return nr_reclaimed;
 }
 
@@ -1012,23 +1041,27 @@ keep_lumpy:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 {
+       bool all_lru_mode;
        int ret = -EINVAL;
 
        /* Only take pages on the LRU. */
        if (!PageLRU(page))
                return ret;
 
+       all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+               (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
        /*
         * When checking the active state, we need to be sure we are
         * dealing with comparable boolean values.  Take the logical not
         * of each.
         */
-       if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+       if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
                return ret;
 
-       if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+       if (!all_lru_mode && !!page_is_file_cache(page) != file)
                return ret;
 
        /*
@@ -1041,6 +1074,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
 
        ret = -EBUSY;
 
+       /*
+        * To minimise LRU disruption, the caller can indicate that it only
+        * wants to isolate pages it will be able to operate on without
+        * blocking - clean pages for the most part.
+        *
+        * ISOLATE_CLEAN means that only clean pages should be isolated. This
+        * is used by reclaim when it cannot write to backing storage.
+        *
+        * ISOLATE_ASYNC_MIGRATE is used to indicate that the caller only wants
+        * pages that can be migrated without blocking
+        */
+       if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+               /* All the caller can do on PageWriteback is block */
+               if (PageWriteback(page))
+                       return ret;
+
+               if (PageDirty(page)) {
+                       struct address_space *mapping;
+
+                       /* ISOLATE_CLEAN means only clean pages */
+                       if (mode & ISOLATE_CLEAN)
+                               return ret;
+
+                       /*
+                        * Only pages without mappings or that have a
+                        * ->migratepage callback are possible to migrate
+                        * without blocking
+                        */
+                       mapping = page_mapping(page);
+                       if (mapping && !mapping->a_ops->migratepage)
+                               return ret;
+               }
+       }
+
+       if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+               return ret;
+
        if (likely(get_page_unless_zero(page))) {
                /*
                 * Be careful not to clear PageLRU until after we're
@@ -1065,24 +1135,36 @@ int __isolate_lru_page(struct page *page, int mode, int file)
  * Appropriate locks must be held before calling this function.
  *
  * @nr_to_scan:        The number of pages to look through on the list.
- * @src:       The LRU list to pull pages off.
+ * @mz:                The mem_cgroup_zone to pull pages from.
  * @dst:       The temp list to put pages on to.
- * @scanned:   The number of pages that were scanned.
- * @order:     The caller's attempted allocation order
+ * @nr_scanned:        The number of pages that were scanned.
+ * @sc:                The scan_control struct for this reclaim session
  * @mode:      One of the LRU isolation modes
+ * @active:    True [1] if isolating active pages
  * @file:      True [1] if isolating file [!anon] pages
  *
  * returns how many pages were moved onto *@dst.
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
-               struct list_head *src, struct list_head *dst,
-               unsigned long *scanned, int order, int mode, int file)
+               struct mem_cgroup_zone *mz, struct list_head *dst,
+               unsigned long *nr_scanned, struct scan_control *sc,
+               isolate_mode_t mode, int active, int file)
 {
+       struct lruvec *lruvec;
+       struct list_head *src;
        unsigned long nr_taken = 0;
        unsigned long nr_lumpy_taken = 0;
        unsigned long nr_lumpy_dirty = 0;
        unsigned long nr_lumpy_failed = 0;
        unsigned long scan;
+       int lru = LRU_BASE;
+
+       lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
+       if (active)
+               lru += LRU_ACTIVE;
+       if (file)
+               lru += LRU_FILE;
+       src = &lruvec->lists[lru];
 
        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
                struct page *page;
@@ -1098,22 +1180,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
                switch (__isolate_lru_page(page, mode, file)) {
                case 0:
+                       mem_cgroup_lru_del(page);
                        list_move(&page->lru, dst);
-                       mem_cgroup_del_lru(page);
                        nr_taken += hpage_nr_pages(page);
                        break;
 
                case -EBUSY:
                        /* else it is being freed elsewhere */
                        list_move(&page->lru, src);
-                       mem_cgroup_rotate_lru_list(page, page_lru(page));
                        continue;
 
                default:
                        BUG();
                }
 
-               if (!order)
+               if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
                        continue;
 
                /*
@@ -1127,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 */
                zone_id = page_zone_id(page);
                page_pfn = page_to_pfn(page);
-               pfn = page_pfn & ~((1 << order) - 1);
-               end_pfn = pfn + (1 << order);
+               pfn = page_pfn & ~((1 << sc->order) - 1);
+               end_pfn = pfn + (1 << sc->order);
                for (; pfn < end_pfn; pfn++) {
                        struct page *cursor_page;
 
@@ -1151,18 +1232,22 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                         * anon page which don't already have a swap slot is
                         * pointless.
                         */
-                       if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
+                       if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
                            !PageSwapCache(cursor_page))
                                break;
 
                        if (__isolate_lru_page(cursor_page, mode, file) == 0) {
+                               unsigned int isolated_pages;
+
+                               mem_cgroup_lru_del(cursor_page);
                                list_move(&cursor_page->lru, dst);
-                               mem_cgroup_del_lru(cursor_page);
-                               nr_taken += hpage_nr_pages(page);
-                               nr_lumpy_taken++;
+                               isolated_pages = hpage_nr_pages(cursor_page);
+                               nr_taken += isolated_pages;
+                               nr_lumpy_taken += isolated_pages;
                                if (PageDirty(cursor_page))
-                                       nr_lumpy_dirty++;
+                                       nr_lumpy_dirty += isolated_pages;
                                scan++;
+                               pfn += isolated_pages - 1;
                        } else {
                                /*
                                 * Check if the page is freed already.
@@ -1188,57 +1273,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                        nr_lumpy_failed++;
        }
 
-       *scanned = scan;
+       *nr_scanned = scan;
 
-       trace_mm_vmscan_lru_isolate(order,
+       trace_mm_vmscan_lru_isolate(sc->order,
                        nr_to_scan, scan,
                        nr_taken,
                        nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
-                       mode);
+                       mode, file);
        return nr_taken;
 }
 
-static unsigned long isolate_pages_global(unsigned long nr,
-                                       struct list_head *dst,
-                                       unsigned long *scanned, int order,
-                                       int mode, struct zone *z,
-                                       int active, int file)
-{
-       int lru = LRU_BASE;
-       if (active)
-               lru += LRU_ACTIVE;
-       if (file)
-               lru += LRU_FILE;
-       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
-                                                               mode, file);
-}
-
-/*
- * clear_active_flags() is a helper for shrink_active_list(), clearing
- * any active bits from the pages in the list.
- */
-static unsigned long clear_active_flags(struct list_head *page_list,
-                                       unsigned int *count)
-{
-       int nr_active = 0;
-       int lru;
-       struct page *page;
-
-       list_for_each_entry(page, page_list, lru) {
-               int numpages = hpage_nr_pages(page);
-               lru = page_lru_base_type(page);
-               if (PageActive(page)) {
-                       lru += LRU_ACTIVE;
-                       ClearPageActive(page);
-                       nr_active += numpages;
-               }
-               if (count)
-                       count[lru] += numpages;
-       }
-
-       return nr_active;
-}
-
 /**
  * isolate_lru_page - tries to isolate a page from its LRU list
  * @page: page to isolate from its LRU list
@@ -1298,7 +1342,7 @@ static int too_many_isolated(struct zone *zone, int file,
        if (current_is_kswapd())
                return 0;
 
-       if (!scanning_global_lru(sc))
+       if (!global_reclaim(sc))
                return 0;
 
        if (file) {
@@ -1312,27 +1356,21 @@ static int too_many_isolated(struct zone *zone, int file,
        return isolated > inactive;
 }
 
-/*
- * TODO: Try merging with migrations version of putback_lru_pages
- */
 static noinline_for_stack void
-putback_lru_pages(struct zone *zone, struct scan_control *sc,
-                               unsigned long nr_anon, unsigned long nr_file,
-                               struct list_head *page_list)
+putback_inactive_pages(struct mem_cgroup_zone *mz,
+                      struct list_head *page_list)
 {
-       struct page *page;
-       struct pagevec pvec;
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
-
-       pagevec_init(&pvec, 1);
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
+       struct zone *zone = mz->zone;
+       LIST_HEAD(pages_to_free);
 
        /*
         * Put back any unfreeable pages.
         */
-       spin_lock(&zone->lru_lock);
        while (!list_empty(page_list)) {
+               struct page *page = lru_to_page(page_list);
                int lru;
-               page = lru_to_page(page_list);
+
                VM_BUG_ON(PageLRU(page));
                list_del(&page->lru);
                if (unlikely(!page_evictable(page, NULL))) {
@@ -1349,30 +1387,53 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
                        int numpages = hpage_nr_pages(page);
                        reclaim_stat->recent_rotated[file] += numpages;
                }
-               if (!pagevec_add(&pvec, page)) {
-                       spin_unlock_irq(&zone->lru_lock);
-                       __pagevec_release(&pvec);
-                       spin_lock_irq(&zone->lru_lock);
+               if (put_page_testzero(page)) {
+                       __ClearPageLRU(page);
+                       __ClearPageActive(page);
+                       del_page_from_lru_list(zone, page, lru);
+
+                       if (unlikely(PageCompound(page))) {
+                               spin_unlock_irq(&zone->lru_lock);
+                               (*get_compound_page_dtor(page))(page);
+                               spin_lock_irq(&zone->lru_lock);
+                       } else
+                               list_add(&page->lru, &pages_to_free);
                }
        }
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
-       __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
 
-       spin_unlock_irq(&zone->lru_lock);
-       pagevec_release(&pvec);
+       /*
+        * To save our caller's stack, now use input list for pages to free.
+        */
+       list_splice(&pages_to_free, page_list);
 }
 
-static noinline_for_stack void update_isolated_counts(struct zone *zone,
-                                       struct scan_control *sc,
-                                       unsigned long *nr_anon,
-                                       unsigned long *nr_file,
-                                       struct list_head *isolated_list)
+static noinline_for_stack void
+update_isolated_counts(struct mem_cgroup_zone *mz,
+                      struct list_head *page_list,
+                      unsigned long *nr_anon,
+                      unsigned long *nr_file)
 {
-       unsigned long nr_active;
+       struct zone *zone = mz->zone;
        unsigned int count[NR_LRU_LISTS] = { 0, };
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       unsigned long nr_active = 0;
+       struct page *page;
+       int lru;
 
-       nr_active = clear_active_flags(isolated_list, count);
+       /*
+        * Count pages and clear active flags
+        */
+       list_for_each_entry(page, page_list, lru) {
+               int numpages = hpage_nr_pages(page);
+               lru = page_lru_base_type(page);
+               if (PageActive(page)) {
+                       lru += LRU_ACTIVE;
+                       ClearPageActive(page);
+                       nr_active += numpages;
+               }
+               count[lru] += numpages;
+       }
+
+       preempt_disable();
        __count_vm_events(PGDEACTIVATE, nr_active);
 
        __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1386,15 +1447,14 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
 
        *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
        *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+
        __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
        __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
-
-       reclaim_stat->recent_scanned[0] += *nr_anon;
-       reclaim_stat->recent_scanned[1] += *nr_file;
+       preempt_enable();
 }
 
 /*
- * Returns true if the caller should wait to clean dirty/writeback pages.
+ * Returns true if a direct reclaim should wait on pages under writeback.
  *
  * If we are direct reclaiming for contiguous pages and we do not reclaim
  * everything in the list, try again and wait for writeback IO to complete.
@@ -1416,7 +1476,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
        if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
                return false;
 
-       /* If we have relaimed everything on the isolated list, no stall */
+       /* If we have reclaimed everything on the isolated list, no stall */
        if (nr_freed == nr_taken)
                return false;
 
@@ -1439,8 +1499,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
  * of reclaimed pages
  */
 static noinline_for_stack unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
-                       struct scan_control *sc, int priority, int file)
+shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
+                    struct scan_control *sc, int priority, int file)
 {
        LIST_HEAD(page_list);
        unsigned long nr_scanned;
@@ -1448,6 +1508,11 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        unsigned long nr_taken;
        unsigned long nr_anon;
        unsigned long nr_file;
+       unsigned long nr_dirty = 0;
+       unsigned long nr_writeback = 0;
+       isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
+       struct zone *zone = mz->zone;
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
 
        while (unlikely(too_many_isolated(zone, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1458,15 +1523,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        }
 
        set_reclaim_mode(priority, sc, false);
+       if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+               isolate_mode |= ISOLATE_ACTIVE;
+
        lru_add_drain();
+
+       if (!sc->may_unmap)
+               isolate_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               isolate_mode |= ISOLATE_CLEAN;
+
        spin_lock_irq(&zone->lru_lock);
 
-       if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, 0, file);
+       nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
+                                    sc, isolate_mode, 0, file);
+       if (global_reclaim(sc)) {
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1474,42 +1545,72 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                else
                        __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                               nr_scanned);
-       } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, sc->mem_cgroup,
-                       0, file);
-               /*
-                * mem_cgroup_isolate_pages() keeps track of
-                * scanned pages on its own.
-                */
        }
+       spin_unlock_irq(&zone->lru_lock);
 
-       if (nr_taken == 0) {
-               spin_unlock_irq(&zone->lru_lock);
+       if (nr_taken == 0)
                return 0;
-       }
 
-       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
+       update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
 
-       spin_unlock_irq(&zone->lru_lock);
-
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+       nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
+                                               &nr_dirty, &nr_writeback);
 
        /* Check if we should synchronously wait for writeback */
        if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                set_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+               nr_reclaimed += shrink_page_list(&page_list, mz, sc,
+                                       priority, &nr_dirty, &nr_writeback);
        }
 
-       local_irq_disable();
-       if (current_is_kswapd())
-               __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
-       __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
+       spin_lock_irq(&zone->lru_lock);
+
+       reclaim_stat->recent_scanned[0] += nr_anon;
+       reclaim_stat->recent_scanned[1] += nr_file;
+
+       if (global_reclaim(sc)) {
+               if (current_is_kswapd())
+                       __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
+                                              nr_reclaimed);
+               else
+                       __count_zone_vm_events(PGSTEAL_DIRECT, zone,
+                                              nr_reclaimed);
+       }
 
-       putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+       putback_inactive_pages(mz, &page_list);
+
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+       __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
+       spin_unlock_irq(&zone->lru_lock);
+
+       free_hot_cold_page_list(&page_list, 1);
+
+       /*
+        * If reclaim is isolating dirty pages under writeback, it implies
+        * that the long-lived page allocation rate is exceeding the page
+        * laundering rate. Either the global limits are not being effective
+        * at throttling processes due to the page distribution throughout
+        * zones or there is heavy usage of a slow backing device. The
+        * only option is to throttle from reclaim context which is not ideal
+        * as there is no guarantee the dirtying process is throttled in the
+        * same way balance_dirty_pages() manages.
+        *
+        * This scales the number of dirty pages that must be under writeback
+        * before throttling depending on priority. It is a simple backoff
+        * function that has the most effect in the range DEF_PRIORITY to
+        * DEF_PRIORITY-2, which is the priority at which reclaim is
+        * considered to be in trouble.
+        *
+        * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
+        * DEF_PRIORITY-1  50% must be PageWriteback
+        * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
+        * ...
+        * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+        *                     isolated page is PageWriteback
+        */
+       if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+               wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
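
As a worked example of the backoff above (assuming SWAP_CLUSTER_MAX is 32 in this tree and nr_taken is 32): the threshold nr_taken >> (DEF_PRIORITY - priority) is 32 at DEF_PRIORITY, so every isolated page must be under writeback before throttling; it drops to 16 at DEF_PRIORITY-1 and 8 at DEF_PRIORITY-2; and by DEF_PRIORITY-6 the shift zeroes the threshold, so any isolated page under writeback triggers wait_iff_congested().
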
 
        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
                zone_idx(zone),
@@ -1539,30 +1640,35 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
 static void move_active_pages_to_lru(struct zone *zone,
                                     struct list_head *list,
+                                    struct list_head *pages_to_free,
                                     enum lru_list lru)
 {
        unsigned long pgmoved = 0;
-       struct pagevec pvec;
        struct page *page;
 
-       pagevec_init(&pvec, 1);
-
        while (!list_empty(list)) {
+               struct lruvec *lruvec;
+
                page = lru_to_page(list);
 
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
 
-               list_move(&page->lru, &zone->lru[lru].list);
-               mem_cgroup_add_lru_list(page, lru);
+               lruvec = mem_cgroup_lru_add_list(zone, page, lru);
+               list_move(&page->lru, &lruvec->lists[lru]);
                pgmoved += hpage_nr_pages(page);
 
-               if (!pagevec_add(&pvec, page) || list_empty(list)) {
-                       spin_unlock_irq(&zone->lru_lock);
-                       if (buffer_heads_over_limit)
-                               pagevec_strip(&pvec);
-                       __pagevec_release(&pvec);
-                       spin_lock_irq(&zone->lru_lock);
+               if (put_page_testzero(page)) {
+                       __ClearPageLRU(page);
+                       __ClearPageActive(page);
+                       del_page_from_lru_list(zone, page, lru);
+
+                       if (unlikely(PageCompound(page))) {
+                               spin_unlock_irq(&zone->lru_lock);
+                               (*get_compound_page_dtor(page))(page);
+                               spin_lock_irq(&zone->lru_lock);
+                       } else
+                               list_add(&page->lru, pages_to_free);
                }
        }
        __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1570,41 +1676,42 @@ static void move_active_pages_to_lru(struct zone *zone,
                __count_vm_events(PGDEACTIVATE, pgmoved);
 }
 
-static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-                       struct scan_control *sc, int priority, int file)
+static void shrink_active_list(unsigned long nr_to_scan,
+                              struct mem_cgroup_zone *mz,
+                              struct scan_control *sc,
+                              int priority, int file)
 {
        unsigned long nr_taken;
-       unsigned long pgscanned;
+       unsigned long nr_scanned;
        unsigned long vm_flags;
        LIST_HEAD(l_hold);      /* The pages which were snipped off */
        LIST_HEAD(l_active);
        LIST_HEAD(l_inactive);
        struct page *page;
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
        unsigned long nr_rotated = 0;
+       isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
+       struct zone *zone = mz->zone;
 
        lru_add_drain();
+
+       reset_reclaim_mode(sc);
+
+       if (!sc->may_unmap)
+               isolate_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               isolate_mode |= ISOLATE_CLEAN;
+
        spin_lock_irq(&zone->lru_lock);
-       if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_pages, &l_hold,
-                                               &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
-                                               1, file);
-               zone->pages_scanned += pgscanned;
-       } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
-                                               &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
-                                               sc->mem_cgroup, 1, file);
-               /*
-                * mem_cgroup_isolate_pages() keeps track of
-                * scanned pages on its own.
-                */
-       }
+
+       nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
+                                    isolate_mode, 1, file);
+       if (global_reclaim(sc))
+               zone->pages_scanned += nr_scanned;
 
        reclaim_stat->recent_scanned[file] += nr_taken;
 
-       __count_zone_vm_events(PGREFILL, zone, pgscanned);
+       __count_zone_vm_events(PGREFILL, zone, nr_scanned);
        if (file)
                __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
        else
@@ -1622,7 +1729,15 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                        continue;
                }
 
-               if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
+               if (unlikely(buffer_heads_over_limit)) {
+                       if (page_has_private(page) && trylock_page(page)) {
+                               if (page_has_private(page))
+                                       try_to_release_page(page, 0);
+                               unlock_page(page);
+                       }
+               }
+
+               if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
                        nr_rotated += hpage_nr_pages(page);
                        /*
                         * Identify referenced, file-backed active pages and
@@ -1655,12 +1770,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         */
        reclaim_stat->recent_rotated[file] += nr_rotated;
 
-       move_active_pages_to_lru(zone, &l_active,
+       move_active_pages_to_lru(zone, &l_active, &l_hold,
                                                LRU_ACTIVE + file * LRU_FILE);
-       move_active_pages_to_lru(zone, &l_inactive,
+       move_active_pages_to_lru(zone, &l_inactive, &l_hold,
                                                LRU_BASE   + file * LRU_FILE);
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&zone->lru_lock);
+
+       free_hot_cold_page_list(&l_hold, 1);
 }
 
 #ifdef CONFIG_SWAP
@@ -1685,10 +1802,8 @@ static int inactive_anon_is_low_global(struct zone *zone)
  * Returns true if the zone does not have enough inactive anon pages,
  * meaning some active anon pages need to be deactivated.
  */
-static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
+static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
 {
-       int low;
-
        /*
         * If we don't have swap space, anonymous page deactivation
         * is pointless.
@@ -1696,15 +1811,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
        if (!total_swap_pages)
                return 0;
 
-       if (scanning_global_lru(sc))
-               low = inactive_anon_is_low_global(zone);
-       else
-               low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
-       return low;
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
+                                                      mz->zone);
+
+       return inactive_anon_is_low_global(mz->zone);
 }
 #else
-static inline int inactive_anon_is_low(struct zone *zone,
-                                       struct scan_control *sc)
+static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
 {
        return 0;
 }
@@ -1722,8 +1836,7 @@ static int inactive_file_is_low_global(struct zone *zone)
 
 /**
  * inactive_file_is_low - check if file pages need to be deactivated
- * @zone: zone to check
- * @sc:   scan control of this context
+ * @mz: memory cgroup and zone to check
  *
  * When the system is doing streaming IO, memory pressure here
  * ensures that active file pages get deactivated, until more
@@ -1735,45 +1848,44 @@ static int inactive_file_is_low_global(struct zone *zone)
  * This uses a different ratio than the anonymous pages, because
  * the page cache uses a use-once replacement algorithm.
  */
-static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
+static int inactive_file_is_low(struct mem_cgroup_zone *mz)
 {
-       int low;
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
+                                                      mz->zone);
 
-       if (scanning_global_lru(sc))
-               low = inactive_file_is_low_global(zone);
-       else
-               low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
-       return low;
+       return inactive_file_is_low_global(mz->zone);
 }
 
-static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
-                               int file)
+static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
 {
        if (file)
-               return inactive_file_is_low(zone, sc);
+               return inactive_file_is_low(mz);
        else
-               return inactive_anon_is_low(zone, sc);
+               return inactive_anon_is_low(mz);
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-       struct zone *zone, struct scan_control *sc, int priority)
+                                struct mem_cgroup_zone *mz,
+                                struct scan_control *sc, int priority)
 {
        int file = is_file_lru(lru);
 
        if (is_active_lru(lru)) {
-               if (inactive_list_is_low(zone, sc, file))
-                   shrink_active_list(nr_to_scan, zone, sc, priority, file);
+               if (inactive_list_is_low(mz, file))
+                       shrink_active_list(nr_to_scan, mz, sc, priority, file);
                return 0;
        }
 
-       return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
+       return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
 }
 
-static int vmscan_swappiness(struct scan_control *sc)
+static int vmscan_swappiness(struct mem_cgroup_zone *mz,
+                            struct scan_control *sc)
 {
-       if (scanning_global_lru(sc))
+       if (global_reclaim(sc))
                return vm_swappiness;
-       return mem_cgroup_swappiness(sc->mem_cgroup);
+       return mem_cgroup_swappiness(mz->mem_cgroup);
 }
 
 /*
@@ -1784,32 +1896,32 @@ static int vmscan_swappiness(struct scan_control *sc)
  *
  * nr[0] = anon pages to scan; nr[1] = file pages to scan
  */
-static void get_scan_count(struct zone *zone, struct scan_control *sc,
-                                       unsigned long *nr, int priority)
+static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
+                          unsigned long *nr, int priority)
 {
        unsigned long anon, file, free;
        unsigned long anon_prio, file_prio;
        unsigned long ap, fp;
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
        u64 fraction[2], denominator;
-       enum lru_list l;
+       enum lru_list lru;
        int noswap = 0;
-       int force_scan = 0;
+       bool force_scan = false;
 
-
-       anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-       file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
-       if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
-               /* kswapd does zone balancing and need to scan this zone */
-               if (scanning_global_lru(sc) && current_is_kswapd())
-                       force_scan = 1;
-               /* memcg may have small limit and need to avoid priority drop */
-               if (!scanning_global_lru(sc))
-                       force_scan = 1;
-       }
+       /*
+        * If the zone or memcg is small, nr[l] can be 0.  This
+        * results in no scanning on this priority and a potential
+        * priority drop.  Global direct reclaim can go to the next
+        * zone and tends to have no problems. Global kswapd is for
+        * zone balancing and it needs to scan a minimum amount. When
+        * reclaiming for a memcg, a priority drop can cause high
+        * latencies, so it's better to scan a minimum amount there as
+        * well.
+        */
+       if (current_is_kswapd() && mz->zone->all_unreclaimable)
+               force_scan = true;
+       if (!global_reclaim(sc))
+               force_scan = true;
 
        /* If we have no swap space, do not bother scanning anon pages. */
        if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1820,11 +1932,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                goto out;
        }
 
-       if (scanning_global_lru(sc)) {
-               free  = zone_page_state(zone, NR_FREE_PAGES);
+       anon  = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
+               zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
+       file  = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
+               zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+
+       if (global_reclaim(sc)) {
+               free  = zone_page_state(mz->zone, NR_FREE_PAGES);
                /* If we have very few page cache pages,
                   force-scan anon pages. */
-               if (unlikely(file + free <= high_wmark_pages(zone))) {
+               if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
                        fraction[0] = 1;
                        fraction[1] = 0;
                        denominator = 1;
@@ -1836,8 +1953,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         * With swappiness at 100, anonymous and file have the same priority.
         * This scanning priority is essentially the inverse of IO cost.
         */
-       anon_prio = vmscan_swappiness(sc);
-       file_prio = 200 - vmscan_swappiness(sc);
+       anon_prio = vmscan_swappiness(mz, sc);
+       file_prio = 200 - vmscan_swappiness(mz, sc);
 
        /*
         * OK, so we have swap space and a fair amount of page cache
@@ -1850,7 +1967,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         *
         * anon in [0], file in [1]
         */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&mz->zone->lru_lock);
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
                reclaim_stat->recent_scanned[0] /= 2;
                reclaim_stat->recent_rotated[0] /= 2;
@@ -1871,38 +1988,24 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 
        fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
        fp /= reclaim_stat->recent_rotated[1] + 1;
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&mz->zone->lru_lock);
 
        fraction[0] = ap;
        fraction[1] = fp;
        denominator = ap + fp + 1;
 out:
-       for_each_evictable_lru(l) {
-               int file = is_file_lru(l);
+       for_each_evictable_lru(lru) {
+               int file = is_file_lru(lru);
                unsigned long scan;
 
-               scan = zone_nr_lru_pages(zone, sc, l);
+               scan = zone_nr_lru_pages(mz, lru);
                if (priority || noswap) {
                        scan >>= priority;
-                       scan = div64_u64(scan * fraction[file], denominator);
-               }
-
-               /*
-                * If zone is small or memcg is small, nr[l] can be 0.
-                * This results no-scan on this priority and priority drop down.
-                * For global direct reclaim, it can visit next zone and tend
-                * not to have problems. For global kswapd, it's for zone
-                * balancing and it need to scan a small amounts. When using
-                * memcg, priority drop can cause big latency. So, it's better
-                * to scan small amount. See may_noscan above.
-                */
-               if (!scan && force_scan) {
-                       if (file)
-                               scan = SWAP_CLUSTER_MAX;
-                       else if (!noswap)
+                       if (!scan && force_scan)
                                scan = SWAP_CLUSTER_MAX;
+                       scan = div64_u64(scan * fraction[file], denominator);
                }
-               nr[l] = scan;
+               nr[lru] = scan;
        }
 }
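
The proportional split computed by get_scan_count() above reduces to weighting each LRU type by swappiness and by its recent_scanned/recent_rotated ratio, then scaling each list's size by priority and by its share of the total weight. A simplified, self-contained sketch of the arithmetic (plain C, illustrative names, no locking, memcg or noswap handling):

#include <stdint.h>

#define SWAP_CLUSTER_MAX 32UL

/* index 0 = anon, index 1 = file */
static void scan_split(const unsigned long size[2],     /* pages on each LRU pair */
                       const unsigned long scanned[2],   /* recent_scanned */
                       const unsigned long rotated[2],   /* recent_rotated */
                       int swappiness, int priority, int force_scan,
                       unsigned long nr[2])
{
        uint64_t weight[2], denom;
        int type;

        /* swappiness of 100 gives anon and file the same base priority */
        weight[0] = (uint64_t)(swappiness + 1) * (scanned[0] + 1) / (rotated[0] + 1);
        weight[1] = (uint64_t)(200 - swappiness + 1) * (scanned[1] + 1) / (rotated[1] + 1);
        denom = weight[0] + weight[1] + 1;

        for (type = 0; type < 2; type++) {
                unsigned long scan = size[type] >> priority;

                if (!scan && force_scan)
                        scan = SWAP_CLUSTER_MAX;   /* avoid a useless priority drop */
                nr[type] = (unsigned long)(scan * weight[type] / denom);
        }
}

In the kernel the division uses div64_u64() and the recent_* counters are decayed under zone->lru_lock, as the hunk above shows.
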
 
@@ -1913,7 +2016,7 @@ out:
  * back to the allocator and call try_to_compact_zone(), we ensure that
  * there are enough free pages for it to be likely successful
  */
-static inline bool should_continue_reclaim(struct zone *zone,
+static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
                                        unsigned long nr_reclaimed,
                                        unsigned long nr_scanned,
                                        struct scan_control *sc)
@@ -1953,14 +2056,15 @@ static inline bool should_continue_reclaim(struct zone *zone,
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
-       inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
-                               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
+       if (nr_swap_pages > 0)
+               inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
        if (sc->nr_reclaimed < pages_for_compaction &&
                        inactive_lru_pages > pages_for_compaction)
                return true;
 
        /* If compaction would go ahead or the allocation would succeed, stop */
-       switch (compaction_suitable(zone, sc->order)) {
+       switch (compaction_suitable(mz->zone, sc->order)) {
        case COMPACT_PARTIAL:
        case COMPACT_CONTINUE:
                return false;
@@ -1972,30 +2076,32 @@ static inline bool should_continue_reclaim(struct zone *zone,
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_zone(int priority, struct zone *zone,
-                               struct scan_control *sc)
+static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
+                                  struct scan_control *sc)
 {
        unsigned long nr[NR_LRU_LISTS];
        unsigned long nr_to_scan;
-       enum lru_list l;
+       enum lru_list lru;
        unsigned long nr_reclaimed, nr_scanned;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       struct blk_plug plug;
 
 restart:
        nr_reclaimed = 0;
        nr_scanned = sc->nr_scanned;
-       get_scan_count(zone, sc, nr, priority);
+       get_scan_count(mz, sc, nr, priority);
 
+       blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
-               for_each_evictable_lru(l) {
-                       if (nr[l]) {
+               for_each_evictable_lru(lru) {
+                       if (nr[lru]) {
                                nr_to_scan = min_t(unsigned long,
-                                                  nr[l], SWAP_CLUSTER_MAX);
-                               nr[l] -= nr_to_scan;
+                                                  nr[lru], SWAP_CLUSTER_MAX);
+                               nr[lru] -= nr_to_scan;
 
-                               nr_reclaimed += shrink_list(l, nr_to_scan,
-                                                           zone, sc, priority);
+                               nr_reclaimed += shrink_list(lru, nr_to_scan,
+                                                           mz, sc, priority);
                        }
                }
                /*
@@ -2009,23 +2115,96 @@ restart:
                if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                        break;
        }
+       blk_finish_plug(&plug);
        sc->nr_reclaimed += nr_reclaimed;
 
        /*
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (inactive_anon_is_low(zone, sc))
-               shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+       if (inactive_anon_is_low(mz))
+               shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
 
        /* reclaim/compaction might need reclaim to continue */
-       if (should_continue_reclaim(zone, nr_reclaimed,
+       if (should_continue_reclaim(mz, nr_reclaimed,
                                        sc->nr_scanned - nr_scanned, sc))
                goto restart;
 
        throttle_vm_writeout(sc->gfp_mask);
 }
 
+static void shrink_zone(int priority, struct zone *zone,
+                       struct scan_control *sc)
+{
+       struct mem_cgroup *root = sc->target_mem_cgroup;
+       struct mem_cgroup_reclaim_cookie reclaim = {
+               .zone = zone,
+               .priority = priority,
+       };
+       struct mem_cgroup *memcg;
+
+       memcg = mem_cgroup_iter(root, NULL, &reclaim);
+       do {
+               struct mem_cgroup_zone mz = {
+                       .mem_cgroup = memcg,
+                       .zone = zone,
+               };
+
+               shrink_mem_cgroup_zone(priority, &mz, sc);
+               /*
+                * Limit reclaim has historically picked one memcg and
+                * scanned it with decreasing priority levels until
+                * nr_to_reclaim had been reclaimed.  This priority
+                * cycle is thus over after a single memcg.
+                *
+                * Direct reclaim and kswapd, on the other hand, have
+                * to scan all memory cgroups to fulfill the overall
+                * scan target for the zone.
+                */
+               if (!global_reclaim(sc)) {
+                       mem_cgroup_iter_break(root, memcg);
+                       break;
+               }
+               memcg = mem_cgroup_iter(root, memcg, &reclaim);
+       } while (memcg);
+}
+
+/* Returns true if compaction should go ahead for a high-order request */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+       unsigned long balance_gap, watermark;
+       bool watermark_ok;
+
+       /* Do not consider compaction for orders reclaim is meant to satisfy */
+       if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+               return false;
+
+       /*
+        * Compaction takes time to run and there are potentially other
+        * callers using the pages just freed. Continue reclaiming until
+        * there is a buffer of free pages available to give compaction
+        * a reasonable chance of completing and allocating the page
+        */
+       balance_gap = min(low_wmark_pages(zone),
+               (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+                       KSWAPD_ZONE_BALANCE_GAP_RATIO);
+       watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+       /*
+        * If compaction is deferred, reclaim up to a point where
+        * compaction will have a chance of success when re-enabled
+        */
+       if (compaction_deferred(zone, sc->order))
+               return watermark_ok;
+
+       /* If compaction is not ready to start, keep reclaiming */
+       if (!compaction_suitable(zone, sc->order))
+               return false;
+
+       return watermark_ok;
+}
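
To make that threshold concrete, a rough worked example with hypothetical zone sizes (KSWAPD_ZONE_BALANCE_GAP_RATIO is assumed to be 100 here):

        /*
         * Hypothetical 1GiB zone of 4KiB pages (present_pages = 262144),
         * low_wmark = 1024 pages, high_wmark = 1536 pages, order-9 request:
         *   balance_gap = min(1024, (262144 + 99) / 100) = min(1024, 2622) = 1024
         *   watermark   = 1536 + 1024 + (2UL << 9)        = 3584 pages (~14MiB)
         * zone_watermark_ok_safe() is then checked at order 0 against that
         * value: reclaim keeps going until roughly 14MiB is free, unless
         * compaction is deferred or not yet suitable for this zone.
         */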
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2041,14 +2220,28 @@ restart:
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan and then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
  */
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
                                        struct scan_control *sc)
 {
        struct zoneref *z;
        struct zone *zone;
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
+       bool aborted_reclaim = false;
+
+       /*
+        * If the number of buffer_heads in the machine exceeds the maximum
+        * allowed level, force direct reclaim to scan the highmem zone as
+        * highmem pages could be pinning lowmem pages storing buffer_heads
+        */
+       if (buffer_heads_over_limit)
+               sc->gfp_mask |= __GFP_HIGHMEM;
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2058,11 +2251,26 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
                 * Take care that memory controller reclaiming has only a
                 * small influence on the global LRU.
                 */
-               if (scanning_global_lru(sc)) {
+               if (global_reclaim(sc)) {
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                continue;
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;       /* Let kswapd poll it */
+                       if (COMPACTION_BUILD) {
+                               /*
+                                * If we already have plenty of memory free for
+                                * compaction in this zone, don't free any more.
+                                * Even though compaction is invoked for any
+                                * non-zero order, only frequent costly order
+                                * reclamation is disruptive enough to become a
+                                * noticeable problem, like transparent huge
+                                * page allocations.
+                                */
+                               if (compaction_ready(zone, sc)) {
+                                       aborted_reclaim = true;
+                                       continue;
+                               }
+                       }
                        /*
                         * This steals pages from memory cgroups over softlimit
                         * and returns the number of reclaimed pages and
@@ -2080,6 +2288,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 
                shrink_zone(priority, zone, sc);
        }
+
+       return aborted_reclaim;
 }
 
 static bool zone_reclaimable(struct zone *zone)
@@ -2133,23 +2343,24 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
        struct zoneref *z;
        struct zone *zone;
        unsigned long writeback_threshold;
+       bool aborted_reclaim;
 
-       get_mems_allowed();
        delayacct_freepages_start();
 
-       if (scanning_global_lru(sc))
+       if (global_reclaim(sc))
                count_vm_event(ALLOCSTALL);
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                sc->nr_scanned = 0;
                if (!priority)
-                       disable_swap_token(sc->mem_cgroup);
-               shrink_zones(priority, zonelist, sc);
+                       disable_swap_token(sc->target_mem_cgroup);
+               aborted_reclaim = shrink_zones(priority, zonelist, sc);
+
                /*
                 * Don't shrink slabs when reclaiming memory from
                 * over limit cgroups
                 */
-               if (scanning_global_lru(sc)) {
+               if (global_reclaim(sc)) {
                        unsigned long lru_pages = 0;
                        for_each_zone_zonelist(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask)) {
@@ -2178,7 +2389,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                 */
                writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
                if (total_scanned > writeback_threshold) {
-                       wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+                       wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+                                               WB_REASON_TRY_TO_FREE_PAGES);
                        sc->may_writepage = 1;
                }
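
The flusher wakeup above fires once scanning exceeds the reclaim target by half again; a small worked example, assuming the common direct-reclaim target of SWAP_CLUSTER_MAX (32) pages:

        /*
         * nr_to_reclaim = 32  =>  writeback_threshold = 32 + 32 / 2 = 48
         * Once total_scanned passes 48, the flusher threads are woken with
         * WB_REASON_TRY_TO_FREE_PAGES and may_writepage is set so reclaim
         * itself is allowed to write back dirty pages.
         */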
 
@@ -2196,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 out:
        delayacct_freepages_end();
-       put_mems_allowed();
 
        if (sc->nr_reclaimed)
                return sc->nr_reclaimed;
@@ -2209,8 +2420,12 @@ out:
        if (oom_killer_disabled)
                return 0;
 
+       /* Aborted reclaim to try compaction? don't OOM, then */
+       if (aborted_reclaim)
+               return 1;
+
        /* top priority shrink_zones still had more to do? don't OOM, then */
-       if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
+       if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
                return 1;
 
        return 0;
@@ -2227,7 +2442,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                .may_unmap = 1,
                .may_swap = 1,
                .order = order,
-               .mem_cgroup = NULL,
+               .target_mem_cgroup = NULL,
                .nodemask = nodemask,
        };
        struct shrink_control shrink = {
@@ -2247,7 +2462,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
                                                gfp_t gfp_mask, bool noswap,
                                                struct zone *zone,
                                                unsigned long *nr_scanned)
@@ -2259,7 +2474,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                .may_unmap = 1,
                .may_swap = !noswap,
                .order = 0,
-               .mem_cgroup = mem,
+               .target_mem_cgroup = memcg,
+       };
+       struct mem_cgroup_zone mz = {
+               .mem_cgroup = memcg,
+               .zone = zone,
        };
 
        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2276,7 +2495,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
         * will pick up pages from other mem cgroups as well. We hack
         * the priority and make it zero.
         */
-       shrink_zone(0, zone, &sc);
+       shrink_mem_cgroup_zone(0, &mz, &sc);
 
        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -2284,7 +2503,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
        return sc.nr_reclaimed;
 }
 
-unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                           gfp_t gfp_mask,
                                           bool noswap)
 {
@@ -2297,7 +2516,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                .may_swap = !noswap,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .order = 0,
-               .mem_cgroup = mem_cont,
+               .target_mem_cgroup = memcg,
                .nodemask = NULL, /* we don't care the placement */
                .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2311,7 +2530,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
         * take care of where we get pages from. So the node where we start the
         * scan does not need to be the current node.
         */
-       nid = mem_cgroup_select_victim_node(mem_cont);
+       nid = mem_cgroup_select_victim_node(memcg);
 
        zonelist = NODE_DATA(nid)->node_zonelists;
 
@@ -2327,6 +2546,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 }
 #endif
 
+static void age_active_anon(struct zone *zone, struct scan_control *sc,
+                           int priority)
+{
+       struct mem_cgroup *memcg;
+
+       if (!total_swap_pages)
+               return;
+
+       memcg = mem_cgroup_iter(NULL, NULL, NULL);
+       do {
+               struct mem_cgroup_zone mz = {
+                       .mem_cgroup = memcg,
+                       .zone = zone,
+               };
+
+               if (inactive_anon_is_low(&mz))
+                       shrink_active_list(SWAP_CLUSTER_MAX, &mz,
+                                          sc, priority, 0);
+
+               memcg = mem_cgroup_iter(NULL, memcg, NULL);
+       } while (memcg);
+}
+
 /*
  * pgdat_balanced is used when checking if a node is balanced for high-order
  * allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2447,7 +2689,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                 */
                .nr_to_reclaim = ULONG_MAX,
                .order = order,
-               .mem_cgroup = NULL,
+               .target_mem_cgroup = NULL,
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
@@ -2486,14 +2728,26 @@ loop_again:
                         * Do some background aging of the anon list, to give
                         * pages a chance to be referenced before reclaiming.
                         */
-                       if (inactive_anon_is_low(zone, &sc))
-                               shrink_active_list(SWAP_CLUSTER_MAX, zone,
-                                                       &sc, priority, 0);
+                       age_active_anon(zone, &sc, priority);
+
+                       /*
+                        * If the number of buffer_heads in the machine
+                        * exceeds the maximum allowed level and this node
+                        * has a highmem zone, force kswapd to reclaim from
+                        * it to relieve lowmem pressure.
+                        */
+                       if (buffer_heads_over_limit && is_highmem_idx(i)) {
+                               end_zone = i;
+                               break;
+                       }
 
                        if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), 0, 0)) {
                                end_zone = i;
                                break;
+                       } else {
+                               /* If balanced, clear the congested flag */
+                               zone_clear_flag(zone, ZONE_CONGESTED);
                        }
                }
                if (i < 0)
@@ -2516,7 +2770,7 @@ loop_again:
                 */
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
-                       int nr_slab;
+                       int nr_slab, testorder;
                        unsigned long balance_gap;
 
                        if (!populated_zone(zone))
@@ -2549,7 +2803,21 @@ loop_again:
                                (zone->present_pages +
                                        KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
                                KSWAPD_ZONE_BALANCE_GAP_RATIO);
-                       if (!zone_watermark_ok_safe(zone, order,
+                       /*
+                        * Kswapd reclaims only single pages with compaction
+                        * enabled. Trying too hard to reclaim until contiguous
+                        * free pages have become available can hurt performance
+                        * by evicting too much useful data from memory.
+                        * Do not reclaim more than needed for compaction.
+                        */
+                       testorder = order;
+                       if (COMPACTION_BUILD && order &&
+                                       compaction_suitable(zone, order) !=
+                                               COMPACT_SKIPPED)
+                               testorder = 0;
+
+                       if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
+                                   !zone_watermark_ok_safe(zone, testorder,
                                        high_wmark_pages(zone) + balance_gap,
                                        end_zone, 0)) {
                                shrink_zone(priority, zone, &sc);
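
A concrete reading of the testorder logic above (hypothetical order value):

        /*
         * Hypothetical: kswapd was woken for an order-9 (THP-sized) request.
         * If compaction_suitable(zone, 9) != COMPACT_SKIPPED, the zone already
         * has enough free order-0 memory for compaction to work with, so the
         * watermark check uses testorder = 0 and the contiguous block is left
         * for compaction to assemble rather than for reclaim to free.
         */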
@@ -2578,7 +2846,7 @@ loop_again:
                                continue;
                        }
 
-                       if (!zone_watermark_ok_safe(zone, order,
+                       if (!zone_watermark_ok_safe(zone, testorder,
                                        high_wmark_pages(zone), end_zone, 0)) {
                                all_zones_ok = 0;
                                /*
@@ -2666,6 +2934,8 @@ out:
         * and it is potentially going to sleep here.
         */
        if (order) {
+               int zones_need_compaction = 1;
+
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
 
@@ -2675,6 +2945,11 @@ out:
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;
 
+                       /* Would compaction fail due to lack of free memory? */
+                       if (COMPACTION_BUILD &&
+                           compaction_suitable(zone, order) == COMPACT_SKIPPED)
+                               goto loop_again;
+
                        /* Confirm the zone is balanced for order-0 */
                        if (!zone_watermark_ok(zone, 0,
                                        high_wmark_pages(zone), 0, 0)) {
@@ -2682,9 +2957,17 @@ out:
                                goto loop_again;
                        }
 
+                       /* Check if the memory needs to be defragmented. */
+                       if (zone_watermark_ok(zone, order,
+                                   low_wmark_pages(zone), *classzone_idx, 0))
+                               zones_need_compaction = 0;
+
                        /* If balanced, clear the congested flag */
                        zone_clear_flag(zone, ZONE_CONGESTED);
                }
+
+               if (zones_need_compaction)
+                       compact_pgdat(pgdat, order);
        }
 
        /*
@@ -2757,7 +3040,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
        unsigned long order, new_order;
+       unsigned balanced_order;
        int classzone_idx, new_classzone_idx;
+       int balanced_classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
 
@@ -2788,7 +3073,9 @@ static int kswapd(void *p)
        set_freezable();
 
        order = new_order = 0;
+       balanced_order = 0;
        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+       balanced_classzone_idx = classzone_idx;
        for ( ; ; ) {
                int ret;
 
@@ -2797,7 +3084,8 @@ static int kswapd(void *p)
                 * new request of a similar or harder type will succeed soon
                 * so consider going to sleep on the basis we reclaimed at
                 */
-               if (classzone_idx >= new_classzone_idx && order == new_order) {
+               if (balanced_classzone_idx >= new_classzone_idx &&
+                                       balanced_order == new_order) {
                        new_order = pgdat->kswapd_max_order;
                        new_classzone_idx = pgdat->classzone_idx;
                        pgdat->kswapd_max_order =  0;
@@ -2812,9 +3100,12 @@ static int kswapd(void *p)
                        order = new_order;
                        classzone_idx = new_classzone_idx;
                } else {
-                       kswapd_try_to_sleep(pgdat, order, classzone_idx);
+                       kswapd_try_to_sleep(pgdat, balanced_order,
+                                               balanced_classzone_idx);
                        order = pgdat->kswapd_max_order;
                        classzone_idx = pgdat->classzone_idx;
+                       new_order = order;
+                       new_classzone_idx = classzone_idx;
                        pgdat->kswapd_max_order = 0;
                        pgdat->classzone_idx = pgdat->nr_zones - 1;
                }
@@ -2829,7 +3120,9 @@ static int kswapd(void *p)
                 */
                if (!ret) {
                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       order = balance_pgdat(pgdat, order, &classzone_idx);
+                       balanced_classzone_idx = classzone_idx;
+                       balanced_order = balance_pgdat(pgdat, order,
+                                               &balanced_classzone_idx);
                }
        }
        return 0;
@@ -3249,158 +3542,69 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
        return 1;
 }
 
+#ifdef CONFIG_SHMEM
 /**
- * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
- * @page: page to check evictability and move to appropriate lru list
- * @zone: zone page is in
+ * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
+ * @pages:     array of pages to check
+ * @nr_pages:  number of pages to check
  *
- * Checks a page for evictability and moves the page to the appropriate
- * zone lru list.
+ * Checks pages for evictability and moves them to the appropriate lru list.
  *
- * Restrictions: zone->lru_lock must be held, page must be on LRU and must
- * have PageUnevictable set.
+ * This function is only used for SysV IPC SHM_UNLOCK.
  */
-static void check_move_unevictable_page(struct page *page, struct zone *zone)
+void check_move_unevictable_pages(struct page **pages, int nr_pages)
 {
-       VM_BUG_ON(PageActive(page));
-
-retry:
-       ClearPageUnevictable(page);
-       if (page_evictable(page, NULL)) {
-               enum lru_list l = page_lru_base_type(page);
-
-               __dec_zone_state(zone, NR_UNEVICTABLE);
-               list_move(&page->lru, &zone->lru[l].list);
-               mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
-               __inc_zone_state(zone, NR_INACTIVE_ANON + l);
-               __count_vm_event(UNEVICTABLE_PGRESCUED);
-       } else {
-               /*
-                * rotate unevictable list
-                */
-               SetPageUnevictable(page);
-               list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
-               mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
-               if (page_evictable(page, NULL))
-                       goto retry;
-       }
-}
-
-/**
- * scan_mapping_unevictable_pages - scan an address space for evictable pages
- * @mapping: struct address_space to scan for evictable pages
- *
- * Scan all pages in mapping.  Check unevictable pages for
- * evictability and move them to the appropriate zone lru list.
- */
-void scan_mapping_unevictable_pages(struct address_space *mapping)
-{
-       pgoff_t next = 0;
-       pgoff_t end   = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
-                        PAGE_CACHE_SHIFT;
-       struct zone *zone;
-       struct pagevec pvec;
-
-       if (mapping->nrpages == 0)
-               return;
-
-       pagevec_init(&pvec, 0);
-       while (next < end &&
-               pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
-               int i;
-               int pg_scanned = 0;
-
-               zone = NULL;
+       struct lruvec *lruvec;
+       struct zone *zone = NULL;
+       int pgscanned = 0;
+       int pgrescued = 0;
+       int i;
 
-               for (i = 0; i < pagevec_count(&pvec); i++) {
-                       struct page *page = pvec.pages[i];
-                       pgoff_t page_index = page->index;
-                       struct zone *pagezone = page_zone(page);
+       for (i = 0; i < nr_pages; i++) {
+               struct page *page = pages[i];
+               struct zone *pagezone;
 
-                       pg_scanned++;
-                       if (page_index > next)
-                               next = page_index;
-                       next++;
+               pgscanned++;
+               pagezone = page_zone(page);
+               if (pagezone != zone) {
+                       if (zone)
+                               spin_unlock_irq(&zone->lru_lock);
+                       zone = pagezone;
+                       spin_lock_irq(&zone->lru_lock);
+               }
 
-                       if (pagezone != zone) {
-                               if (zone)
-                                       spin_unlock_irq(&zone->lru_lock);
-                               zone = pagezone;
-                               spin_lock_irq(&zone->lru_lock);
-                       }
+               if (!PageLRU(page) || !PageUnevictable(page))
+                       continue;
 
-                       if (PageLRU(page) && PageUnevictable(page))
-                               check_move_unevictable_page(page, zone);
+               if (page_evictable(page, NULL)) {
+                       enum lru_list lru = page_lru_base_type(page);
+
+                       VM_BUG_ON(PageActive(page));
+                       ClearPageUnevictable(page);
+                       __dec_zone_state(zone, NR_UNEVICTABLE);
+                       lruvec = mem_cgroup_lru_move_lists(zone, page,
+                                               LRU_UNEVICTABLE, lru);
+                       list_move(&page->lru, &lruvec->lists[lru]);
+                       __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+                       pgrescued++;
                }
-               if (zone)
-                       spin_unlock_irq(&zone->lru_lock);
-               pagevec_release(&pvec);
-
-               count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
        }
 
-}
-
-/**
- * scan_zone_unevictable_pages - check unevictable list for evictable pages
- * @zone - zone of which to scan the unevictable list
- *
- * Scan @zone's unevictable LRU lists to check for pages that have become
- * evictable.  Move those that have to @zone's inactive list where they
- * become candidates for reclaim, unless shrink_inactive_zone() decides
- * to reactivate them.  Pages that are still unevictable are rotated
- * back onto @zone's unevictable list.
- */
-#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
-{
-       struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
-       unsigned long scan;
-       unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
-
-       while (nr_to_scan > 0) {
-               unsigned long batch_size = min(nr_to_scan,
-                                               SCAN_UNEVICTABLE_BATCH_SIZE);
-
-               spin_lock_irq(&zone->lru_lock);
-               for (scan = 0;  scan < batch_size; scan++) {
-                       struct page *page = lru_to_page(l_unevictable);
-
-                       if (!trylock_page(page))
-                               continue;
-
-                       prefetchw_prev_lru_page(page, l_unevictable, flags);
-
-                       if (likely(PageLRU(page) && PageUnevictable(page)))
-                               check_move_unevictable_page(page, zone);
-
-                       unlock_page(page);
-               }
+       if (zone) {
+               __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+               __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
                spin_unlock_irq(&zone->lru_lock);
-
-               nr_to_scan -= batch_size;
        }
 }
+#endif /* CONFIG_SHMEM */
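
To show how the new batch interface is meant to be driven, a minimal caller sketch, loosely modelled on the SysV SHM_UNLOCK path this helper serves; the function name and the pagevec batching are illustrative assumptions, and such a caller would live outside this file (it needs linux/pagevec.h, which vmscan.c no longer includes):

        /* Hypothetical caller sketch; not part of this patch. */
        static void unlock_mapping_sketch(struct address_space *mapping)
        {
                struct pagevec pvec;
                pgoff_t index = 0;

                pagevec_init(&pvec, 0);
                while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
                        /* remember where the next batch starts before releasing */
                        index = pvec.pages[pagevec_count(&pvec) - 1]->index + 1;
                        check_move_unevictable_pages(pvec.pages,
                                                     pagevec_count(&pvec));
                        pagevec_release(&pvec);
                        cond_resched();
                }
        }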
 
-
-/**
- * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
- *
- * A really big hammer:  scan all zones' unevictable LRU lists to check for
- * pages that have become evictable.  Move those back to the zones'
- * inactive list where they become candidates for reclaim.
- * This occurs when, e.g., we have unswappable pages on the unevictable lists,
- * and we add swap to the system.  As such, it runs in the context of a task
- * that has possibly/probably made some previously unevictable pages
- * evictable.
- */
-static void scan_all_zones_unevictable_pages(void)
+static void warn_scan_unevictable_pages(void)
 {
-       struct zone *zone;
-
-       for_each_zone(zone) {
-               scan_zone_unevictable_pages(zone);
-       }
+       printk_once(KERN_WARNING
+                   "%s: The scan_unevictable_pages sysctl/node-interface has been "
+                   "disabled for lack of a legitimate use case.  If you have "
+                   "one, please send an email to linux-mm@kvack.org.\n",
+                   current->comm);
 }
 
 /*
@@ -3413,11 +3617,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
                           void __user *buffer,
                           size_t *length, loff_t *ppos)
 {
+       warn_scan_unevictable_pages();
        proc_doulongvec_minmax(table, write, buffer, length, ppos);
-
-       if (write && *(unsigned long *)table->data)
-               scan_all_zones_unevictable_pages();
-
        scan_unevictable_pages = 0;
        return 0;
 }
@@ -3428,45 +3629,34 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
  * a specified node's per zone unevictable lists for evictable pages.
  */
 
-static ssize_t read_scan_unevictable_node(struct sys_device *dev,
-                                         struct sysdev_attribute *attr,
+static ssize_t read_scan_unevictable_node(struct device *dev,
+                                         struct device_attribute *attr,
                                          char *buf)
 {
+       warn_scan_unevictable_pages();
        return sprintf(buf, "0\n");     /* always zero; should fit... */
 }
 
-static ssize_t write_scan_unevictable_node(struct sys_device *dev,
-                                          struct sysdev_attribute *attr,
+static ssize_t write_scan_unevictable_node(struct device *dev,
+                                          struct device_attribute *attr,
                                        const char *buf, size_t count)
 {
-       struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
-       struct zone *zone;
-       unsigned long res;
-       unsigned long req = strict_strtoul(buf, 10, &res);
-
-       if (!req)
-               return 1;       /* zero is no-op */
-
-       for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-               if (!populated_zone(zone))
-                       continue;
-               scan_zone_unevictable_pages(zone);
-       }
+       warn_scan_unevictable_pages();
        return 1;
 }
 
 
-static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
                        read_scan_unevictable_node,
                        write_scan_unevictable_node);
 
 int scan_unevictable_register_node(struct node *node)
 {
-       return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
+       return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
 }
 
 void scan_unevictable_unregister_node(struct node *node)
 {
-       sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
+       device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
 }
 #endif