Update to 3.4-final.
[linux-flexiantxendom0-3.2.10.git] / mm / compaction.c
index e005a30..74a8c82 100644 (file)
@@ -35,11 +35,7 @@ struct compact_control {
        unsigned long migrate_pfn;      /* isolate_migratepages search base */
        bool sync;                      /* Synchronous migration */
 
-       /* Account for isolated anon and file pages */
-       unsigned long nr_anon;
-       unsigned long nr_file;
-
-       unsigned int order;             /* order a direct compactor needs */
+       int order;                      /* order a direct compactor needs */
        int migratetype;                /* MOVABLE, RECLAIMABLE etc */
        struct zone *zone;
 };
@@ -144,16 +140,26 @@ static void isolate_freepages(struct zone *zone,
        int nr_freepages = cc->nr_freepages;
        struct list_head *freelist = &cc->freepages;
 
+       /*
+        * Initialise the free scanner. The starting point is where we last
+        * scanned from (or the end of the zone if starting). The low point
+        * is the end of the pageblock the migration scanner is using.
+        */
        pfn = cc->free_pfn;
        low_pfn = cc->migrate_pfn + pageblock_nr_pages;
-       high_pfn = low_pfn;
+
+       /*
+        * If the migration scanner is at the end of the zone, take care
+        * that the free scanner does not accidentally move to the next zone
+        * in the next isolation cycle.
+        */
+       high_pfn = min(low_pfn, pfn);
 
        /*
         * Isolate free pages until enough are available to migrate the
         * pages on cc->migratepages. We stop searching if the migrate
         * and free page scanners meet or enough free pages are isolated.
         */
-       spin_lock_irqsave(&zone->lock, flags);
        for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
                                        pfn -= pageblock_nr_pages) {
                unsigned long isolated;
@@ -176,9 +182,19 @@ static void isolate_freepages(struct zone *zone,
                if (!suitable_migration_target(page))
                        continue;
 
-               /* Found a block suitable for isolating free pages from */
-               isolated = isolate_freepages_block(zone, pfn, freelist);
-               nr_freepages += isolated;
+               /*
+                * Found a block suitable for isolating free pages from. Take
+                * the zone lock with IRQs disabled only now, re-check the
+                * block under the lock and isolate the pages. Locking per
+                * block minimises the time IRQs are disabled.
+                */
+               isolated = 0;
+               spin_lock_irqsave(&zone->lock, flags);
+               if (suitable_migration_target(page)) {
+                       isolated = isolate_freepages_block(zone, pfn, freelist);
+                       nr_freepages += isolated;
+               }
+               spin_unlock_irqrestore(&zone->lock, flags);
 
                /*
                 * Record the highest PFN we isolated pages from. When next
@@ -188,7 +204,6 @@ static void isolate_freepages(struct zone *zone,
                if (isolated)
                        high_pfn = max(high_pfn, pfn);
        }
-       spin_unlock_irqrestore(&zone->lock, flags);
 
        /* split_free_page does not map the pages */
        list_for_each_entry(page, freelist, lru) {
@@ -204,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
 static void acct_isolated(struct zone *zone, struct compact_control *cc)
 {
        struct page *page;
-       unsigned int count[NR_LRU_LISTS] = { 0, };
+       unsigned int count[2] = { 0, };
 
-       list_for_each_entry(page, &cc->migratepages, lru) {
-               int lru = page_lru_base_type(page);
-               count[lru]++;
-       }
+       list_for_each_entry(page, &cc->migratepages, lru)
+               count[!!page_is_file_cache(page)]++;
 
-       cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-       cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
-       __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+       __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -232,16 +243,25 @@ static bool too_many_isolated(struct zone *zone)
        return isolated > (inactive + active) / 2;
 }
 
+/* possible outcome of isolate_migratepages */
+typedef enum {
+       ISOLATE_ABORT,          /* Abort compaction now */
+       ISOLATE_NONE,           /* No pages isolated, continue scanning */
+       ISOLATE_SUCCESS,        /* Pages isolated, migrate */
+} isolate_migrate_t;
+
 /*
  * Isolate all pages that can be migrated from the block pointed to by
  * the migrate scanner within compact_control.
  */
-static unsigned long isolate_migratepages(struct zone *zone,
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
                                        struct compact_control *cc)
 {
        unsigned long low_pfn, end_pfn;
+       unsigned long last_pageblock_nr = 0, pageblock_nr;
        unsigned long nr_scanned = 0, nr_isolated = 0;
        struct list_head *migratelist = &cc->migratepages;
+       isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
 
        /* Do not scan outside zone boundaries */
        low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -252,7 +272,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
        /* Do not cross the free scanner or scan within a memory hole */
        if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
                cc->migrate_pfn = end_pfn;
-               return 0;
+               return ISOLATE_NONE;
        }
 
        /*
@@ -261,29 +281,105 @@ static unsigned long isolate_migratepages(struct zone *zone,
         * delay for some time until fewer pages are isolated
         */
        while (unlikely(too_many_isolated(zone))) {
+               /* async migration should just abort */
+               if (!cc->sync)
+                       return ISOLATE_ABORT;
+
                congestion_wait(BLK_RW_ASYNC, HZ/10);
 
                if (fatal_signal_pending(current))
-                       return 0;
+                       return ISOLATE_ABORT;
        }
 
        /* Time to isolate some pages for migration */
+       cond_resched();
        spin_lock_irq(&zone->lru_lock);
        for (; low_pfn < end_pfn; low_pfn++) {
                struct page *page;
+               bool locked = true;
+
+               /* give a chance to irqs before checking need_resched() */
+               if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+                       spin_unlock_irq(&zone->lru_lock);
+                       locked = false;
+               }
+               if (need_resched() || spin_is_contended(&zone->lru_lock)) {
+                       if (locked)
+                               spin_unlock_irq(&zone->lru_lock);
+                       cond_resched();
+                       spin_lock_irq(&zone->lru_lock);
+                       if (fatal_signal_pending(current))
+                               break;
+               } else if (!locked)
+                       spin_lock_irq(&zone->lru_lock);
+
+               /*
+                * migrate_pfn does not necessarily start aligned to a
+                * pageblock. Ensure that pfn_valid is called when moving
+                * into a new MAX_ORDER_NR_PAGES range in case of large
+                * memory holes within the zone
+                */
+               if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+                       if (!pfn_valid(low_pfn)) {
+                               low_pfn += MAX_ORDER_NR_PAGES - 1;
+                               continue;
+                       }
+               }
+
                if (!pfn_valid_within(low_pfn))
                        continue;
                nr_scanned++;
 
-               /* Get the page and skip if free */
+               /*
+                * Get the page and ensure the page is within the same zone.
+                * See the comment in isolate_freepages about overlapping
+                * nodes. It is deliberate that the new zone lock is not taken
+                * as memory compaction should not move pages between nodes.
+                */
                page = pfn_to_page(low_pfn);
+               if (page_zone(page) != zone)
+                       continue;
+
+               /* Skip if free */
                if (PageBuddy(page))
                        continue;
 
+               /*
+                * For async migration, also only scan in MOVABLE blocks. Async
+                * migration is optimistic to see if the minimum amount of work
+                * satisfies the allocation
+                */
+               pageblock_nr = low_pfn >> pageblock_order;
+               if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+                               get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+                       low_pfn += pageblock_nr_pages;
+                       low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+                       last_pageblock_nr = pageblock_nr;
+                       continue;
+               }
+
+               if (!PageLRU(page))
+                       continue;
+
+               /*
+                * PageLRU is set, and lru_lock excludes isolation,
+                * splitting and collapsing (collapsing has already
+                * happened if PageLRU is set).
+                */
+               if (PageTransHuge(page)) {
+                       low_pfn += (1 << compound_order(page)) - 1;
+                       continue;
+               }
+
+               if (!cc->sync)
+                       mode |= ISOLATE_ASYNC_MIGRATE;
+
                /* Try isolate the page */
-               if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+               if (__isolate_lru_page(page, mode, 0) != 0)
                        continue;
 
+               VM_BUG_ON(PageTransCompound(page));
+
                /* Successfully isolated */
                del_page_from_lru_list(zone, page, page_lru(page));
                list_add(&page->lru, migratelist);
@@ -291,8 +387,10 @@ static unsigned long isolate_migratepages(struct zone *zone,
                nr_isolated++;
 
                /* Avoid isolating too much */
-               if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+               if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+                       ++low_pfn;
                        break;
+               }
        }
 
        acct_isolated(zone, cc);
@@ -302,7 +400,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
 
        trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
-       return cc->nr_migratepages;
+       return ISOLATE_SUCCESS;
 }
 
 /*
@@ -352,10 +450,10 @@ static void update_nr_listpages(struct compact_control *cc)
 }
 
 static int compact_finished(struct zone *zone,
-                                               struct compact_control *cc)
+                           struct compact_control *cc)
 {
        unsigned int order;
-       unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+       unsigned long watermark;
 
        if (fatal_signal_pending(current))
                return COMPACT_PARTIAL;
@@ -364,11 +462,18 @@ static int compact_finished(struct zone *zone,
        if (cc->free_pfn <= cc->migrate_pfn)
                return COMPACT_COMPLETE;
 
-       /* Compaction run is not finished if the watermark is not met */
-       if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+       /*
+        * order == -1 is expected when compacting via
+        * /proc/sys/vm/compact_memory
+        */
+       if (cc->order == -1)
                return COMPACT_CONTINUE;
 
-       if (cc->order == -1)
+       /* Compaction run is not finished if the watermark is not met */
+       watermark = low_wmark_pages(zone);
+       watermark += (1 << cc->order);
+
+       if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
                return COMPACT_CONTINUE;
 
        /* Direct compactor: Is a suitable page free? */
@@ -398,6 +503,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
        unsigned long watermark;
 
        /*
+        * order == -1 is expected when compacting via
+        * /proc/sys/vm/compact_memory
+        */
+       if (order == -1)
+               return COMPACT_CONTINUE;
+
+       /*
         * Watermarks for order-0 must be met for compaction. Note the 2UL.
         * This is because during migration, copies of pages need to be
         * allocated and for a short time, the footprint is higher
@@ -410,7 +522,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
         * fragmentation index determines if allocation failures are due to
         * low memory or external fragmentation
         *
-        * index of -1 implies allocations might succeed dependingon watermarks
+        * index of -1000 implies allocations might succeed depending on
+        * watermarks
         * index towards 0 implies failure is due to lack of memory
         * index towards 1000 implies failure is due to fragmentation
         *
@@ -420,7 +533,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
        if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
                return COMPACT_SKIPPED;
 
-       if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+       if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+           0, 0))
                return COMPACT_PARTIAL;
 
        return COMPACT_CONTINUE;
@@ -450,14 +564,22 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
        while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
                unsigned long nr_migrate, nr_remaining;
+               int err;
 
-               if (!isolate_migratepages(zone, cc))
+               switch (isolate_migratepages(zone, cc)) {
+               case ISOLATE_ABORT:
+                       ret = COMPACT_PARTIAL;
+                       goto out;
+               case ISOLATE_NONE:
                        continue;
+               case ISOLATE_SUCCESS:
+                       ;
+               }
 
                nr_migrate = cc->nr_migratepages;
-               migrate_pages(&cc->migratepages, compaction_alloc,
+               err = migrate_pages(&cc->migratepages, compaction_alloc,
                                (unsigned long)cc, false,
-                               cc->sync);
+                               cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
                update_nr_listpages(cc);
                nr_remaining = cc->nr_migratepages;
 
@@ -469,13 +591,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                                                nr_remaining);
 
                /* Release LRU pages not migrated */
-               if (!list_empty(&cc->migratepages)) {
+               if (err) {
                        putback_lru_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
                }
 
        }
 
+out:
        /* Release free pages and check accounting */
        cc->nr_freepages -= release_freepages(&cc->freepages);
        VM_BUG_ON(cc->nr_freepages != 0);
@@ -483,9 +606,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
        return ret;
 }
 
-unsigned long compact_zone_order(struct zone *zone,
-                                               int order, gfp_t gfp_mask,
-                                               bool sync)
+static unsigned long compact_zone_order(struct zone *zone,
+                                int order, gfp_t gfp_mask,
+                                bool sync)
 {
        struct compact_control cc = {
                .nr_freepages = 0,
@@ -529,7 +652,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
         * made because an assumption is made that the page allocator can satisfy
         * the "cheaper" orders without taking special steps
         */
-       if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+       if (!order || !may_enter_fs || !may_perform_io)
                return rc;
 
        count_vm_event(COMPACTSTALL);
@@ -552,48 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 
 
 /* Compact all zones within a node */
-static int compact_node(int nid)
+static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 {
        int zoneid;
-       pg_data_t *pgdat;
        struct zone *zone;
 
-       if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
-               return -EINVAL;
-       pgdat = NODE_DATA(nid);
-
-       /* Flush pending updates to the LRU lists */
-       lru_add_drain_all();
-
        for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
-               struct compact_control cc = {
-                       .nr_freepages = 0,
-                       .nr_migratepages = 0,
-                       .order = -1,
-               };
 
                zone = &pgdat->node_zones[zoneid];
                if (!populated_zone(zone))
                        continue;
 
-               cc.zone = zone;
-               INIT_LIST_HEAD(&cc.freepages);
-               INIT_LIST_HEAD(&cc.migratepages);
-
-               compact_zone(zone, &cc);
+               cc->nr_freepages = 0;
+               cc->nr_migratepages = 0;
+               cc->zone = zone;
+               INIT_LIST_HEAD(&cc->freepages);
+               INIT_LIST_HEAD(&cc->migratepages);
+
+               if (cc->order == -1 || !compaction_deferred(zone, cc->order))
+                       compact_zone(zone, cc);
+
+               if (cc->order > 0) {
+                       int ok = zone_watermark_ok(zone, cc->order,
+                                               low_wmark_pages(zone), 0, 0);
+                       if (ok && cc->order > zone->compact_order_failed)
+                               zone->compact_order_failed = cc->order + 1;
+                       /* Currently async compaction is never deferred. */
+                       else if (!ok && cc->sync)
+                               defer_compaction(zone, cc->order);
+               }
 
-               VM_BUG_ON(!list_empty(&cc.freepages));
-               VM_BUG_ON(!list_empty(&cc.migratepages));
+               VM_BUG_ON(!list_empty(&cc->freepages));
+               VM_BUG_ON(!list_empty(&cc->migratepages));
        }
 
        return 0;
 }
 
+int compact_pgdat(pg_data_t *pgdat, int order)
+{
+       struct compact_control cc = {
+               .order = order,
+               .sync = false,
+       };
+
+       return __compact_pgdat(pgdat, &cc);
+}
+
+static int compact_node(int nid)
+{
+       struct compact_control cc = {
+               .order = -1,
+               .sync = true,
+       };
+
+       return __compact_pgdat(NODE_DATA(nid), &cc);
+}
+
 /* Compact all nodes in the system */
 static int compact_nodes(void)
 {
        int nid;
 
+       /* Flush pending updates to the LRU lists */
+       lru_add_drain_all();
+
        for_each_online_node(nid)
                compact_node(nid);
 
@@ -622,23 +768,30 @@ int sysctl_extfrag_handler(struct ctl_table *table, int write,
 }
 
 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
-ssize_t sysfs_compact_node(struct sys_device *dev,
-                       struct sysdev_attribute *attr,
+ssize_t sysfs_compact_node(struct device *dev,
+                       struct device_attribute *attr,
                        const char *buf, size_t count)
 {
-       compact_node(dev->id);
+       int nid = dev->id;
+
+       if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
+               /* Flush pending updates to the LRU lists */
+               lru_add_drain_all();
+
+               compact_node(nid);
+       }
 
        return count;
 }
-static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
+static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
 
 int compaction_register_node(struct node *node)
 {
-       return sysdev_create_file(&node->sysdev, &attr_compact);
+       return device_create_file(&node->dev, &dev_attr_compact);
 }
 
 void compaction_unregister_node(struct node *node)
 {
-       return sysdev_remove_file(&node->sysdev, &attr_compact);
+       return device_remove_file(&node->dev, &dev_attr_compact);
 }
 #endif /* CONFIG_SYSFS && CONFIG_NUMA */