diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6d59a2b..da53a25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
                __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-       else
+       else {
                __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+               nr_pages = -nr_pages; /* for event */
+       }
 
        __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
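For context, the hunk above makes uncharges count toward the per-memcg event counter with a positive value: nr_pages arrives negative on uncharge, one PGPGIN/PGPGOUT event is recorded regardless of page size, and the event counter then advances by the absolute page count. A minimal, hypothetical userspace sketch of that accounting (plain globals stand in for the kernel's per-cpu counters):

    #include <assert.h>

    static long pgpgin, pgpgout, events;

    static void charge_statistics_sketch(long nr_pages)
    {
        if (nr_pages > 0)
            pgpgin++;               /* one PGPGIN event per charge, any size */
        else {
            pgpgout++;              /* one PGPGOUT event per uncharge        */
            nr_pages = -nr_pages;   /* for the event counter                 */
        }
        events += nr_pages;         /* always advances by the page count     */
    }

    int main(void)
    {
        charge_statistics_sketch(512);      /* charge a 2MB THP (512 pages) */
        charge_statistics_sketch(-512);     /* uncharge it again            */
        assert(pgpgin == 1 && pgpgout == 1 && events == 1024);
        return 0;
    }
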
@@ -814,7 +816,8 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
         * removed from global LRU.
         */
        mz = page_cgroup_zoneinfo(pc);
-       MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+       /* Huge page split is done under lru_lock, so we have no races. */
+       MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
        if (mem_cgroup_is_root(pc->mem_cgroup))
                return;
        VM_BUG_ON(list_empty(&pc->lru));
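
The LRU statistics above now subtract 1 << compound_order(page) rather than 1, i.e. the number of base pages the (possibly huge) page covers; the add side in a later hunk mirrors this. A tiny hypothetical sketch of that quantity:

    #include <assert.h>

    /* number of base pages covered by a compound page of the given order */
    static unsigned long nr_base_pages(unsigned int order)
    {
        return 1UL << order;
    }

    int main(void)
    {
        assert(nr_base_pages(0) == 1);      /* ordinary 4K page             */
        assert(nr_base_pages(9) == 512);    /* 2MB THP on x86-64 (order 9)  */
        return 0;
    }
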
@@ -835,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
                return;
 
        pc = lookup_page_cgroup(page);
-       /*
-        * Used bit is set without atomic ops but after smp_wmb().
-        * For making pc->mem_cgroup visible, insert smp_rmb() here.
-        */
-       smp_rmb();
        /* unused or root page is not rotated. */
-       if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+       if (!PageCgroupUsed(pc))
+               return;
+       /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+       smp_rmb();
+       if (mem_cgroup_is_root(pc->mem_cgroup))
                return;
        mz = page_cgroup_zoneinfo(pc);
        list_move(&pc->lru, &mz->lists[lru]);
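
The reordering above (and the matching ones in the next two hunks) pairs the reader-side smp_rmb() with the writer-side smp_wmb() in the commit path: pc->mem_cgroup is published before the PCG_USED bit is set, so a reader must test PageCgroupUsed() first and only then, after a read barrier, dereference pc->mem_cgroup. A hypothetical userspace C11 sketch of that pairing (names are illustrative, not the kernel's):

    #include <assert.h>
    #include <stdatomic.h>
    #include <stddef.h>

    struct pc_sketch {
        int *mem_cgroup;    /* payload published by the writer */
        atomic_int used;    /* stands in for the PCG_USED bit  */
    };

    /* writer, commit path: publish the pointer, then set the flag */
    static void commit_sketch(struct pc_sketch *pc, int *mem)
    {
        pc->mem_cgroup = mem;
        atomic_thread_fence(memory_order_release);  /* ~ smp_wmb() */
        atomic_store_explicit(&pc->used, 1, memory_order_relaxed);
    }

    /* reader, LRU paths: test the flag first, then read the pointer */
    static int *lookup_sketch(struct pc_sketch *pc)
    {
        if (!atomic_load_explicit(&pc->used, memory_order_relaxed))
            return NULL;                            /* not committed yet */
        atomic_thread_fence(memory_order_acquire);  /* ~ smp_rmb() */
        return pc->mem_cgroup;
    }

    int main(void)
    {
        static int memcg;
        struct pc_sketch pc = { .mem_cgroup = NULL, .used = 0 };

        assert(lookup_sketch(&pc) == NULL);
        commit_sketch(&pc, &memcg);
        assert(lookup_sketch(&pc) == &memcg);
        return 0;
    }
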
@@ -856,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
                return;
        pc = lookup_page_cgroup(page);
        VM_BUG_ON(PageCgroupAcctLRU(pc));
-       /*
-        * Used bit is set without atomic ops but after smp_wmb().
-        * For making pc->mem_cgroup visible, insert smp_rmb() here.
-        */
-       smp_rmb();
        if (!PageCgroupUsed(pc))
                return;
-
+       /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+       smp_rmb();
        mz = page_cgroup_zoneinfo(pc);
-       MEM_CGROUP_ZSTAT(mz, lru) += 1;
+       /* Huge page split is done under lru_lock, so we have no races. */
+       MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
        SetPageCgroupAcctLRU(pc);
        if (mem_cgroup_is_root(pc->mem_cgroup))
                return;
@@ -1029,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
                return NULL;
 
        pc = lookup_page_cgroup(page);
-       /*
-        * Used bit is set without atomic ops but after smp_wmb().
-        * For making pc->mem_cgroup visible, insert smp_rmb() here.
-        */
-       smp_rmb();
        if (!PageCgroupUsed(pc))
                return NULL;
-
+       /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+       smp_rmb();
        mz = page_cgroup_zoneinfo(pc);
        if (!mz)
                return NULL;
@@ -1118,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
        return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns true if @mem can be charged @bytes without exceeding its
+ * limit, false otherwise.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+       if (!res_counter_check_margin(&mem->res, bytes))
+               return false;
+       if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+               return false;
+       return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
        struct cgroup *cgrp = memcg->css.cgroup;
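
mem_cgroup_check_margin() relies on res_counter_check_margin(), which comes from a companion change to the resource counter code; conceptually it tests whether charging @bytes more would still fit under the counter's limit. A hypothetical userspace model of the combined check (the structures and names below are stand-ins, not the kernel's API):

    #include <assert.h>
    #include <stdbool.h>

    struct counter_sketch {
        unsigned long usage;
        unsigned long limit;
    };

    /* true if charging `bytes` more would still fit under the limit */
    static bool counter_has_margin(const struct counter_sketch *c,
                                   unsigned long bytes)
    {
        return c->limit - c->usage >= bytes;
    }

    static bool check_margin_sketch(const struct counter_sketch *res,
                                    const struct counter_sketch *memsw,
                                    bool do_swap_account, unsigned long bytes)
    {
        if (!counter_has_margin(res, bytes))
            return false;
        if (do_swap_account && !counter_has_margin(memsw, bytes))
            return false;
        return true;
    }

    int main(void)
    {
        struct counter_sketch res   = { .usage =  96UL << 20, .limit = 128UL << 20 };
        struct counter_sketch memsw = { .usage = 120UL << 20, .limit = 128UL << 20 };

        /* 16MB fits the memory limit, but not memory+swap when accounted */
        assert(check_margin_sketch(&res, &memsw, false, 16UL << 20));
        assert(!check_margin_sketch(&res, &memsw, true, 16UL << 20));
        return 0;
    }
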
@@ -1614,7 +1626,7 @@ void mem_cgroup_update_page_stat(struct page *page,
        if (unlikely(!mem || !PageCgroupUsed(pc)))
                goto out;
        /* pc->mem_cgroup is unstable ? */
-       if (unlikely(mem_cgroup_stealed(mem))) {
+       if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
                /* take a lock against to access pc->mem_cgroup */
                move_lock_page_cgroup(pc, &flags);
                need_unlock = true;
@@ -1839,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
                if (likely(!ret))
                        return CHARGE_OK;
 
+               res_counter_uncharge(&mem->res, csize);
                mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
                flags |= MEM_CGROUP_RECLAIM_NOSWAP;
        } else
                mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-       if (csize > PAGE_SIZE) /* change csize and retry */
+       /*
+        * csize can be either a huge page (HPAGE_SIZE), a batch of
+        * regular pages (CHARGE_SIZE), or a single regular page
+        * (PAGE_SIZE).
+        *
+        * Never reclaim on behalf of optional batching; retry with a
+        * single page instead.
+        */
+       if (csize == CHARGE_SIZE)
                return CHARGE_RETRY;
 
        if (!(gfp_mask & __GFP_WAIT))
                return CHARGE_WOULDBLOCK;
 
        ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-                                       gfp_mask, flags);
+                                             gfp_mask, flags);
+       if (mem_cgroup_check_margin(mem_over_limit, csize))
+               return CHARGE_RETRY;
        /*
-        * try_to_free_mem_cgroup_pages() might not give us a full
-        * picture of reclaim. Some pages are reclaimed and might be
-        * moved to swap cache or just unmapped from the cgroup.
-        * Check the limit again to see if the reclaim reduced the
-        * current usage of the cgroup before giving up
+        * Even though the limit is exceeded at this point, reclaim
+        * may have been able to free some pages.  Retry the charge
+        * before killing the task.
+        *
+        * Only for regular pages, though: huge pages are rather
+        * unlikely to succeed so close to the limit, and we fall back
+        * to regular pages anyway in case of failure.
         */
-       if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+       if (csize == PAGE_SIZE && ret)
                return CHARGE_RETRY;
 
        /*
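
Taken together, the changes to __mem_cgroup_do_charge() above distinguish the three possible charge sizes: a batched charge (CHARGE_SIZE) never reclaims and is simply retried as a single page, a single page is retried when reclaim made progress or the margin now suffices, and a huge page gives up early since the fault path falls back to regular pages anyway. A hypothetical condensation of that decision ladder (the constants and helper are stand-ins; the real function continues with OOM handling not shown here):

    #include <assert.h>

    #define PAGE_SIZE_SK   4096UL
    #define CHARGE_SIZE_SK (32 * PAGE_SIZE_SK)    /* illustrative batch size */
    #define HPAGE_SIZE_SK  (512 * PAGE_SIZE_SK)   /* 2MB huge page on x86-64 */

    enum charge_result { CHARGE_OK, CHARGE_RETRY, CHARGE_WOULDBLOCK, CHARGE_NOMEM };

    static enum charge_result charge_policy_sketch(unsigned long csize,
                                                   int can_wait,
                                                   int reclaim_made_progress,
                                                   int margin_now_fits)
    {
        if (csize == CHARGE_SIZE_SK)    /* batching is optional: never reclaim, */
            return CHARGE_RETRY;        /* retry with a single page instead     */
        if (!can_wait)                  /* !(gfp_mask & __GFP_WAIT)             */
            return CHARGE_WOULDBLOCK;
        /* ... hierarchical reclaim runs here in the real function ... */
        if (margin_now_fits)            /* reclaim freed enough for this charge */
            return CHARGE_RETRY;
        if (csize == PAGE_SIZE_SK && reclaim_made_progress)
            return CHARGE_RETRY;        /* single page: worth another attempt   */
        return CHARGE_NOMEM;            /* huge page or no progress: give up    */
    }

    int main(void)
    {
        /* a batched charge that hit the limit is retried as a single page */
        assert(charge_policy_sketch(CHARGE_SIZE_SK, 1, 0, 0) == CHARGE_RETRY);
        /* a huge page is not retried after reclaim so close to the limit  */
        assert(charge_policy_sketch(HPAGE_SIZE_SK, 1, 1, 0) == CHARGE_NOMEM);
        return 0;
    }
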
@@ -2083,14 +2107,27 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
        return mem;
 }
 
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
-                                        struct page_cgroup *pc,
-                                        enum charge_type ctype)
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+                                      struct page_cgroup *pc,
+                                      enum charge_type ctype,
+                                      int page_size)
 {
+       int nr_pages = page_size >> PAGE_SHIFT;
+
+       /* try_charge() can return NULL to *memcg, taking care of it. */
+       if (!mem)
+               return;
+
+       lock_page_cgroup(pc);
+       if (unlikely(PageCgroupUsed(pc))) {
+               unlock_page_cgroup(pc);
+               mem_cgroup_cancel_charge(mem, page_size);
+               return;
+       }
+       /*
+        * We don't need lock_page_cgroup() on tail pages because they are
+        * not accessed by any other context at this point.
+        */
        pc->mem_cgroup = mem;
        /*
         * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2114,43 +2151,57 @@ static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
                break;
        }
 
-       mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), 1);
+       mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
+       unlock_page_cgroup(pc);
+       /*
+        * "charge_statistics" updated the event counter. Then, check it
+        * and insert the ancestor (and the ancestor's ancestors) into the
+        * softlimit RB-tree if they exceed the softlimit.
+        */
+       memcg_check_events(mem, pc->page);
 }
 
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
-                                      struct page_cgroup *pc,
-                                      enum charge_type ctype,
-                                      int page_size)
-{
-       int i;
-       int count = page_size >> PAGE_SHIFT;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-       /* try_charge() can return NULL to *memcg, taking care of it. */
-       if (!mem)
-               return;
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+                       (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set that flag here. We are
+ * under zone->lru_lock, 'splitting on pmd' and compound_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+       struct page_cgroup *head_pc = lookup_page_cgroup(head);
+       struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+       unsigned long flags;
 
-       lock_page_cgroup(pc);
-       if (unlikely(PageCgroupUsed(pc))) {
-               unlock_page_cgroup(pc);
-               mem_cgroup_cancel_charge(mem, page_size);
+       if (mem_cgroup_disabled())
                return;
-       }
-
        /*
-        * we don't need page_cgroup_lock about tail pages, becase they are not
-        * accessed by any other context at this point.
+        * We have no races with charge/uncharge but will have races with
+        * page state accounting.
         */
-       for (i = 0; i < count; i++)
-               ____mem_cgroup_commit_charge(mem, pc + i, ctype);
+       move_lock_page_cgroup(head_pc, &flags);
 
-       unlock_page_cgroup(pc);
-       /*
-        * "charge_statistics" updated event counter. Then, check it.
-        * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-        * if they exceeds softlimit.
-        */
-       memcg_check_events(mem, pc->page);
+       tail_pc->mem_cgroup = head_pc->mem_cgroup;
+       smp_wmb(); /* see __commit_charge() */
+       if (PageCgroupAcctLRU(head_pc)) {
+               enum lru_list lru;
+               struct mem_cgroup_per_zone *mz;
+
+               /*
+                * LRU flags cannot be copied because the tail page must be
+                * added to the LRU by the generic call path, which will invoke
+                * our hook. We hold lru_lock, so reduce the counter directly.
+                */
+               lru = page_lru(head);
+               mz = page_cgroup_zoneinfo(head_pc);
+               MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+       }
+       tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+       move_unlock_page_cgroup(head_pc, &flags);
 }
+#endif
 
 /**
  * __mem_cgroup_move_account - move account of the page
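
A hypothetical userspace illustration of the flag handling in mem_cgroup_split_huge_fixup() above: the tail page inherits the head's page_cgroup flags (so it becomes "used"), minus the bits that must not be copied across a split. The bit names below merely mirror the mask and are not the kernel's definitions:

    #include <assert.h>

    enum { PCG_LOCK, PCG_MOVE_LOCK, PCG_ACCT_LRU, PCG_MIGRATION, PCG_USED, PCG_CACHE };

    #define NOCOPY_AT_SPLIT ((1UL << PCG_LOCK) | (1UL << PCG_MOVE_LOCK) | \
                             (1UL << PCG_ACCT_LRU) | (1UL << PCG_MIGRATION))

    int main(void)
    {
        unsigned long head_flags = (1UL << PCG_USED) | (1UL << PCG_CACHE) |
                                   (1UL << PCG_ACCT_LRU) | (1UL << PCG_LOCK);
        unsigned long tail_flags = head_flags & ~NOCOPY_AT_SPLIT;

        /* the tail keeps USED and CACHE, but not ACCT_LRU or the lock bit */
        assert(tail_flags == ((1UL << PCG_USED) | (1UL << PCG_CACHE)));
        return 0;
    }
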
@@ -2170,8 +2221,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  */
 
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
-       struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+       struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+       int charge_size)
 {
+       int nr_pages = charge_size >> PAGE_SHIFT;
+
        VM_BUG_ON(from == to);
        VM_BUG_ON(PageLRU(pc->page));
        VM_BUG_ON(!page_is_cgroup_locked(pc));
@@ -2185,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
                __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
                preempt_enable();
        }
-       mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -1);
+       mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
        if (uncharge)
                /* This is not "cancel", but cancel_charge does all we need. */
-               mem_cgroup_cancel_charge(from, PAGE_SIZE);
+               mem_cgroup_cancel_charge(from, charge_size);
 
        /* caller should have done css_get */
        pc->mem_cgroup = to;
-       mem_cgroup_charge_statistics(to, PageCgroupCache(pc), 1);
+       mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
        /*
         * We charges against "to" which may not have any tasks. Then, "to"
         * can be under rmdir(). But in current implementation, caller of
@@ -2207,15 +2261,24 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
  * __mem_cgroup_move_account()
  */
 static int mem_cgroup_move_account(struct page_cgroup *pc,
-               struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+               struct mem_cgroup *from, struct mem_cgroup *to,
+               bool uncharge, int charge_size)
 {
        int ret = -EINVAL;
        unsigned long flags;
+       /*
+        * The page is isolated from the LRU, so the collapse function
+        * will not handle it. But a page split can still happen, so do
+        * this check under compound_lock(), which the caller must
+        * hold.
+        */
+       if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+               return -EBUSY;
 
        lock_page_cgroup(pc);
        if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
                move_lock_page_cgroup(pc, &flags);
-               __mem_cgroup_move_account(pc, from, to, uncharge);
+               __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
                move_unlock_page_cgroup(pc, &flags);
                ret = 0;
        }
@@ -2240,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
        struct cgroup *cg = child->css.cgroup;
        struct cgroup *pcg = cg->parent;
        struct mem_cgroup *parent;
+       int page_size = PAGE_SIZE;
+       unsigned long flags;
        int ret;
 
        /* Is ROOT ? */
@@ -2252,15 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
        if (isolate_lru_page(page))
                goto put;
 
+       if (PageTransHuge(page))
+               page_size = HPAGE_SIZE;
+
        parent = mem_cgroup_from_cont(pcg);
-       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
-                                     PAGE_SIZE);
+       ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+                               &parent, false, page_size);
        if (ret || !parent)
                goto put_back;
 
-       ret = mem_cgroup_move_account(pc, child, parent, true);
+       if (page_size > PAGE_SIZE)
+               flags = compound_lock_irqsave(page);
+
+       ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
        if (ret)
-               mem_cgroup_cancel_charge(parent, PAGE_SIZE);
+               mem_cgroup_cancel_charge(parent, page_size);
+
+       if (page_size > PAGE_SIZE)
+               compound_unlock_irqrestore(page, flags);
 put_back:
        putback_lru_page(page);
 put:
@@ -2279,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask, enum charge_type ctype)
 {
        struct mem_cgroup *mem = NULL;
+       int page_size = PAGE_SIZE;
        struct page_cgroup *pc;
+       bool oom = true;
        int ret;
-       int page_size = PAGE_SIZE;
 
        if (PageTransHuge(page)) {
                page_size <<= compound_order(page);
                VM_BUG_ON(!PageTransHuge(page));
+               /*
+                * Never OOM-kill a process for a huge page.  The
+                * fault handler will fall back to regular pages.
+                */
+               oom = false;
        }
 
        pc = lookup_page_cgroup(page);
@@ -2294,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                return 0;
        prefetchw(pc);
 
-       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
+       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
        if (ret || !mem)
                return ret;
 
@@ -2545,7 +2625,6 @@ direct_uncharge:
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
-       int i;
        int count;
        struct page_cgroup *pc;
        struct mem_cgroup *mem = NULL;
@@ -2595,8 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                break;
        }
 
-       for (i = 0; i < count; i++)
-               mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -1);
+       mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
 
        ClearPageCgroupUsed(pc);
        /*
@@ -4843,7 +4921,7 @@ retry:
                                goto put;
                        pc = lookup_page_cgroup(page);
                        if (!mem_cgroup_move_account(pc,
-                                               mc.from, mc.to, false)) {
+                                       mc.from, mc.to, false, PAGE_SIZE)) {
                                mc.precharge--;
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
@@ -4982,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
 static int __init enable_swap_account(char *s)
 {
        /* consider enabled if no parameter or 1 is given */
-       if (!s || !strcmp(s, "1"))
+       if (!(*s) || !strcmp(s, "=1"))
                really_do_swap_account = 1;
-       else if (!strcmp(s, "0"))
+       else if (!strcmp(s, "=0"))
                really_do_swap_account = 0;
        return 1;
 }
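
The comparison strings gain a leading '=' because, with __setup("swapaccount", ...), the handler is passed the unconsumed remainder of the boot parameter including the '=' sign: "" for a bare "swapaccount", "=1"/"=0" for "swapaccount=1"/"swapaccount=0". The old comparisons against "1"/"0" therefore never matched. A hypothetical userspace model of the fixed parsing (the __setup calling convention described here is my reading of the obsolete-parameter path, not spelled out in the patch):

    #include <assert.h>
    #include <string.h>

    static int really_do_swap_account = 1;

    static int enable_swap_account_sketch(const char *s)
    {
        /* consider enabled if no value or "=1" is given */
        if (!*s || !strcmp(s, "=1"))
            really_do_swap_account = 1;
        else if (!strcmp(s, "=0"))
            really_do_swap_account = 0;
        return 1;
    }

    int main(void)
    {
        enable_swap_account_sketch("=0");   /* boot: swapaccount=0 */
        assert(really_do_swap_account == 0);
        enable_swap_account_sketch("");     /* boot: swapaccount   */
        assert(really_do_swap_account == 1);
        return 0;
    }
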
@@ -4992,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-       enable_swap_account("0");
+       printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+       enable_swap_account("=0");
        return 1;
 }
 __setup("noswapaccount", disable_swap_account);