* rcu_read_unlock()
* start move here.
*/
+
+/* for quick checking without looking up memcg */
+atomic_t memcg_moving __read_mostly;
+
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
+ atomic_inc(&memcg_moving);
atomic_inc(&memcg->moving_account);
synchronize_rcu();
}
* Now, mem_cgroup_clear_mc() may call this function with NULL.
* We check NULL in callee rather than caller.
*/
- if (memcg)
+ if (memcg) {
+ atomic_dec(&memcg_moving);
atomic_dec(&memcg->moving_account);
+ }
}
/*
* 2 routines for checking "mem" is under move_account() or not.
*
- * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
- * for avoiding race in accounting. If true,
+ * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
+ * is used for avoiding races in accounting. If true,
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
* waiting at hith-memory prressure caused by "move".
*/
-static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
+static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
{
VM_BUG_ON(!rcu_read_lock_held());
return atomic_read(&memcg->moving_account) > 0;
* Take this lock when
* - a code tries to modify page's memcg while it's USED.
* - a code tries to modify page state accounting in a memcg.
- * see mem_cgroup_stealed(), too.
+ * see mem_cgroup_stolen(), too.
*/
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
* If this memory cgroup is not under account moving, we don't
* need to take move_lock_page_cgroup(). Because we already hold
* rcu_read_lock(), any calls to move_account will be delayed until
- * rcu_read_unlock() if mem_cgroup_stealed() == true.
+ * rcu_read_unlock() if mem_cgroup_stolen() == true.
*/
- if (!mem_cgroup_stealed(memcg))
+ if (!mem_cgroup_stolen(memcg))
return;
move_lock_mem_cgroup(memcg, flags);
if (action == CPU_ONLINE)
return NOTIFY_OK;
- if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
for_each_mem_cgroup(iter)
static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
struct page *page,
unsigned int nr_pages,
- struct page_cgroup *pc,
enum charge_type ctype,
bool lrucare)
{
+ struct page_cgroup *pc = lookup_page_cgroup(page);
struct zone *uninitialized_var(zone);
bool was_on_lru = false;
bool anon;
{
struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
- struct page_cgroup *pc;
bool oom = true;
int ret;
oom = false;
}
- pc = lookup_page_cgroup(page);
ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
if (ret == -ENOMEM)
return ret;
- __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
+ __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
return 0;
}
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
enum charge_type ctype)
{
- struct page_cgroup *pc;
-
if (mem_cgroup_disabled())
return;
if (!memcg)
return;
cgroup_exclude_rmdir(&memcg->css);
- pc = lookup_page_cgroup(page);
- __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
+ __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
* page. In the case new page is migrated but not remapped, new page's
* mapcount will be finally 0 and we call uncharge in end_migration().
*/
- pc = lookup_page_cgroup(newpage);
if (PageAnon(page))
ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
else if (page_is_file_cache(page))
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
+ __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
return ret;
}
* the newpage may be on LRU(or pagevec for LRU) already. We lock
* LRU while we overwrite pc->mem_cgroup.
*/
- __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
+ __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
}
#ifdef CONFIG_DEBUG_VM
goto try_to_free;
cond_resched();
/* "ret" should also be checked to ensure all lists are empty. */
- } while (memcg->res.usage > 0 || ret);
+ } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
out:
css_put(&memcg->css);
return ret;
lru_add_drain_all();
/* try to free all pages in this cgroup */
shrink = 1;
- while (nr_retries && memcg->res.usage > 0) {
+ while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
int progress;
if (signal_pending(current)) {
break;
default:
BUG();
- break;
}
return val;
}
else
BUG();
- /*
- * Something went wrong if we trying to unregister a threshold
- * if we don't have thresholds
- */
- BUG_ON(!thresholds);
-
if (!thresholds->primary)
goto unlock;
swap_buffers:
/* Swap primary and spare array */
thresholds->spare = thresholds->primary;
+ /* If all events are unregistered, free the spare array */
+ if (!new) {
+ kfree(thresholds->spare);
+ thresholds->spare = NULL;
+ }
+
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
}
/**
- * is_target_pte_for_mc - check a pte whether it is valid for move charge
+ * get_mctgt_type - get target type of moving charge
* @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked
* @ptent: the pte to be checked
};
enum mc_target_type {
- MC_TARGET_NONE, /* not used */
+ MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
};
return page;
}
-static int is_target_pte_for_mc(struct vm_area_struct *vma,
+static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
struct page_cgroup *pc;
- int ret = 0;
+ enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
if (pte_present(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (!page && !ent.val)
- return 0;
+ return ret;
if (page) {
pc = lookup_page_cgroup(page);
/*
return ret;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * We don't consider swapping or file mapped pages because THP does not
+ * support them for now.
+ * Caller should make sure that pmd_trans_huge(pmd) is true.
+ */
+static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+ struct page *page = NULL;
+ struct page_cgroup *pc;
+ enum mc_target_type ret = MC_TARGET_NONE;
+
+ page = pmd_page(pmd);
+ VM_BUG_ON(!page || !PageHead(page));
+ if (!move_anon())
+ return ret;
+ pc = lookup_page_cgroup(page);
+ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+ ret = MC_TARGET_PAGE;
+ if (target) {
+ get_page(page);
+ target->page = page;
+ }
+ }
+ return ret;
+}
+#else
+static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+ return MC_TARGET_NONE;
+}
+#endif
+
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
pte_t *pte;
spinlock_t *ptl;
- split_huge_page_pmd(walk->mm, pmd);
- if (pmd_trans_unstable(pmd))
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
+ mc.precharge += HPAGE_PMD_NR;
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
+ }
+ if (pmd_trans_unstable(pmd))
+ return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
- if (is_target_pte_for_mc(vma, addr, *pte, NULL))
+ if (get_mctgt_type(vma, addr, *pte, NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
struct vm_area_struct *vma = walk->private;
pte_t *pte;
spinlock_t *ptl;
+ enum mc_target_type target_type;
+ union mc_target target;
+ struct page *page;
+ struct page_cgroup *pc;
+
+ /*
+ * We don't take compound_lock() here but no race with splitting thp
+ * happens because:
+ * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
+ * under splitting, which means there's no concurrent thp split,
+ * - if another thread runs into split_huge_page() just after we
+ * entered this if-block, the thread must wait for page table lock
+ * to be unlocked in __split_huge_page_splitting(), where the main
+ * part of thp split is not executed yet.
+ */
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (mc.precharge < HPAGE_PMD_NR) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+ }
+ target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
+ if (target_type == MC_TARGET_PAGE) {
+ page = target.page;
+ if (!isolate_lru_page(page)) {
+ pc = lookup_page_cgroup(page);
+ if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+ pc, mc.from, mc.to,
+ false)) {
+ mc.precharge -= HPAGE_PMD_NR;
+ mc.moved_charge += HPAGE_PMD_NR;
+ }
+ putback_lru_page(page);
+ }
+ put_page(page);
+ }
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+ }
- split_huge_page_pmd(walk->mm, pmd);
if (pmd_trans_unstable(pmd))
return 0;
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
- union mc_target target;
- int type;
- struct page *page;
- struct page_cgroup *pc;
swp_entry_t ent;
if (!mc.precharge)
break;
- type = is_target_pte_for_mc(vma, addr, ptent, &target);
- switch (type) {
+ switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_PAGE:
page = target.page;
if (isolate_lru_page(page))
mc.moved_charge++;
}
putback_lru_page(page);
-put: /* is_target_pte_for_mc() gets the page */
+put: /* get_mctgt_type() gets the page */
put_page(page);
break;
case MC_TARGET_SWAP: