#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
+#include <linux/random.h>
#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
-#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
/*
* run-time system-wide default policy => local allocation
*/
-struct mempolicy default_policy = {
+static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
.mode = MPOL_PREFERRED,
.flags = MPOL_F_LOCAL,
},
};
-static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
continue;
- if (flags & MPOL_MF_STATS)
- gather_stats(page, private, pte_dirty(*pte));
- else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, private, flags);
else
break;
do {
next = pmd_addr_end(addr, end);
split_huge_page_pmd(vma->vm_mm, pmd);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
flags, private))
unsigned long vmstart;
unsigned long vmend;
- vma = find_vma_prev(mm, start, &prev);
+ vma = find_vma(mm, start);
if (!vma || vma->vm_start > start)
return -EFAULT;
+ prev = vma->vm_prev;
+ if (start > vma->vm_start)
+ prev = vma;
+
for (; vma && vma->vm_start < end; prev = vma, vma = next) {
next = vma->vm_next;
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ if (mpol_equal(vma_policy(vma), new_pol))
+ continue;
+
+ pgoff = vma->vm_pgoff +
+ ((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, new_pol);
+ vma->anon_vma, vma->vm_file, pgoff,
+ new_pol);
if (prev) {
vma = prev;
next = vma->vm_next;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
- false, true);
+ false, MIGRATE_SYNC);
if (err)
putback_lru_pages(&pagelist);
}
* most recent <s, d> pair that moved (s != d). If we find a pair
* that not only moved, but what's better, moved to an empty slot
* (d is not set in tmp), then we break out then, with that pair.
- * Otherwise when we finish scannng from_tmp, we at least have the
+ * Otherwise when we finish scanning from_tmp, we at least have the
* most recent <s, d> pair that moved. If we get all the way through
* the scan of tmp without finding any node that moved, much less
* moved to an empty node, then there is nothing left worth migrating.
err = -ESRCH;
goto out;
}
- mm = get_task_mm(task);
- rcu_read_unlock();
+ get_task_struct(task);
err = -EINVAL;
- if (!mm)
- goto out;
/*
* Check if this process has the right to modify the specified
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- rcu_read_lock();
tcred = __task_cred(task);
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
cred->uid != tcred->suid && cred->uid != tcred->uid &&
!capable(CAP_SYS_NICE)) {
rcu_read_unlock();
err = -EPERM;
- goto out;
+ goto out_put;
}
rcu_read_unlock();
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
- goto out;
+ goto out_put;
}
if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
err = -EINVAL;
- goto out;
+ goto out_put;
}
err = security_task_movememory(task);
if (err)
+ goto out_put;
+
+ mm = get_task_mm(task);
+ put_task_struct(task);
+
+ if (!mm) {
+ err = -EINVAL;
goto out;
+ }
err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+
+ mmput(mm);
out:
- if (mm)
- mmput(mm);
NODEMASK_SCRATCH_FREE(scratch);
return err;
+
+out_put:
+ put_task_struct(task);
+ goto out;
+
}
err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
if (!err && nmask) {
- err = copy_from_user(bm, nm, alloc_size);
+ unsigned long copy_size;
+ copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
+ err = copy_from_user(bm, nm, copy_size);
/* ensure entire bitmap is zeroed */
err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
err |= compat_put_bitmap(nmask, bm, nr_bits);
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
-static struct mempolicy *get_vma_policy(struct task_struct *task,
+struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
return interleave_nodes(pol);
}
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns -1 if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+ int w, bit = -1;
+
+ w = nodes_weight(*maskp);
+ if (w)
+ bit = bitmap_ord_to_pos(maskp->bits,
+ get_random_int() % w, MAX_NUMNODES);
+ return bit;
+}
+
#ifdef CONFIG_HUGETLBFS
/*
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node)
{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct mempolicy *pol;
struct zonelist *zl;
struct page *page;
+ unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+ pol = get_vma_policy(current, vma, addr);
+ cpuset_mems_cookie = get_mems_allowed();
- get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
zl = policy_zonelist(gfp, pol, node);
struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
}
/*
*/
page = __alloc_pages_nodemask(gfp, order, zl,
policy_nodemask(gfp, pol));
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
}
{
struct mempolicy *pol = current->mempolicy;
struct page *page;
+ unsigned int cpuset_mems_cookie;
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- put_mems_allowed();
+
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
}
/* Slow path of a mempolicy comparison */
-int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
+bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
if (!a || !b)
- return 0;
+ return false;
if (a->mode != b->mode)
- return 0;
+ return false;
if (a->flags != b->flags)
- return 0;
+ return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
- return 0;
+ return false;
switch (a->mode) {
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
- return nodes_equal(a->v.nodes, b->v.nodes);
+ return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
- return a->v.preferred_node == b->v.preferred_node &&
- a->flags == b->flags;
+ return a->v.preferred_node == b->v.preferred_node;
default:
BUG();
- return 0;
+ return false;
}
}
}
return p - buffer;
}
-
-struct numa_maps {
- unsigned long pages;
- unsigned long anon;
- unsigned long active;
- unsigned long writeback;
- unsigned long mapcount_max;
- unsigned long dirty;
- unsigned long swapcache;
- unsigned long node[MAX_NUMNODES];
-};
-
-static void gather_stats(struct page *page, void *private, int pte_dirty)
-{
- struct numa_maps *md = private;
- int count = page_mapcount(page);
-
- md->pages++;
- if (pte_dirty || PageDirty(page))
- md->dirty++;
-
- if (PageSwapCache(page))
- md->swapcache++;
-
- if (PageActive(page) || PageUnevictable(page))
- md->active++;
-
- if (PageWriteback(page))
- md->writeback++;
-
- if (PageAnon(page))
- md->anon++;
-
- if (count > md->mapcount_max)
- md->mapcount_max = count;
-
- md->node[page_to_nid(page)]++;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-static void check_huge_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct numa_maps *md)
-{
- unsigned long addr;
- struct page *page;
- struct hstate *h = hstate_vma(vma);
- unsigned long sz = huge_page_size(h);
-
- for (addr = start; addr < end; addr += sz) {
- pte_t *ptep = huge_pte_offset(vma->vm_mm,
- addr & huge_page_mask(h));
- pte_t pte;
-
- if (!ptep)
- continue;
-
- pte = *ptep;
- if (pte_none(pte))
- continue;
-
- page = pte_page(pte);
- if (!page)
- continue;
-
- gather_stats(page, md, pte_dirty(*ptep));
- }
-}
-#else
-static inline void check_huge_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct numa_maps *md)
-{
-}
-#endif
-
-/*
- * Display pages allocated per node and memory policy via /proc.
- */
-int show_numa_map(struct seq_file *m, void *v)
-{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- struct numa_maps *md;
- struct file *file = vma->vm_file;
- struct mm_struct *mm = vma->vm_mm;
- struct mempolicy *pol;
- int n;
- char buffer[50];
-
- if (!mm)
- return 0;
-
- md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
- if (!md)
- return 0;
-
- pol = get_vma_policy(priv->task, vma, vma->vm_start);
- mpol_to_str(buffer, sizeof(buffer), pol, 0);
- mpol_cond_put(pol);
-
- seq_printf(m, "%08lx %s", vma->vm_start, buffer);
-
- if (file) {
- seq_printf(m, " file=");
- seq_path(m, &file->f_path, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
- seq_printf(m, " heap");
- } else if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
- seq_printf(m, " stack");
- }
-
- if (is_vm_hugetlb_page(vma)) {
- check_huge_range(vma, vma->vm_start, vma->vm_end, md);
- seq_printf(m, " huge");
- } else {
- check_pgd_range(vma, vma->vm_start, vma->vm_end,
- &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
- }
-
- if (!md->pages)
- goto out;
-
- if (md->anon)
- seq_printf(m," anon=%lu",md->anon);
-
- if (md->dirty)
- seq_printf(m," dirty=%lu",md->dirty);
-
- if (md->pages != md->anon && md->pages != md->dirty)
- seq_printf(m, " mapped=%lu", md->pages);
-
- if (md->mapcount_max > 1)
- seq_printf(m, " mapmax=%lu", md->mapcount_max);
-
- if (md->swapcache)
- seq_printf(m," swapcache=%lu", md->swapcache);
-
- if (md->active < md->pages && !is_vm_hugetlb_page(vma))
- seq_printf(m," active=%lu", md->active);
-
- if (md->writeback)
- seq_printf(m," writeback=%lu", md->writeback);
-
- for_each_node_state(n, N_HIGH_MEMORY)
- if (md->node[n])
- seq_printf(m, " N%d=%lu", n, md->node[n]);
-out:
- seq_putc(m, '\n');
- kfree(md);
-
- if (m->count < m->size)
- m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
- return 0;
-}