*/
#include <linux/migrate.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include "internal.h"
-#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
-
/*
* migrate_prep() needs to be called before we start compiling a list of pages
* to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
ptep = pte_offset_map(pmd, addr);
- if (!is_swap_pte(*ptep)) {
- pte_unmap(ptep);
- goto out;
- }
+ /*
+ * Peek to check is_swap_pte() before taking ptlock? No, we
+ * can race mremap's move_ptes(), which skips anon_vma lock.
+ */
ptl = pte_lockptr(mm, pmd);
}
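/*
 * [Editor's note, not part of this patch] The unlocked is_swap_pte()
 * peek removed above was racy: mremap's move_ptes() can relocate a
 * pte while holding only the pte lock (it skips the anon_vma lock),
 * so a migration entry may only be examined after ptl is taken below.
 */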
* Something used the pte of a page under migration. We need to
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
- *
- * This function is called from do_swap_page().
*/
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address)
pte_unmap_unlock(ptep, ptl);
}
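/*
 * [Editor's sketch, not part of this patch] The enum migrate_mode
 * threaded through this file is introduced elsewhere in this series
 * (include/linux/migrate_mode.h); it is expected to look like this:
 */
enum migrate_mode {
	MIGRATE_ASYNC,		/* must never block */
	MIGRATE_SYNC_LIGHT,	/* may block, but not on page writeback */
	MIGRATE_SYNC,		/* may block on anything */
};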
+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ struct buffer_head *bh = head;
+
+ /* Simple case, sync compaction */
+ if (mode != MIGRATE_ASYNC) {
+ do {
+ get_bh(bh);
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return true;
+ }
+
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
+ do {
+ get_bh(bh);
+ if (!trylock_buffer(bh)) {
+ /*
+ * We failed to lock the buffer and cannot stall in
+ * async migration. Release the taken locks
+ */
+ struct buffer_head *failed_bh = bh;
+ put_bh(failed_bh);
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ put_bh(bh);
+ bh = bh->b_this_page;
+ }
+ return false;
+ }
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ return true;
+}
+#endif /* CONFIG_BLOCK */
+
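/*
 * [Editor's sketch, not part of this patch] Undoing a successful
 * buffer_migrate_lock_buffers() is the same circular walk along
 * bh->b_this_page seen in the failure path above. A hypothetical
 * helper would look like:
 */
static void sketch_unlock_page_buffers(struct buffer_head *head)
{
	struct buffer_head *bh = head;

	do {
		unlock_buffer(bh);	/* drop the buffer lock */
		put_bh(bh);		/* drop the reference from get_bh() */
		bh = bh->b_this_page;
	} while (bh != head);
}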
/*
* Replace the page in the mapping.
*
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
static int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ struct buffer_head *head, enum migrate_mode mode)
{
int expected_count;
void **pslot;
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- (struct page *)radix_tree_deref_slot(pslot) != page) {
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
}
/*
+ * In the async migration case of moving a page with buffers, lock the
+ * buffers using trylock before the mapping is moved. If the mapping
+ * was moved and we then failed to lock the buffers, we could not
+ * move the mapping back due to an elevated page count and would
+ * have to block waiting on other references to be dropped.
+ */
+ if (mode == MIGRATE_ASYNC && head &&
+ !buffer_migrate_lock_buffers(head, mode)) {
+ page_unfreeze_refs(page, expected_count);
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ /*
* Now we know that no one else is looking at the page.
*/
get_page(newpage); /* add cache reference */
radix_tree_replace_slot(pslot, newpage);
- page_unfreeze_refs(page, expected_count);
/*
- * Drop cache reference from old page.
+ * Drop cache reference from old page by unfreezing
+ * to one less reference.
* We know this isn't the last reference.
*/
- __put_page(page);
+ page_unfreeze_refs(page, expected_count - 1);
/*
* If moved to a different zone then also account
*/
__dec_zone_page_state(page, NR_FILE_PAGES);
__inc_zone_page_state(newpage, NR_FILE_PAGES);
- if (PageSwapBacked(page)) {
+ if (!PageSwapCache(page) && PageSwapBacked(page)) {
__dec_zone_page_state(page, NR_SHMEM);
__inc_zone_page_state(newpage, NR_SHMEM);
}
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- (struct page *)radix_tree_deref_slot(pslot) != page) {
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
radix_tree_replace_slot(pslot, newpage);
- page_unfreeze_refs(page, expected_count);
-
- __put_page(page);
+ page_unfreeze_refs(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
return 0;
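/*
 * [Editor's note, not part of this patch] Why "expected_count - 1"
 * can replace the old page_unfreeze_refs()/__put_page() pair:
 * unfreezing just sets the refcount of a frozen page directly, so
 * restoring one reference fewer drops the cache reference atomically
 * with the unfreeze. From include/linux/pagemap.h, roughly:
 *
 *	static inline void page_unfreeze_refs(struct page *page, int count)
 *	{
 *		VM_BUG_ON(page_count(page) != 0);
 *		atomic_set(&page->_count, count);
 *	}
 */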
* redo the accounting that clear_page_dirty_for_io undid,
* but we can't use set_page_dirty because that function
* is actually a signal that all of the page has become dirty.
- * Wheras only part of our page may be dirty.
+ * Whereas only part of our page may be dirty.
*/
__set_page_dirty_nobuffers(newpage);
}
ClearPageSwapCache(page);
ClearPagePrivate(page);
set_page_private(page, 0);
- page->mapping = NULL;
/*
* If any waiters have accumulated on the new page then
* Pages are locked upon entry and exit.
*/
int migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
int rc;
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
if (rc)
return rc;
* exist.
*/
int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
struct buffer_head *bh, *head;
int rc;
if (!page_has_buffers(page))
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
head = page_buffers(page);
- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
if (rc)
return rc;
- bh = head;
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
+ /*
+ * In the async case, migrate_page_move_mapping locked the buffers
+ * with an IRQ-safe spinlock held. In the sync case, the buffers
+ * need to be locked now.
+ */
+ if (mode != MIGRATE_ASYNC)
+ BUG_ON(!buffer_migrate_lock_buffers(head, mode));
ClearPagePrivate(page);
set_page_private(newpage, page_private(page));
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
- if (PageDirty(page))
+ if (PageDirty(page)) {
+ /* Only writeback pages in full synchronous migration */
+ if (mode != MIGRATE_SYNC)
+ return -EBUSY;
return writeout(mapping, page);
+ }
/*
* Buffers may be managed in a filesystem specific way.
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
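/*
 * [Editor's note, not part of this patch] Returning -EBUSY instead of
 * -EAGAIN matters to the caller: migrate_pages() retries -EAGAIN pages
 * on its next pass, while other errors mark the page as failed for
 * this attempt, so dirty pages are skipped outright when the mode
 * forbids writeback rather than being retried pointlessly.
 */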
/*
* == 0 - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int remap_swapcache)
+ int remap_swapcache, enum migrate_mode mode)
{
struct address_space *mapping;
int rc;
mapping = page_mapping(page);
if (!mapping)
- rc = migrate_page(mapping, newpage, page);
+ rc = migrate_page(mapping, newpage, page, mode);
else if (mapping->a_ops->migratepage)
/*
- * Most pages have a mapping and most filesystems
- * should provide a migration function. Anonymous
- * pages are part of swap space which also has its
- * own migration function. This is the most common
- * path for page migration.
+ * Most pages have a mapping and most filesystems provide a
+ * migratepage callback. Anonymous pages are part of swap
+ * space which also has its own migratepage callback. This
+ * is the most common path for page migration.
*/
rc = mapping->a_ops->migratepage(mapping,
- newpage, page);
+ newpage, page, mode);
else
- rc = fallback_migrate_page(mapping, newpage, page);
+ rc = fallback_migrate_page(mapping, newpage, page, mode);
if (rc) {
newpage->mapping = NULL;
} else {
if (remap_swapcache)
remove_migration_ptes(page, newpage);
+ page->mapping = NULL;
}
unlock_page(newpage);
return rc;
}
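/*
 * [Editor's note, not part of this patch] Every ->migratepage
 * implementation gains the mode argument in this series; in struct
 * address_space_operations (include/linux/fs.h) the member is
 * expected to become:
 *
 *	int (*migratepage)(struct address_space *,
 *			struct page *, struct page *, enum migrate_mode);
 */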
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, bool offlining, bool sync)
+static int __unmap_and_move(struct page *page, struct page *newpage,
+ int force, bool offlining, enum migrate_mode mode)
{
- int rc = 0;
- int *result = NULL;
- struct page *newpage = get_new_page(page, private, &result);
+ int rc = -EAGAIN;
int remap_swapcache = 1;
- int rcu_locked = 0;
int charge = 0;
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
- if (!newpage)
- return -ENOMEM;
-
- if (page_count(page) == 1) {
- /* page was freed from under us. So we are done. */
- goto move_newpage;
- }
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
- goto move_newpage;
-
- /* prepare cgroup just returns 0 or -ENOMEM */
- rc = -EAGAIN;
-
if (!trylock_page(page)) {
- if (!force)
- goto move_newpage;
+ if (!force || mode == MIGRATE_ASYNC)
+ goto out;
/*
* It's not safe for direct compaction to call lock_page.
* altogether.
*/
if (current->flags & PF_MEMALLOC)
- goto move_newpage;
+ goto out;
lock_page(page);
}
}
/* charge against new page */
- charge = mem_cgroup_prepare_migration(page, newpage, &mem);
+ charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
if (charge == -ENOMEM) {
rc = -ENOMEM;
goto unlock;
BUG_ON(charge);
if (PageWriteback(page)) {
- if (!force || !sync)
+ /*
+ * Only in the case of a full synchronous migration is it
+ * necessary to wait for PageWriteback. In the async case,
+ * the retry loop is too short and in the sync-light case,
+ * the overhead of stalling is too much.
+ */
+ if (mode != MIGRATE_SYNC) {
+ rc = -EBUSY;
+ goto uncharge;
+ }
+ if (!force)
goto uncharge;
wait_on_page_writeback(page);
}
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
* we cannot notice that anon_vma is freed while we migrates a page.
- * This rcu_read_lock() delays freeing anon_vma pointer until the end
+ * This get_anon_vma() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*/
if (PageAnon(page)) {
- rcu_read_lock();
- rcu_locked = 1;
-
- /* Determine how to safely use anon_vma */
- if (!page_mapped(page)) {
- if (!PageSwapCache(page))
- goto rcu_unlock;
-
+ /*
+ * Only page_lock_anon_vma() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ */
+ anon_vma = page_get_anon_vma(page);
+ if (anon_vma) {
+ /*
+ * Anon page
+ */
+ } else if (PageSwapCache(page)) {
/*
* We cannot be sure that the anon_vma of an unmapped
* swapcache page is safe to use because we don't
*/
remap_swapcache = 0;
} else {
- /*
- * Take a reference count on the anon_vma if the
- * page is mapped so that it is guaranteed to
- * exist when the page is remapped later
- */
- anon_vma = page_anon_vma(page);
- get_anon_vma(anon_vma);
+ goto uncharge;
}
}
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- if (!PageAnon(page) && page_has_private(page)) {
- /*
- * Go direct to try_to_free_buffers() here because
- * a) that's what try_to_release_page() would do anyway
- * b) we may be under rcu_read_lock() here, so we can't
- * use GFP_KERNEL which is what try_to_release_page()
- * needs to be effective.
- */
+ VM_BUG_ON(PageAnon(page));
+ if (page_has_private(page)) {
try_to_free_buffers(page);
- goto rcu_unlock;
+ goto uncharge;
}
goto skip_unmap;
}
skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, remap_swapcache);
+ rc = move_to_new_page(newpage, page, remap_swapcache, mode);
if (rc && remap_swapcache)
remove_migration_ptes(page, page);
-rcu_unlock:
/* Drop an anon_vma reference if we took one */
if (anon_vma)
- drop_anon_vma(anon_vma);
+ put_anon_vma(anon_vma);
- if (rcu_locked)
- rcu_read_unlock();
uncharge:
if (!charge)
- mem_cgroup_end_migration(mem, page, newpage);
+ mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
+out:
+ return rc;
+}
+
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+ struct page *page, int force, bool offlining,
+ enum migrate_mode mode)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *newpage = get_new_page(page, private, &result);
+
+ if (!newpage)
+ return -ENOMEM;
+
+ if (page_count(page) == 1) {
+ /* page was freed from under us. So we are done. */
+ goto out;
+ }
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto out;
+
+ rc = __unmap_and_move(page, newpage, force, offlining, mode);
+out:
if (rc != -EAGAIN) {
- /*
- * A page that has been migrated has all references
- * removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
- */
- list_del(&page->lru);
+ /*
+ * A page that has been migrated has all references
+ * removed and will be freed. A page that has not been
+ * migrated will have kept its references and be
+ * restored.
+ */
+ list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
}
-
-move_newpage:
-
/*
* Move the new page to the LRU. If migration was not successful
* then this will free the page.
*/
putback_lru_page(newpage);
-
if (result) {
if (rc)
*result = rc;
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
unsigned long private, struct page *hpage,
- int force, bool offlining, bool sync)
+ int force, bool offlining,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
struct page *new_hpage = get_new_page(hpage, private, &result);
- int rcu_locked = 0;
struct anon_vma *anon_vma = NULL;
if (!new_hpage)
rc = -EAGAIN;
if (!trylock_page(hpage)) {
- if (!force || !sync)
+ if (!force || mode != MIGRATE_SYNC)
goto out;
lock_page(hpage);
}
- if (PageAnon(hpage)) {
- rcu_read_lock();
- rcu_locked = 1;
-
- if (page_mapped(hpage)) {
- anon_vma = page_anon_vma(hpage);
- atomic_inc(&anon_vma->external_refcount);
- }
- }
+ if (PageAnon(hpage))
+ anon_vma = page_get_anon_vma(hpage);
try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, 1);
+ rc = move_to_new_page(new_hpage, hpage, 1, mode);
if (rc)
remove_migration_ptes(hpage, hpage);
- if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
- &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
- if (empty)
- anon_vma_free(anon_vma);
- }
-
- if (rcu_locked)
- rcu_read_unlock();
-out:
+ if (anon_vma)
+ put_anon_vma(anon_vma);
unlock_page(hpage);
+out:
if (rc != -EAGAIN) {
list_del(&hpage->lru);
put_page(hpage);
* are movable anymore because to has become empty
* or no retryable pages exist anymore.
* Caller should call putback_lru_pages to return pages to the LRU
- * or free list.
+ * or free list only if ret != 0.
*
* Return: Number of pages not migrated or error code.
*/
int migrate_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- bool sync)
+ enum migrate_mode mode)
{
int retry = 1;
int nr_failed = 0;
rc = unmap_and_move(get_new_page, private,
page, pass > 2, offlining,
- sync);
+ mode);
switch(rc) {
case -ENOMEM:
int migrate_huge_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- bool sync)
+ enum migrate_mode mode)
{
int retry = 1;
int nr_failed = 0;
rc = unmap_and_move_huge_page(get_new_page,
private, page, pass > 2, offlining,
- sync);
+ mode);
switch(rc) {
case -ENOMEM:
}
rc = 0;
out:
-
- list_for_each_entry_safe(page, page2, from, lru)
- put_page(page);
-
if (rc)
return rc;
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0, true);
+ (unsigned long)pm, 0, MIGRATE_SYNC);
if (err)
putback_lru_pages(&pagelist);
}
* Migrate an array of page address onto an array of nodes and fill
* the corresponding array of status.
*/
-static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
+static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
unsigned long nr_pages,
const void __user * __user *pages,
const int __user *nodes,
int __user *status, int flags)
{
struct page_to_node *pm;
- nodemask_t task_nodes;
unsigned long chunk_nr_pages;
unsigned long chunk_start;
int err;
- task_nodes = cpuset_mems_allowed(task);
-
err = -ENOMEM;
pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
if (!pm)
struct task_struct *task;
struct mm_struct *mm;
int err;
+ nodemask_t task_nodes;
/* Check flags */
if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
return -EPERM;
/* Find the mm_struct */
- read_lock(&tasklist_lock);
+ rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return -ESRCH;
}
- mm = get_task_mm(task);
- read_unlock(&tasklist_lock);
-
- if (!mm)
- return -EINVAL;
+ get_task_struct(task);
/*
* Check if this process has the right to modify the specified
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- rcu_read_lock();
tcred = __task_cred(task);
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
cred->uid != tcred->suid && cred->uid != tcred->uid &&
if (err)
goto out;
- if (nodes) {
- err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
- flags);
- } else {
+ task_nodes = cpuset_mems_allowed(task);
+ mm = get_task_mm(task);
+ put_task_struct(task);
+
+ if (!mm)
+ return -EINVAL;
+
+ if (nodes)
+ err = do_pages_move(mm, task_nodes, nr_pages, pages,
+ nodes, status, flags);
+ else
err = do_pages_stat(mm, nr_pages, pages, status);
- }
-out:
mmput(mm);
return err;
+
+out:
+ put_task_struct(task);
+ return err;
}
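/*
 * [Editor's note, not part of this patch] The lookup above relies on
 * the standard RCU task-pinning pattern: find_task_by_vpid() is only
 * safe inside rcu_read_lock(), and get_task_struct() takes a real
 * reference so the task stays valid after the RCU section ends; it is
 * balanced by the put_task_struct() calls before returning.
 */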
/*