/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
 */

/*
 * Locking: see "Lock ordering" summary in filemap.c.
 * In swapout, page_map_lock is held on entry to page_referenced and
 * try_to_unmap, so they trylock for i_mmap_lock and page_table_lock.
 */

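/*
 * page_map_lock()/page_map_unlock() used below are the bit-spinlock
 * helpers on the page's PG_maplock flag (see include/linux/rmap.h in
 * this tree); they serialize page->mapcount and the PG_anon flag.
 */
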
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>

#include <asm/tlbflush.h>

//#define RMAP_DEBUG /* can be enabled only for debugging */

kmem_cache_t *anon_vma_cachep;

static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
#ifdef RMAP_DEBUG
	struct anon_vma *anon_vma = find_vma->anon_vma;
	struct vm_area_struct *vma;
	unsigned int mapcount = 0;
	int found = 0;

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		mapcount++;
		BUG_ON(mapcount > 100000);
		if (vma == find_vma)
			found = 1;
	}
	BUG_ON(!found);
#endif
}

/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	might_sleep();
	if (unlikely(!anon_vma)) {
		struct mm_struct *mm = vma->vm_mm;
		struct anon_vma *allocated = NULL;

		anon_vma = find_mergeable_anon_vma(vma);
		if (!anon_vma) {
			anon_vma = anon_vma_alloc();
			if (unlikely(!anon_vma))
				return -ENOMEM;
			allocated = anon_vma;
		}

		/* page_table_lock to protect against threads */
		spin_lock(&mm->page_table_lock);
		if (likely(!vma->anon_vma)) {
			if (!allocated)
				spin_lock(&anon_vma->lock);
			vma->anon_vma = anon_vma;
			list_add(&vma->anon_vma_node, &anon_vma->head);
			if (!allocated)
				spin_unlock(&anon_vma->lock);
			allocated = NULL;
		}
		spin_unlock(&mm->page_table_lock);

		if (unlikely(allocated))
			anon_vma_free(allocated);
	}
	return 0;
}

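/*
 * Caller pattern, as a sketch only (compare do_anonymous_page() in
 * mm/memory.c): the anon_vma must be prepared under mmap_sem before
 * the pte is set and the reverse mapping added:
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	...
 *	page_add_anon_rmap(page, vma, address);
 */
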
void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
{
	if (!vma->anon_vma) {
		BUG_ON(!next->anon_vma);
		vma->anon_vma = next->anon_vma;
		list_add(&vma->anon_vma_node, &next->anon_vma_node);
	} else {
		/* if they're both non-null they must be the same */
		BUG_ON(vma->anon_vma != next->anon_vma);
	}
	list_del(&next->anon_vma_node);
}

void __anon_vma_link(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	if (anon_vma) {
		list_add(&vma->anon_vma_node, &anon_vma->head);
		validate_anon_vma(vma);
	}
}

void anon_vma_link(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	if (anon_vma) {
		spin_lock(&anon_vma->lock);
		list_add(&vma->anon_vma_node, &anon_vma->head);
		validate_anon_vma(vma);
		spin_unlock(&anon_vma->lock);
	}
}

void anon_vma_unlink(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int empty;

	if (!anon_vma)
		return;

	spin_lock(&anon_vma->lock);
	validate_anon_vma(vma);
	list_del(&vma->anon_vma_node);

	/* We must garbage collect the anon_vma if it's empty */
	empty = list_empty(&anon_vma->head);
	spin_unlock(&anon_vma->lock);

	if (empty)
		anon_vma_free(anon_vma);
}

static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
{
	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
						SLAB_CTOR_CONSTRUCTOR) {
		struct anon_vma *anon_vma = data;

		spin_lock_init(&anon_vma->lock);
		INIT_LIST_HEAD(&anon_vma->head);
	}
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_PANIC, anon_vma_ctor, NULL);
}

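/*
 * The slab constructor above runs when a slab of anon_vmas is first
 * populated, not on every allocation, so every object anon_vma_alloc()
 * hands out already has its spinlock and list head initialized.
 */
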
/* this needs the page->flags PG_maplock held */
static inline void clear_page_anon(struct page *page)
{
	BUG_ON(!page->mapping);
	page->mapping = NULL;
	ClearPageAnon(page);
}

/*
 * At what user virtual address is page expected in vma?
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		/* page should be within any vma from prio_tree_next */
		BUG_ON(!PageAnon(page));
		return -EFAULT;
	}
	return address;
}

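/*
 * Worked example for vma_address() above, with made-up numbers and 4K
 * pages (so PAGE_CACHE_SHIFT == PAGE_SHIFT): a vma with vm_start
 * 0x40010000 and vm_pgoff 0x10 maps file page 0x10 at its start, so a
 * page with index 0x13 is expected at
 * 0x40010000 + ((0x13 - 0x10) << 12), i.e. 0x40013000.
 */
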
/*
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 */
static int page_referenced_one(struct page *page,
	struct vm_area_struct *vma, unsigned int *mapcount)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	int referenced = 0;

	if (!mm->rss)
		goto out;
	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	if (!spin_trylock(&mm->page_table_lock))
		goto out;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out_unlock;

	pmd = pmd_offset(pgd, address);
	if (!pmd_present(*pmd))
		goto out_unlock;

	pte = pte_offset_map(pmd, address);
	if (!pte_present(*pte))
		goto out_unmap;

	if (page_to_pfn(page) != pte_pfn(*pte))
		goto out_unmap;

	if (ptep_clear_flush_young(vma, address, pte))
		referenced++;

	(*mapcount)--;

out_unmap:
	pte_unmap(pte);
out_unlock:
	spin_unlock(&mm->page_table_lock);
out:
	return referenced;
}

static inline int page_referenced_anon(struct page *page)
{
	unsigned int mapcount = page->mapcount;
	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
	struct vm_area_struct *vma;
	int referenced = 0;

	spin_lock(&anon_vma->lock);
	BUG_ON(list_empty(&anon_vma->head));
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		referenced += page_referenced_one(page, vma, &mapcount);
		if (!mapcount)
			break;
	}
	spin_unlock(&anon_vma->lock);
	return referenced;
}

/**
 * page_referenced_file - referenced check for object-based rmap
 * @page: the page we're checking references on.
 *
 * For an object-based mapped page, find all the places it is mapped and
 * check/clear the referenced flag. This is done by following the page->mapping
 * pointer, then walking the chain of vmas it holds. It returns the number
 * of references it found.
 *
 * This function is only called from page_referenced for object-based pages.
 *
 * The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
 * assume a reference count of 0, so try_to_unmap will then have a go.
 */
static inline int page_referenced_file(struct page *page)
{
	unsigned int mapcount = page->mapcount;
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma = NULL;
	struct prio_tree_iter iter;
	int referenced = 0;

	if (!spin_trylock(&mapping->i_mmap_lock))
		return 0;

	while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
					&iter, pgoff, pgoff)) != NULL) {
		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
				  == (VM_LOCKED|VM_MAYSHARE)) {
			referenced++;
			break;
		}
		referenced += page_referenced_one(page, vma, &mapcount);
		if (!mapcount)
			break;
	}

	spin_unlock(&mapping->i_mmap_lock);
	return referenced;
}

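/*
 * Note on the VM_LOCKED|VM_MAYSHARE test above: a page mapped by an
 * mlocked shared mapping cannot usefully be aged out, so it is simply
 * counted as referenced and the walk stops, keeping the page active.
 */
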
/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 * Caller needs to hold the rmap lock.
 */
int page_referenced(struct page *page)
{
	int referenced = 0;

	if (page_test_and_clear_young(page))
		referenced++;

	if (TestClearPageReferenced(page))
		referenced++;

	if (page->mapcount && page->mapping) {
		if (PageAnon(page))
			referenced += page_referenced_anon(page);
		else
			referenced += page_referenced_file(page);
	}
	return referenced;
}

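/*
 * page_referenced() is consumed by the LRU aging code in mm/vmscan.c;
 * broadly, a nonzero return keeps a mapped page on the active list.
 */
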
/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 *
 * The caller needs to hold the mm->page_table_lock.
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	pgoff_t index;

	BUG_ON(PageReserved(page));
	BUG_ON(!anon_vma);

	index = (address - vma->vm_start) >> PAGE_SHIFT;
	index += vma->vm_pgoff;
	index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;

	/*
	 * Setting and clearing PG_anon must always happen inside
	 * page_map_lock to avoid races between mapping and
	 * unmapping on different processes of the same
	 * shared cow swapcache page. And while we take the
	 * page_map_lock PG_anon cannot change from under us.
	 * Actually PG_anon cannot change under fork either
	 * since fork holds a reference on the page so it cannot
	 * be unmapped under fork and in turn copy_page_range is
	 * allowed to read PG_anon outside the page_map_lock.
	 */
	page_map_lock(page);
	if (!page->mapcount) {
		BUG_ON(PageAnon(page));
		BUG_ON(page->mapping);
		SetPageAnon(page);
		page->index = index;
		page->mapping = (struct address_space *) anon_vma;
		inc_page_state(nr_mapped);
	} else {
		BUG_ON(!PageAnon(page));
		BUG_ON(page->index != index);
		BUG_ON(page->mapping != (struct address_space *) anon_vma);
	}
	page->mapcount++;
	page_map_unlock(page);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 *
 * The caller needs to hold the mm->page_table_lock.
 */
void page_add_file_rmap(struct page *page)
{
	BUG_ON(PageAnon(page));
	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
		return;

	page_map_lock(page);
	if (!page->mapcount)
		inc_page_state(nr_mapped);
	page->mapcount++;
	page_map_unlock(page);
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 *
 * Caller needs to hold the mm->page_table_lock.
 */
void page_remove_rmap(struct page *page)
{
	BUG_ON(PageReserved(page));
	BUG_ON(!page->mapcount);

	page_map_lock(page);
	page->mapcount--;
	if (!page->mapcount) {
		if (page_test_and_clear_dirty(page))
			set_page_dirty(page);
		if (PageAnon(page))
			clear_page_anon(page);
		dec_page_state(nr_mapped);
	}
	page_map_unlock(page);
}

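/*
 * page_test_and_clear_dirty() above reads the s390 storage-key dirty
 * bit; on architectures without such a key it is a no-op returning 0,
 * so dirty state normally travels via the pte (see try_to_unmap_one).
 */
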
/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	pte_t pteval;
	int ret = SWAP_AGAIN;

	if (!mm->rss)
		goto out;
	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	/*
	 * We need the page_table_lock to protect us from page faults,
	 * munmap, fork, etc...
	 */
	if (!spin_trylock(&mm->page_table_lock))
		goto out;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out_unlock;

	pmd = pmd_offset(pgd, address);
	if (!pmd_present(*pmd))
		goto out_unlock;

	pte = pte_offset_map(pmd, address);
	if (!pte_present(*pte))
		goto out_unmap;

	if (page_to_pfn(page) != pte_pfn(*pte))
		goto out_unmap;

	/*
	 * If the page is mlock()d, we cannot swap it out.
	 * If it's recently referenced (perhaps page_referenced
	 * skipped over this mm) then we should reactivate it.
	 */
	if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
			ptep_clear_flush_young(vma, address, pte)) {
		ret = SWAP_FAIL;
		goto out_unmap;
	}

	/*
	 * Don't pull an anonymous page out from under get_user_pages.
	 * GUP carefully breaks COW and raises page count (while holding
	 * page_table_lock, as we have here) to make sure that the page
	 * cannot be freed. If we unmap that page here, a user write
	 * access to the virtual address will bring back the page, but
	 * its raised count will (ironically) be taken to mean it's not
	 * an exclusive swap page, do_wp_page will replace it by a copy
	 * page, and the user never gets to see the data GUP was holding
	 * the original page for.
	 *
	 * This test is also useful for when swapoff (unuse_process) has
	 * to drop page lock: its reference to the page stops existing
	 * ptes from being unmapped, so swapoff can make progress.
	 */
	if (PageSwapCache(page) &&
	    page_count(page) != page->mapcount + 2) {
		ret = SWAP_FAIL;
		goto out_unmap;
	}

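	/*
	 * That is: each pte mapping contributes one page reference
	 * (page->mapcount of them), the swap cache holds one, and our
	 * caller holds one; any count beyond page->mapcount + 2 means
	 * someone such as get_user_pages or swapoff still holds the page.
	 */
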
	/* Nuke the page table entry. */
	flush_cache_page(vma, address);
	pteval = ptep_clear_flush(vma, address, pte);

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pteval))
		set_page_dirty(page);

	if (PageAnon(page)) {
		swp_entry_t entry = { .val = page->private };
		/*
		 * Store the swap location in the pte.
		 * See handle_pte_fault() ...
		 */
		BUG_ON(!PageSwapCache(page));
		swap_duplicate(entry);
		set_pte(pte, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*pte));
	}

	mm->rss--;
	BUG_ON(!page->mapcount);
	page->mapcount--;
	page_cache_release(page);

out_unmap:
	pte_unmap(pte);
out_unlock:
	spin_unlock(&mm->page_table_lock);
out:
	return ret;
}

/*
 * objrmap doesn't work for nonlinear VMAs because the assumption that
 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 * Consequently, given a particular page and its ->index, we cannot locate the
 * ptes which are mapping that page without an exhaustive linear search.
 *
 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 * maps the file to which the target page belongs. The ->vm_private_data field
 * holds the current cursor into that scan. Successive searches will circulate
 * around the vma's virtual address space.
 *
 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 * more scanning pressure is placed against them as well. Eventually pages
 * will become fully unmapped and are eligible for eviction.
 *
 * For very sparsely populated VMAs this is a little inefficient - chances are
 * there won't be many ptes located within the scan cluster. In this case
 * maybe we could scan further - to the end of the pte page, perhaps.
 */
#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))

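/*
 * With 4K pages a cluster is 32*4K = 128K of virtual space; the min()
 * against PMD_SIZE (e.g. 4M on i386 without PAE, 2M with PAE) caps the
 * cluster so it never spans more than one pte page, which is why
 * try_to_unmap_cluster() below can walk a single pmd entry.
 */
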
static int try_to_unmap_cluster(unsigned long cursor,
	unsigned int *mapcount, struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	pte_t pteval;
	struct page *page;
	unsigned long address;
	unsigned long end;
	unsigned long pfn;

	/*
	 * We need the page_table_lock to protect us from page faults,
	 * munmap, fork, etc...
	 */
	if (!spin_trylock(&mm->page_table_lock))
		return SWAP_FAIL;

	address = (vma->vm_start + cursor) & CLUSTER_MASK;
	end = address + CLUSTER_SIZE;
	if (address < vma->vm_start)
		address = vma->vm_start;
	if (end > vma->vm_end)
		end = vma->vm_end;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out_unlock;

	pmd = pmd_offset(pgd, address);
	if (!pmd_present(*pmd))
		goto out_unlock;

	for (pte = pte_offset_map(pmd, address);
			address < end; pte++, address += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;

		pfn = pte_pfn(*pte);
		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		BUG_ON(PageAnon(page));
		if (PageReserved(page))
			continue;

		if (ptep_clear_flush_young(vma, address, pte))
			continue;

		/* Nuke the page table entry. */
		flush_cache_page(vma, address);
		pteval = ptep_clear_flush(vma, address, pte);

		/* If nonlinear, store the file page offset in the pte. */
		if (page->index != linear_page_index(vma, address))
			set_pte(pte, pgoff_to_pte(page->index));

		/* Move the dirty bit to the physical page now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		page_remove_rmap(page);
		page_cache_release(page);
		mm->rss--;
		(*mapcount)--;
	}

	pte_unmap(pte);

out_unlock:
	spin_unlock(&mm->page_table_lock);
	return SWAP_AGAIN;
}

static inline int try_to_unmap_anon(struct page *page)
{
	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
	struct vm_area_struct *vma;
	int ret = SWAP_AGAIN;

	spin_lock(&anon_vma->lock);
	BUG_ON(list_empty(&anon_vma->head));
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		ret = try_to_unmap_one(page, vma);
		if (ret == SWAP_FAIL || !page->mapcount)
			break;
	}
	spin_unlock(&anon_vma->lock);
	return ret;
}

/**
 * try_to_unmap_file - unmap file page using the object-based rmap method
 * @page: the page to unmap
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * This function is only called from try_to_unmap for object-based pages.
 *
 * The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
 * return a temporary error.
 */
static inline int try_to_unmap_file(struct page *page)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma = NULL;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;
	unsigned long cursor;
	unsigned long max_nl_cursor = 0;
	unsigned long max_nl_size = 0;
	unsigned int mapcount;

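	/*
	 * max_nl_cursor and max_nl_size track, across all nonlinear vmas
	 * on this mapping, the furthest-advanced scan cursor and the
	 * largest vma size, so the do-loop below can sweep every
	 * nonlinear vma in CLUSTER_SIZE steps until each has been
	 * scanned to the size of the largest.
	 */
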
	if (!spin_trylock(&mapping->i_mmap_lock))
		return ret;

	while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
					&iter, pgoff, pgoff)) != NULL) {
		ret = try_to_unmap_one(page, vma);
		if (ret == SWAP_FAIL || !page->mapcount)
			goto out;
	}

	if (list_empty(&mapping->i_mmap_nonlinear))
		goto out;

	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
		if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
			continue;
		cursor = (unsigned long) vma->vm_private_data;
		if (cursor > max_nl_cursor)
			max_nl_cursor = cursor;
		cursor = vma->vm_end - vma->vm_start;
		if (cursor > max_nl_size)
			max_nl_size = cursor;
	}

	if (max_nl_size == 0)	/* any nonlinears locked or reserved */
		goto out;

	/*
	 * We don't try to search for this page in the nonlinear vmas,
	 * and page_referenced wouldn't have found it anyway. Instead
	 * just walk the nonlinear vmas trying to age and unmap some.
	 * The mapcount of the page we came in with is irrelevant,
	 * but even so use it as a guide to how hard we should try?
	 */
	mapcount = page->mapcount;
	page_map_unlock(page);
	cond_resched_lock(&mapping->i_mmap_lock);

	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
	if (max_nl_cursor == 0)
		max_nl_cursor = CLUSTER_SIZE;

	do {
		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
			if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
				continue;
			cursor = (unsigned long) vma->vm_private_data;
			while (vma->vm_mm->rss &&
				cursor < max_nl_cursor &&
				cursor < vma->vm_end - vma->vm_start) {
				ret = try_to_unmap_cluster(
						cursor, &mapcount, vma);
				if (ret == SWAP_FAIL)
					break;
				cursor += CLUSTER_SIZE;
				vma->vm_private_data = (void *) cursor;
				if ((int)mapcount <= 0)
					goto relock;
			}
			if (ret != SWAP_FAIL)
				vma->vm_private_data =
					(void *) max_nl_cursor;
			ret = SWAP_AGAIN;
		}
		cond_resched_lock(&mapping->i_mmap_lock);
		max_nl_cursor += CLUSTER_SIZE;
	} while (max_nl_cursor <= max_nl_size);

	/*
	 * Don't loop forever (perhaps all the remaining pages are
	 * in locked vmas). Reset cursor on all unreserved nonlinear
	 * vmas, now forgetting on which ones it had fallen behind.
	 */
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
		if (!(vma->vm_flags & VM_RESERVED))
			vma->vm_private_data = NULL;
	}
relock:
	page_map_lock(page);
out:
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock
 * and its rmap lock. Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a trylock, try again later
 * SWAP_FAIL	- the page is unswappable
 */
int try_to_unmap(struct page *page)
{
	int ret;

	BUG_ON(PageReserved(page));
	BUG_ON(!PageLocked(page));
	BUG_ON(!page->mapcount);

	if (PageAnon(page))
		ret = try_to_unmap_anon(page);
	else
		ret = try_to_unmap_file(page);

	if (!page->mapcount) {
		if (page_test_and_clear_dirty(page))
			set_page_dirty(page);
		if (PageAnon(page))
			clear_page_anon(page);
		dec_page_state(nr_mapped);
		ret = SWAP_SUCCESS;
	}
	return ret;
}
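
/*
 * Usage sketch, in the spirit of (not copied from) shrink_list() in
 * mm/vmscan.c, showing how the pageout path consumes these codes:
 *
 *	switch (try_to_unmap(page)) {
 *	case SWAP_FAIL:
 *		goto activate_locked;	// unswappable: back to active list
 *	case SWAP_AGAIN:
 *		goto keep_locked;	// missed a trylock: retry later
 *	case SWAP_SUCCESS:
 *		break;			// all ptes gone: page can be freed
 *	}
 */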