#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
* allocations.
*/
unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
(1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
struct list_head mm_head;
struct mm_slot *mm_slot;
unsigned long address;
-} khugepaged_scan = {
+};
+static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
+
+static int set_recommended_min_free_kbytes(void)
+{
+ struct zone *zone;
+ int nr_zones = 0;
+ unsigned long recommended_min;
+ extern int min_free_kbytes;
+
+ if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags) &&
+ !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags))
+ return 0;
+
+ for_each_populated_zone(zone)
+ nr_zones++;
+
+ /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+ recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+ /*
+ * Make sure that on average at least two pageblocks are almost free
+ * of another type, one for a migratetype to fall back to and a
+ * second to avoid subsequent fallbacks of other types There are 3
+ * MIGRATE_TYPES we care about.
+ */
+ recommended_min += pageblock_nr_pages * nr_zones *
+ MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+ /* don't ever allow to reserve more than 5% of the lowmem */
+ recommended_min = min(recommended_min,
+ (unsigned long) nr_free_buffer_pages() / 20);
+ recommended_min <<= (PAGE_SHIFT-10);
+
+ if (recommended_min > min_free_kbytes)
+ min_free_kbytes = recommended_min;
+ setup_per_zone_wmarks();
+ return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
+
static int start_khugepaged(void)
{
int err = 0;
mutex_unlock(&khugepaged_mutex);
if (wakeup)
wake_up_interruptible(&khugepaged_wait);
+
+ set_recommended_min_free_kbytes();
} else
/* wakeup to exit */
wake_up_interruptible(&khugepaged_wait);
ret = err;
}
+ if (ret > 0 &&
+ (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags) ||
+ test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags)))
+ set_recommended_min_free_kbytes();
+
return ret;
}
static struct kobj_attribute enabled_attr =
struct kobj_attribute *attr, char *buf,
enum transparent_hugepage_flag flag)
{
- if (test_bit(flag, &transparent_hugepage_flags))
- return sprintf(buf, "[yes] no\n");
- else
- return sprintf(buf, "yes [no]\n");
+ return sprintf(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
}
+
static ssize_t single_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag flag)
{
- if (!memcmp("yes", buf,
- min(sizeof("yes")-1, count))) {
+ unsigned long value;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &value);
+ if (ret < 0)
+ return ret;
+ if (value > 1)
+ return -EINVAL;
+
+ if (value)
set_bit(flag, &transparent_hugepage_flags);
- } else if (!memcmp("no", buf,
- min(sizeof("no")-1, count))) {
+ else
clear_bit(flag, &transparent_hugepage_flags);
- } else
- return -EINVAL;
return count;
}
int err;
#ifdef CONFIG_SYSFS
static struct kobject *hugepage_kobj;
+#endif
+ err = -EINVAL;
+ if (!has_transparent_hugepage()) {
+ transparent_hugepage_flags = 0;
+ goto out;
+ }
+
+#ifdef CONFIG_SYSFS
err = -ENOMEM;
hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
if (unlikely(!hugepage_kobj)) {
goto out;
}
+ /*
+ * By default disable transparent hugepages on smaller systems,
+ * where the extra memory used could hurt more than TLB overhead
+ * is likely to save. The admin can still enable it through /sys.
+ */
+ if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+ transparent_hugepage_flags = 0;
+
start_khugepaged();
+ set_recommended_min_free_kbytes();
+
out:
return err;
}
set_pmd_at(mm, haddr, pmd, entry);
prepare_pmd_huge_pte(pgtable, mm);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ mm->nr_ptes++;
spin_unlock(&mm->page_table_lock);
}
return ret;
}
+static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+{
+ return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+ struct vm_area_struct *vma,
+ unsigned long haddr, int nd,
+ gfp_t extra_gfp)
+{
+ return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
+ HPAGE_PMD_ORDER, vma, haddr, nd);
+}
+
+#ifndef CONFIG_NUMA
static inline struct page *alloc_hugepage(int defrag)
{
- return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
HPAGE_PMD_ORDER);
}
+#endif
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
- page = alloc_hugepage(transparent_hugepage_defrag(vma));
- if (unlikely(!page))
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
goto out;
+ }
+ count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
put_page(page);
goto out;
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
prepare_pmd_huge_pte(pgtable, dst_mm);
+ dst_mm->nr_ptes++;
ret = 0;
out_unlock:
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
- vma, address);
+ pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
+ __GFP_OTHER_NODE,
+ vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_newpage_charge(pages[i], mm,
GFP_KERNEL))) {
for (i = 0; i < HPAGE_PMD_NR; i++) {
copy_user_highpage(pages[i], page + i,
- haddr + PAGE_SHIFT*i, vma);
+ haddr + PAGE_SIZE * i, vma);
__SetPageUptodate(pages[i]);
cond_resched();
}
}
kfree(pages);
- mm->nr_ptes++;
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page);
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
- new_page = alloc_hugepage(transparent_hugepage_defrag(vma));
+ new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
else
new_page = NULL;
if (unlikely(!new_page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
put_page(page);
goto out;
}
+ count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON(!PageCompound(page));
if (flags & FOLL_GET)
- get_page(page);
+ get_page_foll(page);
out:
return page;
VM_BUG_ON(page_mapcount(page) < 0);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
+ tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
tlb_remove_page(tlb, page);
pte_free(tlb->mm, pgtable);
return ret;
}
+int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+ unsigned long old_addr,
+ unsigned long new_addr, unsigned long old_end,
+ pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ int ret = 0;
+ pmd_t pmd;
+
+ struct mm_struct *mm = vma->vm_mm;
+
+ if ((old_addr & ~HPAGE_PMD_MASK) ||
+ (new_addr & ~HPAGE_PMD_MASK) ||
+ old_end - old_addr < HPAGE_PMD_SIZE ||
+ (new_vma->vm_flags & VM_NOHUGEPAGE))
+ goto out;
+
+ /*
+ * The destination pmd shouldn't be established, free_pgtables()
+ * should have release it.
+ */
+ if (WARN_ON(!pmd_none(*new_pmd))) {
+ VM_BUG_ON(pmd_trans_huge(*new_pmd));
+ goto out;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*old_pmd))) {
+ if (pmd_trans_splitting(*old_pmd)) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, old_pmd);
+ ret = -1;
+ } else {
+ pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ VM_BUG_ON(!pmd_none(*new_pmd));
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
+ spin_unlock(&mm->page_table_lock);
+ ret = 1;
+ }
+ } else {
+ spin_unlock(&mm->page_table_lock);
+ }
+out:
+ return ret;
+}
+
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot)
{
goto out;
if (pmd_page(*pmd) != page)
goto out;
- VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
- pmd_trans_splitting(*pmd));
+ /*
+ * split_vma() may create temporary aliased mappings. There is
+ * no risk as long as all huge pmd are found and have their
+ * splitting bit set before __split_huge_page_refcount
+ * runs. Finding the same huge pmd more than once during the
+ * same rmap walk is not a problem.
+ */
+ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+ pmd_trans_splitting(*pmd))
+ goto out;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->lock to
+ * and it won't wait on the anon_vma->root->mutex to
* serialize against split_huge_page*.
*/
pmdp_splitting_flush_notify(vma, address, pmd);
int i;
unsigned long head_index = page->index;
struct zone *zone = page_zone(page);
+ int zonestat;
+ int tail_count = 0;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
for (i = 1; i < HPAGE_PMD_NR; i++) {
struct page *page_tail = page + i;
- /* tail_page->_count cannot change */
- atomic_sub(atomic_read(&page_tail->_count), &page->_count);
- BUG_ON(page_count(page) <= 0);
- atomic_add(page_mapcount(page) + 1, &page_tail->_count);
- BUG_ON(atomic_read(&page_tail->_count) <= 0);
+ /* tail_page->_mapcount cannot change */
+ BUG_ON(page_mapcount(page_tail) < 0);
+ tail_count += page_mapcount(page_tail);
+ /* check for overflow */
+ BUG_ON(tail_count < 0);
+ BUG_ON(atomic_read(&page_tail->_count) != 0);
+ /*
+ * tail_page->_count is zero and not changing from
+ * under us. But get_page_unless_zero() may be running
+ * from under us on the tail_page. If we used
+ * atomic_set() below instead of atomic_add(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is
+ * implemented in C not using locked ops. spin_unlock
+ * on x86 sometime uses locked ops because of PPro
+ * errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and
+ * not only on x86), it's safer to use atomic_add().
+ */
+ atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+ &page_tail->_count);
/* after clearing PageTail the gup refcount can be released */
smp_mb();
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ /*
+ * retain hwpoison flag of the poisoned tail page:
+ * fix for the unsuitable process killed on Guest Machine(KVM)
+ * by the memory-failure.
+ */
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
page_tail->flags |= (page->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_uptodate)));
page_tail->flags |= (1L << PG_dirty);
- /*
- * 1) clear PageTail before overwriting first_page
- * 2) clear PageTail before clearing PageHead for VM_BUG_ON
- */
+ /* clear PageTail before overwriting first_page */
smp_wmb();
/*
* status is achieved setting a reserved bit in the
* pmd, not by clearing the present bit.
*/
- BUG_ON(page_mapcount(page_tail));
page_tail->_mapcount = page->_mapcount;
BUG_ON(page_tail->mapping);
BUG_ON(!PageDirty(page_tail));
BUG_ON(!PageSwapBacked(page_tail));
+ mem_cgroup_split_huge_fixup(page, page_tail);
+
lru_add_page_tail(zone, page, page_tail);
}
+ atomic_sub(tail_count, &page->_count);
+ BUG_ON(atomic_read(&page->_count) <= 0);
__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+ /*
+ * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
+ * so adjust those appropriately if this page is on the LRU.
+ */
+ if (PageLRU(page)) {
+ zonestat = NR_LRU_BASE + page_lru(page);
+ __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
+ }
+
ClearPageCompound(page);
compound_unlock(page);
spin_unlock_irq(&zone->lru_lock);
pte_unmap(pte);
}
- mm->nr_ptes++;
smp_wmb(); /* make pte visible before pmd */
/*
* Up to this point the pmd is present and huge and
return ret;
}
-/* must be called with anon_vma->root->lock hold */
+/* must be called with anon_vma->root->mutex hold */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
BUG_ON(!PageSwapBacked(page));
__split_huge_page(page, anon_vma);
+ count_vm_event(THP_SPLIT);
BUG_ON(PageCompound(page));
out_unlock:
return ret;
}
-int hugepage_madvise(unsigned long *vm_flags)
-{
- /*
- * Be somewhat over-protective like KSM for now!
- */
- if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE |
- VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_MIXEDMAP | VM_SAO))
- return -EINVAL;
+#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
+ VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
- *vm_flags |= VM_HUGEPAGE;
+int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ switch (advice) {
+ case MADV_HUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ *vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags |= VM_HUGEPAGE;
+ /*
+ * If the vma become good for khugepaged to scan,
+ * register it here without waiting a page fault that
+ * may not happen any time soon.
+ */
+ if (unlikely(khugepaged_enter_vma_merge(vma)))
+ return -ENOMEM;
+ break;
+ case MADV_NOHUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ *vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags |= VM_NOHUGEPAGE;
+ /*
+ * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+ * this vma even if we leave the mm registered in khugepaged if
+ * it got registered before VM_NOHUGEPAGE was set.
+ */
+ break;
+ }
return 0;
}
* page fault if needed.
*/
return 0;
- if (vma->vm_file || vma->vm_ops)
+ if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ /*
+ * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
list_del(&mm_slot->mm_node);
free = 1;
}
+ spin_unlock(&khugepaged_mm_lock);
if (free) {
- spin_unlock(&khugepaged_mm_lock);
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
free_mm_slot(mm_slot);
mmdrop(mm);
} else if (mm_slot) {
- spin_unlock(&khugepaged_mm_lock);
/*
* This is required to serialize against
* khugepaged_test_exit() (which is guaranteed to run
*/
down_write(&mm->mmap_sem);
up_write(&mm->mmap_sem);
- } else
- spin_unlock(&khugepaged_mm_lock);
+ }
}
static void release_pte_page(struct page *page)
VM_BUG_ON(PageLRU(page));
/* If there is no mapped pte young don't collapse the page */
- if (pte_young(pteval))
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
referenced = 1;
}
if (unlikely(!referenced))
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
- struct page **hpage)
+ struct page **hpage,
+ struct vm_area_struct *vma,
+ int node)
{
- struct vm_area_struct *vma;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd, _pmd;
unsigned long hstart, hend;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
+ up_read(&mm->mmap_sem);
VM_BUG_ON(!*hpage);
+ new_page = *hpage;
+#else
+ VM_BUG_ON(*hpage);
+ /*
+ * Allocate the page while the vma is still valid and under
+ * the mmap_sem read mode so there is no memory allocation
+ * later when we take the mmap_sem in write mode. This is more
+ * friendly behavior (OTOH it may actually hide bugs) to
+ * filesystems in userland with daemons allocating memory in
+ * the userland I/O paths. Allocating memory with the
+ * mmap_sem in read mode is good idea also to allow greater
+ * scalability.
+ */
+ new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+ node, __GFP_OTHER_NODE);
+
+ /*
+ * After allocating the hugepage, release the mmap_sem read lock in
+ * preparation for taking it in write mode.
+ */
+ up_read(&mm->mmap_sem);
+ if (unlikely(!new_page)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ *hpage = ERR_PTR(-ENOMEM);
+ return;
+ }
+#endif
+
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+#ifdef CONFIG_NUMA
+ put_page(new_page);
+#endif
+ return;
+ }
/*
* Prevent all access to pagetables with the exception of
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
goto out;
- if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE))
goto out;
- /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
- if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ if (!vma->anon_vma || vma->vm_ops)
goto out;
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ if (is_vma_temporary_stack(vma))
+ goto out;
+ /*
+ * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
goto out;
- new_page = *hpage;
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
- goto out;
-
anon_vma_lock(vma->anon_vma);
pte = pte_offset_map(pmd, address);
spin_lock(ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
spin_unlock(ptl);
- pte_unmap(pte);
if (unlikely(!isolated)) {
+ pte_unmap(pte);
spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
set_pmd_at(mm, address, pmd, _pmd);
spin_unlock(&mm->page_table_lock);
anon_vma_unlock(vma->anon_vma);
- mem_cgroup_uncharge_page(new_page);
goto out;
}
anon_vma_unlock(vma->anon_vma);
__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+ pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
VM_BUG_ON(page_count(pgtable) != 1);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
set_pmd_at(mm, address, pmd, _pmd);
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, _pmd);
prepare_pmd_huge_pte(pgtable, mm);
- mm->nr_ptes--;
spin_unlock(&mm->page_table_lock);
+#ifndef CONFIG_NUMA
*hpage = NULL;
+#endif
khugepaged_pages_collapsed++;
-out:
+out_up_write:
up_write(&mm->mmap_sem);
+ return;
+
+out:
+ mem_cgroup_uncharge_page(new_page);
+#ifdef CONFIG_NUMA
+ put_page(new_page);
+#endif
+ goto out_up_write;
}
static int khugepaged_scan_pmd(struct mm_struct *mm,
struct page *page;
unsigned long _address;
spinlock_t *ptl;
+ int node = -1;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
page = vm_normal_page(vma, _address, pteval);
if (unlikely(!page))
goto out_unmap;
+ /*
+ * Chose the node of the first page. This could
+ * be more sophisticated and look at more pages,
+ * but isn't for now.
+ */
+ if (node == -1)
+ node = page_to_nid(page);
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
/* cannot use mapcount: can't collapse if there's a gup pin */
if (page_count(page) != 1)
goto out_unmap;
- if (pte_young(pteval))
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
referenced = 1;
}
if (referenced)
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
- up_read(&mm->mmap_sem);
- collapse_huge_page(mm, address, hpage);
- }
+ if (ret)
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma, node);
out:
return ret;
}
{
struct mm_struct *mm = mm_slot->mm;
- VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
if (khugepaged_test_exit(mm)) {
/* free mm_slot */
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
+ __releases(&khugepaged_mm_lock)
+ __acquires(&khugepaged_mm_lock)
{
struct mm_slot *mm_slot;
struct mm_struct *mm;
int progress = 0;
VM_BUG_ON(!pages);
- VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
if (khugepaged_scan.mm_slot)
mm_slot = khugepaged_scan.mm_slot;
break;
}
- if (!(vma->vm_flags & VM_HUGEPAGE) &&
- !khugepaged_always()) {
+ if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+ !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE)) {
+ skip:
progress++;
continue;
}
-
- /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
- if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
- khugepaged_scan.address = vma->vm_end;
- progress++;
- continue;
- }
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ if (!vma->anon_vma || vma->vm_ops)
+ goto skip;
+ if (is_vma_temporary_stack(vma))
+ goto skip;
+ /*
+ * If is_pfn_mapping() is true is_learn_pfn_mapping()
+ * must be true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) ||
+ vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
- if (hstart >= hend) {
- progress++;
- continue;
- }
+ if (hstart >= hend)
+ goto skip;
+ if (khugepaged_scan.address > hend)
+ goto skip;
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
- if (khugepaged_scan.address > hend) {
- khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
- progress++;
- continue;
- }
- BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
while (khugepaged_scan.address < hend) {
int ret;
breakouterloop_mmap_sem:
spin_lock(&khugepaged_mm_lock);
- BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
/*
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
while (progress < pages) {
cond_resched();
+#ifndef CONFIG_NUMA
if (!*hpage) {
*hpage = alloc_hugepage(khugepaged_defrag());
- if (unlikely(!*hpage))
+ if (unlikely(!*hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
break;
+ }
+ count_vm_event(THP_COLLAPSE_ALLOC);
}
+#else
+ if (IS_ERR(*hpage))
+ break;
+#endif
+
+ if (unlikely(kthread_should_stop() || freezing(current)))
+ break;
spin_lock(&khugepaged_mm_lock);
if (!khugepaged_scan.mm_slot)
}
}
+static void khugepaged_alloc_sleep(void)
+{
+ wait_event_freezable_timeout(khugepaged_wait, false,
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
+
+#ifndef CONFIG_NUMA
static struct page *khugepaged_alloc_hugepage(void)
{
struct page *hpage;
do {
hpage = alloc_hugepage(khugepaged_defrag());
if (!hpage) {
- DEFINE_WAIT(wait);
- add_wait_queue(&khugepaged_wait, &wait);
- schedule_timeout_interruptible(
- msecs_to_jiffies(
- khugepaged_alloc_sleep_millisecs));
- remove_wait_queue(&khugepaged_wait, &wait);
- }
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
} while (unlikely(!hpage) &&
likely(khugepaged_enabled()));
return hpage;
}
+#endif
static void khugepaged_loop(void)
{
struct page *hpage;
+#ifdef CONFIG_NUMA
+ hpage = NULL;
+#endif
while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
hpage = khugepaged_alloc_hugepage();
if (unlikely(!hpage))
break;
+#else
+ if (IS_ERR(hpage)) {
+ khugepaged_alloc_sleep();
+ hpage = NULL;
+ }
+#endif
khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
if (hpage)
put_page(hpage);
+#endif
+ try_to_freeze();
+ if (unlikely(kthread_should_stop()))
+ break;
if (khugepaged_has_work()) {
- DEFINE_WAIT(wait);
if (!khugepaged_scan_sleep_millisecs)
continue;
- add_wait_queue(&khugepaged_wait, &wait);
- schedule_timeout_interruptible(
- msecs_to_jiffies(
- khugepaged_scan_sleep_millisecs));
- remove_wait_queue(&khugepaged_wait, &wait);
+ wait_event_freezable_timeout(khugepaged_wait, false,
+ msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
} else if (khugepaged_enabled())
- wait_event_interruptible(khugepaged_wait,
- khugepaged_wait_event());
+ wait_event_freezable(khugepaged_wait,
+ khugepaged_wait_event());
}
}
{
struct mm_slot *mm_slot;
+ set_freezable();
set_user_nice(current, 19);
/* serialize with start_khugepaged() */
for (;;) {
mutex_unlock(&khugepaged_mutex);
- BUG_ON(khugepaged_thread != current);
+ VM_BUG_ON(khugepaged_thread != current);
khugepaged_loop();
- BUG_ON(khugepaged_thread != current);
+ VM_BUG_ON(khugepaged_thread != current);
mutex_lock(&khugepaged_mutex);
if (!khugepaged_enabled())
break;
+ if (unlikely(kthread_should_stop()))
+ break;
}
spin_lock(&khugepaged_mm_lock);
put_page(page);
BUG_ON(pmd_trans_huge(*pmd));
}
+
+static void split_huge_page_address(struct mm_struct *mm,
+ unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return;
+ /*
+ * Caller holds the mmap_sem write mode, so a huge pmd cannot
+ * materialize from under us.
+ */
+ split_huge_page_pmd(mm, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+ /*
+ * If the new start address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (start & ~HPAGE_PMD_MASK &&
+ (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, start);
+
+ /*
+ * If the new end address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (end & ~HPAGE_PMD_MASK &&
+ (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, end);
+
+ /*
+ * If we're also updating the vma->vm_next->vm_start, if the new
+ * vm_next->vm_start isn't page aligned and it could previously
+ * contain an hugepage: check if we need to split an huge pmd.
+ */
+ if (adjust_next > 0) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long nstart = next->vm_start;
+ nstart += adjust_next << PAGE_SHIFT;
+ if (nstart & ~HPAGE_PMD_MASK &&
+ (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+ (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+ split_huge_page_address(next->vm_mm, nstart);
+ }
+}