#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
- free = global_page_state(NR_FILE_PAGES);
free += nr_swap_pages;
/*
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
- if (n <= totalreserve_pages)
+ if (free <= totalreserve_pages)
goto error;
else
- n -= totalreserve_pages;
+ free -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
- n -= n / 32;
- free += n;
+ free -= free / 32;
if (free > pages)
return 0;
}
/*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_mutex
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
if (file) {
struct address_space *mapping = file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
__remove_shared_vm_struct(vma, file, mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
}
return next;
}
+static unsigned long do_brk(unsigned long addr, unsigned long len);
+
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long rlim, retval;
return vma;
}
-static inline void
-__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent)
-{
- struct vm_area_struct *next;
-
- vma->vm_prev = prev;
- if (prev) {
- next = prev->vm_next;
- prev->vm_next = vma;
- } else {
- mm->mmap = vma;
- if (rb_parent)
- next = rb_entry(rb_parent,
- struct vm_area_struct, vm_rb);
- else
- next = NULL;
- }
- vma->vm_next = next;
- if (next)
- next->vm_prev = vma;
-}
-
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
if (vma->vm_file)
mapping = vma->vm_file->f_mapping;
- if (mapping) {
- spin_lock(&mapping->i_mmap_lock);
- vma->vm_truncate_count = mapping->truncate_count;
- }
+ if (mapping)
+ mutex_lock(&mapping->i_mmap_mutex);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
mm->map_count++;
validate_mm(mm);
}
/*
- * Helper for vma_adjust in the split_vma insert case:
- * insert vm structure into list and rbtree and anon_vma,
- * but it has already been inserted into prio_tree earlier.
+ * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
+ * mm's list and rbtree. It has already been inserted into the prio_tree.
*/
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
mapping = file->f_mapping;
if (!(vma->vm_flags & VM_NONLINEAR))
root = &mapping->i_mmap;
- spin_lock(&mapping->i_mmap_lock);
- if (importer &&
- vma->vm_truncate_count != next->vm_truncate_count) {
- /*
- * unmap_mapping_range might be in progress:
- * ensure that the expanding vma is rescanned.
- */
- importer->vm_truncate_count = 0;
- }
+ mutex_lock(&mapping->i_mmap_mutex);
if (insert) {
- insert->vm_truncate_count = vma->vm_truncate_count;
/*
* Put into prio_tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
if (anon_vma)
anon_vma_unlock(anon_vma);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
if (remove_next) {
if (file) {
if (anon_vma)
return anon_vma;
try_prev:
- /*
- * It is potentially slow to have to call find_vma_prev here.
- * But it's only on the first write fault on the vma, not
- * every time, and we could devise a way to avoid it later
- * (e.g. stash info in next's anon_vma_node when assigning
- * an anon_vma, or when trying vma_merge). Another time.
- */
- BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+ near = vma->vm_prev;
if (!near)
goto none;
#endif /* CONFIG_PROC_FS */
/*
+ * If a hint addr is less than mmap_min_addr change hint to be as
+ * low as possible but still greater than mmap_min_addr
+ */
+static inline unsigned long round_hint_to_min(unsigned long hint)
+{
+ hint &= PAGE_MASK;
+ if (((void *)hint != NULL) &&
+ (hint < mmap_min_addr))
+ return PAGE_ALIGN(mmap_min_addr);
+ return hint;
+}
+
+/*
* The caller must hold down_write(¤t->mm->mmap_sem).
*/
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
struct inode *inode;
- unsigned int vm_flags;
+ vm_flags_t vm_flags;
int error;
unsigned long reqprot = prot;
return mmap_region(file, addr, len, flags, vm_flags, pgoff);
}
-EXPORT_SYMBOL(do_mmap_pgoff);
+
+unsigned long do_mmap(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long offset)
+{
+ if (unlikely(offset + PAGE_ALIGN(len) < offset))
+ return -EINVAL;
+ if (unlikely(offset & ~PAGE_MASK))
+ return -EINVAL;
+ return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(do_mmap);
+
+unsigned long vm_mmap(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long offset)
+{
+ unsigned long ret;
+ struct mm_struct *mm = current->mm;
+
+ down_write(&mm->mmap_sem);
+ ret = do_mmap(file, addr, len, prot, flag, offset);
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(vm_mmap);
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
- len = ALIGN(len, huge_page_size(&default_hstate));
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE);
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+ VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE);
if (IS_ERR(file))
return PTR_ERR(file);
}
*/
int vma_wants_writenotify(struct vm_area_struct *vma)
{
- unsigned int vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
/* If it was private or non-writable, the write bit is already clear */
if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
* We account for memory if it's a private writeable mapping,
* not hugepages and VM_NORESERVE wasn't set.
*/
-static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
/*
* hugetlb has its own accounting separate from the core VM
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags,
- unsigned int vm_flags, unsigned long pgoff)
+ vm_flags_t vm_flags, unsigned long pgoff)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
+ if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
vma->vm_pgoff = pgoff;
INIT_LIST_HEAD(&vma->anon_vma_chain);
+ error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
+
if (file) {
- error = -EINVAL;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
goto free_vma;
if (vm_flags & VM_DENYWRITE) {
pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
+ if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
+ goto free_vma;
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
/*
* Is this a new hole at the lowest possible address?
*/
- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+ if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
mm->free_area_cache = addr;
- mm->cached_hole_size = ~0UL;
- }
}
/*
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
+ unsigned long addr = addr0, start_addr;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
mm->free_area_cache = mm->mmap_base;
}
+try_again:
/* either no address requested or can't fit in requested address hole */
- addr = mm->free_area_cache;
+ start_addr = addr = mm->free_area_cache;
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- vma = find_vma(mm, addr-len);
- if (!vma || addr <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr-len);
- }
-
- if (mm->mmap_base < len)
- goto bottomup;
-
- addr = mm->mmap_base-len;
+ if (addr < len)
+ goto fail;
+ addr -= len;
do {
/*
* Lookup failure means no vma is above this address,
addr = vma->vm_start-len;
} while (len < vma->vm_start);
-bottomup:
+fail:
+ /*
+ * if hint left us with no space for the requested
+ * mapping then try again:
+ *
+ * Note: this is different with the case of bottomup
+ * which does the fully line-search, but we use find_vma
+ * here that causes some holes skipped.
+ */
+ if (start_addr != mm->mmap_base) {
+ mm->free_area_cache = mm->mmap_base;
+ mm->cached_hole_size = 0;
+ goto try_again;
+ }
+
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
EXPORT_SYMBOL(find_vma);
-/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
+/*
+ * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+ */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
- struct vm_area_struct *vma = NULL, *prev = NULL;
- struct rb_node *rb_node;
- if (!mm)
- goto out;
-
- /* Guard against addr being lower than the first VMA */
- vma = mm->mmap;
-
- /* Go through the RB tree quickly. */
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
- vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+ struct vm_area_struct *vma;
- if (addr < vma_tmp->vm_end) {
- rb_node = rb_node->rb_left;
- } else {
- prev = vma_tmp;
- if (!prev->vm_next || (addr < prev->vm_next->vm_end))
- break;
+ vma = find_vma(mm, addr);
+ if (vma) {
+ *pprev = vma->vm_prev;
+ } else {
+ struct rb_node *rb_node = mm->mm_rb.rb_node;
+ *pprev = NULL;
+ while (rb_node) {
+ *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
rb_node = rb_node->rb_right;
}
}
-
-out:
- *pprev = prev;
- return prev ? prev->vm_next : vma;
+ return vma;
}
/*
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-static int expand_downwards(struct vm_area_struct *vma,
+int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
int error;
return error;
}
-int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
-{
- return expand_downwards(vma, address);
-}
-
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
unsigned long nr_accounted = 0;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
- next? next->vm_start: 0);
- tlb_finish_mmu(tlb, start, end);
+ free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : 0);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
return -EINVAL;
/* Find the first overlapping VMA */
- vma = find_vma_prev(mm, start, &prev);
+ vma = find_vma(mm, start);
if (!vma)
return 0;
+ prev = vma->vm_prev;
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
return 0;
}
-
EXPORT_SYMBOL(do_munmap);
-SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+int vm_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
int ret;
- struct mm_struct *mm = current->mm;
-
- profile_munmap(addr);
down_write(&mm->mmap_sem);
- ret = do_munmap(mm, addr, len);
+ ret = do_munmap(mm, start, len);
up_write(&mm->mmap_sem);
return ret;
}
+EXPORT_SYMBOL(vm_munmap);
+
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+ profile_munmap(addr);
+ return vm_munmap(current->mm, addr, len);
+}
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-unsigned long do_brk(unsigned long addr, unsigned long len)
+static unsigned long do_brk(unsigned long addr, unsigned long len)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma, * prev;
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
- if (security_vm_enough_memory(len >> PAGE_SHIFT))
+ if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
/* Can we just expand an old private anonymous mapping? */
return addr;
}
-EXPORT_SYMBOL(do_brk);
+unsigned long vm_brk(unsigned long addr, unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long ret;
+
+ down_write(&mm->mmap_sem);
+ ret = do_brk(addr, len);
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(vm_brk);
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
- unsigned long end;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
lru_add_drain();
flush_cache_mm(mm);
- tlb = tlb_gather_mmu(mm, 1);
+ tlb_gather_mmu(&tlb, mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
+ unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
- tlb_finish_mmu(tlb, 0, end);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ tlb_finish_mmu(&tlb, 0, -1);
/*
* Walk the list again, actually closing and freeing it,
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_mutex is taken here.
*/
int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
struct vm_area_struct *new_vma, *prev;
struct rb_node **rb_link, *rb_parent;
struct mempolicy *pol;
+ bool faulted_in_anon_vma = true;
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
- if (!vma->vm_file && !vma->anon_vma)
+ if (unlikely(!vma->vm_file && !vma->anon_vma)) {
pgoff = addr >> PAGE_SHIFT;
+ faulted_in_anon_vma = false;
+ }
find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
/*
* Source vma may have been merged into new_vma
*/
- if (vma_start >= new_vma->vm_start &&
- vma_start < new_vma->vm_end)
+ if (unlikely(vma_start >= new_vma->vm_start &&
+ vma_start < new_vma->vm_end)) {
+ /*
+ * The only way we can get a vma_merge with
+ * self during an mremap is if the vma hasn't
+ * been faulted in yet and we were allowed to
+ * reset the dst vma->vm_pgoff to the
+ * destination address of the mremap to allow
+ * the merge to happen. mremap must change the
+ * vm_pgoff linearity between src and dst vmas
+ * (in turn preventing a vma_merge) to be
+ * safe. It is only safe to keep the vm_pgoff
+ * linear if there are no pages mapped yet.
+ */
+ VM_BUG_ON(faulted_in_anon_vma);
*vmap = new_vma;
+ } else
+ anon_vma_moveto_tail(new_vma);
} else {
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
+ mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
- * anon_vma->root->lock. If some other vma in this mm shares
+ * anon_vma->root->mutex. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->root->lock.
+ * anon_vma->root->mutex.
*/
if (__test_and_set_bit(0, (unsigned long *)
&anon_vma->root->head.next))
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+ mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
}
}
* vma in this mm is backed by the same anon_vma or address_space.
*
* We can take all the locks in random order because the VM code
- * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
* takes more than one of them in a row. Secondly we're protected
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
*
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- int ret = -EINTR;
BUG_ON(down_read_trylock(&mm->mmap_sem));
vm_lock_anon_vma(mm, avc->anon_vma);
}
- ret = 0;
+ return 0;
out_unlock:
- if (ret)
- mm_drop_all_locks(mm);
-
- return ret;
+ mm_drop_all_locks(mm);
+ return -EINTR;
}
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->root->lock.
+ * anon_vma->root->mutex.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->head.next))
* AS_MM_ALL_LOCKS can't change to 0 from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
&mapping->flags))
BUG();