*/
static DEFINE_SPINLOCK(hugetlb_lock);
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+ bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+ spin_unlock(&spool->lock);
+
+ /* If no pages are used, and no other handles to the subpool
+ * remain, free the subpool the subpool remain */
+ if (free)
+ kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+ struct hugepage_subpool *spool;
+
+ spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ if (!spool)
+ return NULL;
+
+ spin_lock_init(&spool->lock);
+ spool->count = 1;
+ spool->max_hpages = nr_blocks;
+ spool->used_hpages = 0;
+
+ return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+ spin_lock(&spool->lock);
+ BUG_ON(!spool->count);
+ spool->count--;
+ unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ int ret = 0;
+
+ if (!spool)
+ return 0;
+
+ spin_lock(&spool->lock);
+ if ((spool->used_hpages + delta) <= spool->max_hpages) {
+ spool->used_hpages += delta;
+ } else {
+ ret = -ENOMEM;
+ }
+ spin_unlock(&spool->lock);
+
+ return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+ long delta)
+{
+ if (!spool)
+ return;
+
+ spin_lock(&spool->lock);
+ spool->used_hpages -= delta;
+ /* If hugetlbfs_put_super couldn't free spool due to
+ * an outstanding quota reference, free it now. */
+ unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+ return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+ return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
/*
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ unsigned int cpuset_mems_cookie;
- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol, &nodemask);
/*
}
}
}
-err:
+
mpol_cond_put(mpol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
+
+err:
+ mpol_cond_put(mpol);
+ return NULL;
}
static void update_and_free_page(struct hstate *h, struct page *page)
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
- struct address_space *mapping;
+ struct hugepage_subpool *spool =
+ (struct hugepage_subpool *)page_private(page);
- mapping = (struct address_space *) page_private(page);
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
- if (mapping)
- hugetlb_put_quota(mapping, 1);
+ hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
if (page && arch_prepare_hugepage(page)) {
__free_pages(page, huge_page_order(h));
- return NULL;
+ page = NULL;
}
spin_lock(&hugetlb_lock);
struct page *page, *tmp;
int ret, i;
int needed, allocated;
+ bool alloc_ok = true;
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
- if (!page)
- /*
- * We were not able to allocate enough pages to
- * satisfy the entire reservation so we free what
- * we've allocated so far.
- */
- goto free;
-
+ if (!page) {
+ alloc_ok = false;
+ break;
+ }
list_add(&page->lru, &surplus_list);
}
- allocated += needed;
+ allocated += i;
/*
* After retaking hugetlb_lock, we need to recalculate 'needed'
spin_lock(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
- if (needed > 0)
- goto retry;
-
+ if (needed > 0) {
+ if (alloc_ok)
+ goto retry;
+ /*
+ * We were not able to allocate enough pages to
+ * satisfy the entire reservation so we free what
+ * we've allocated so far.
+ */
+ goto free;
+ }
/*
* The surplus_list now contains _at_least_ the number of extra pages
* needed to accommodate the reservation. Add the appropriate number
h->resv_huge_pages += delta;
ret = 0;
- spin_unlock(&hugetlb_lock);
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
+free:
+ spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
-free:
if (!list_empty(&surplus_list)) {
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
list_del(&page->lru);
/*
* Determine if the huge page at addr within the vma has an associated
* reservation. Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed. Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur. Where any new reservation would be required the
+ * reservation change is prepared, but not committed. Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation. No action is required on
+ * failure.
*/
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
static struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
+ struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
long chg;
/*
- * Processes that did not create the mapping will have no reserves and
- * will not have accounted against quota. Check that the quota can be
- * made before satisfying the allocation
- * MAP_NORESERVE mappings may also need pages and quota allocated
- * if no reserve mapping overlaps.
+ * Processes that did not create the mapping will have no
+ * reserves and will not have accounted against subpool
+ * limit. Check that the subpool limit can be made before
+ * satisfying the allocation MAP_NORESERVE mappings may also
+ * need pages and subpool limit allocated allocated if no reserve
+ * mapping overlaps.
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
return ERR_PTR(-VM_FAULT_OOM);
if (chg)
- if (hugetlb_get_quota(inode->i_mapping, chg))
+ if (hugepage_subpool_get_pages(spool, chg))
return ERR_PTR(-VM_FAULT_SIGBUS);
spin_lock(&hugetlb_lock);
if (!page) {
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
- hugetlb_put_quota(inode->i_mapping, chg);
+ hugepage_subpool_put_pages(spool, chg);
return ERR_PTR(-VM_FAULT_SIGBUS);
}
}
- set_page_private(page, (unsigned long) mapping);
+ set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
- * with node sysdevs in node_devices[] using a parallel array. The array
- * index of a node sysdev or _hstate == node id.
- * This is here to avoid any static dependency of the node sysdev driver, in
+ * with node devices in node_devices[] using a parallel array. The array
+ * index of a node device or _hstate == node id.
+ * This is here to avoid any static dependency of the node device driver, in
* the base kernel, on the hugetlb module.
*/
struct node_hstate {
struct node_hstate node_hstates[MAX_NUMNODES];
/*
- * A subset of global hstate attributes for node sysdevs
+ * A subset of global hstate attributes for node devices
*/
static struct attribute *per_node_hstate_attrs[] = {
&nr_hugepages_attr.attr,
};
/*
- * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
+ * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
* Returns node id via non-NULL nidp.
*/
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
}
/*
- * Unregister hstate attributes from a single node sysdev.
+ * Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
}
/*
- * hugetlb module exit: unregister hstate attributes from node sysdevs
+ * hugetlb module exit: unregister hstate attributes from node devices
* that have them.
*/
static void hugetlb_unregister_all_nodes(void)
int nid;
/*
- * disable node sysdev registrations.
+ * disable node device registrations.
*/
register_hugetlbfs_with_node(NULL, NULL);
}
/*
- * Register hstate attributes for a single node sysdev.
+ * Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
void hugetlb_register_node(struct node *node)
{
struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
if (nhs->hugepages_kobj)
return; /* already allocated */
nhs->hugepages_kobj = kobject_create_and_add("hugepages",
- &node->sysdev.kobj);
+ &node->dev.kobj);
if (!nhs->hugepages_kobj)
return;
if (err) {
printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
" for node %d\n",
- h->name, node->sysdev.id);
+ h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
}
/*
* hugetlb init time: register hstate attributes for all registered node
- * sysdevs of nodes that have memory. All on-line nodes should have
- * registered their associated sysdev by this time.
+ * devices of nodes that have memory. All on-line nodes should have
+ * registered their associated device by this time.
*/
static void hugetlb_register_all_nodes(void)
{
for_each_node_state(nid, N_HIGH_MEMORY) {
struct node *node = &node_devices[nid];
- if (node->sysdev.id == nid)
+ if (node->dev.id == nid)
hugetlb_register_node(node);
}
/*
- * Let the node sysdev driver know we're here so it can
+ * Let the node device driver know we're here so it can
* [un]register hstate attributes on node hotplug.
*/
register_hugetlbfs_with_node(hugetlb_register_node,
{
struct hstate *h = hstate_vma(vma);
struct resv_map *reservations = vma_resv_map(vma);
+ struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve;
unsigned long start;
unsigned long end;
if (reserve) {
hugetlb_acct_memory(h, -reserve);
- hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+ hugepage_subpool_put_pages(spool, reserve);
}
}
}
if (huge_pmd_unshare(mm, &address, ptep))
continue;
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte))
+ continue;
+
+ /*
+ * HWPoisoned hugepage is already unmapped and dropped reference
+ */
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+ continue;
+
+ page = pte_page(pte);
/*
* If a reference page is supplied, it is because a specific
* page is being unmapped, not a range. Ensure the page we
* are about to unmap is the actual page of interest.
*/
if (ref_page) {
- pte = huge_ptep_get(ptep);
- if (huge_pte_none(pte))
- continue;
- page = pte_page(pte);
if (page != ref_page)
continue;
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
- if (huge_pte_none(pte))
- continue;
-
- /*
- * HWPoisoned hugepage is already unmapped and dropped reference
- */
- if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
- continue;
-
- page = pte_page(pte);
if (pte_dirty(pte))
set_page_dirty(page);
list_add(&page->lru, &page_list);
+
+ /* Bail out after unmapping reference page if supplied */
+ if (ref_page)
+ break;
}
- spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
page_remove_rmap(page);
* from page cache lookup which is in HPAGE_SIZE units.
*/
address = address & huge_page_mask(h);
- pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
- + (vma->vm_pgoff >> PAGE_SHIFT);
- mapping = (struct address_space *)page_private(page);
+ pgoff = vma_hugecache_offset(h, vma, address);
+ mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
/*
* Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * cannot race with other handlers or page migration.
+ * Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
if (outside_reserve) {
BUG_ON(huge_pte_none(pte));
if (unmap_ref_private(mm, vma, old_page, address)) {
- BUG_ON(page_count(old_page) != 1);
BUG_ON(huge_pte_none(pte));
spin_lock(&mm->page_table_lock);
- goto retry_avoidcopy;
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(pte_same(huge_ptep_get(ptep), pte)))
+ goto retry_avoidcopy;
+ /*
+ * race occurs while re-acquiring page_table_lock, and
+ * our job is done.
+ */
+ return 0;
}
WARN_ON_ONCE(1);
}
{
struct hstate *h = hstate_vma(vma);
int ret = VM_FAULT_SIGBUS;
+ int anon_rmap = 0;
pgoff_t idx;
unsigned long size;
struct page *page;
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
- page_dup_rmap(page);
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
- hugepage_add_new_anon_rmap(page, vma, address);
+ anon_rmap = 1;
}
} else {
/*
VM_FAULT_SET_HINDEX(h - hstates);
goto backout_unlocked;
}
- page_dup_rmap(page);
}
/*
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
+ if (anon_rmap)
+ hugepage_add_new_anon_rmap(page, vma, address);
+ else
+ page_dup_rmap(page);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ address &= huge_page_mask(h);
+
ptep = huge_pte_offset(mm, address);
if (ptep) {
entry = huge_ptep_get(ptep);
* so no worry about deadlock.
*/
page = pte_page(entry);
+ get_page(page);
if (page != pagecache_page)
lock_page(page);
}
if (page != pagecache_page)
unlock_page(page);
+ put_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
{
long ret, chg;
struct hstate *h = hstate_inode(inode);
+ struct hugepage_subpool *spool = subpool_inode(inode);
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
- * and filesystem quota without using reserves
+ * without using reserves
*/
if (vm_flags & VM_NORESERVE)
return 0;
if (chg < 0)
return chg;
- /* There must be enough filesystem quota for the mapping */
- if (hugetlb_get_quota(inode->i_mapping, chg))
+ /* There must be enough pages in the subpool for the mapping */
+ if (hugepage_subpool_get_pages(spool, chg))
return -ENOSPC;
/*
* Check enough hugepages are available for the reservation.
- * Hand back the quota if there are not
+ * Hand the pages back to the subpool if there are not
*/
ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
- hugetlb_put_quota(inode->i_mapping, chg);
+ hugepage_subpool_put_pages(spool, chg);
return ret;
}
{
struct hstate *h = hstate_inode(inode);
long chg = region_truncate(&inode->i_mapping->private_list, offset);
+ struct hugepage_subpool *spool = subpool_inode(inode);
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
- hugetlb_put_quota(inode->i_mapping, (chg - freed));
+ hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -(chg - freed));
}