#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
+#include <linux/kfifo.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
EXPORT_SYMBOL_GPL(hwpoison_filter);
/*
- * Send all the processes who have the page mapped an ``action optional''
- * signal.
+ * Send all the processes who have the page mapped a signal.
+ * ``action optional'' if they are not immediately affected by the error
+ * ``action required'' if error happened in current execution context
*/
-static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn, struct page *page)
+static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
+ unsigned long pfn, struct page *page, int flags)
{
struct siginfo si;
int ret;
printk(KERN_ERR
- "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
+ "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
- si.si_code = BUS_MCEERR_AO;
si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
- /*
- * Don't use force here, it's convenient if the signal
- * can be temporarily blocked.
- * This could cause a loop when the user sets SIGBUS
- * to SIG_IGN, but hopefully noone will do that?
- */
- ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+
+ if ((flags & MF_ACTION_REQUIRED) && t == current) {
+ si.si_code = BUS_MCEERR_AR;
+ ret = force_sig_info(SIGBUS, &si, t);
+ } else {
+ /*
+ * Don't use force here, it's convenient if the signal
+ * can be temporarily blocked.
+ * This could cause a loop when the user sets SIGBUS
+ * to SIG_IGN, but hopefully no one will do that?
+ */
+ si.si_code = BUS_MCEERR_AO;
+ ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
+ }
if (ret < 0)
printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
}
/*
- * Only all shrink_slab here (which would also
- * shrink other caches) if access is not potentially fatal.
+ * Only call shrink_slab here (which would also shrink other caches) if
+ * access is not potentially fatal.
*/
if (access) {
int nr;
do {
- nr = shrink_slab(1000, GFP_KERNEL, 1000);
+ struct shrink_control shrink = {
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ nr = shrink_slab(&shrink, 1000, 1000);
if (page_count(p) == 1)
break;
} while (nr > 10);
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
-static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, struct page *page, unsigned long pfn)
+static void kill_procs(struct list_head *to_kill, int doit, int trapno,
+ int fail, struct page *page, unsigned long pfn,
+ int flags)
{
struct to_kill *tk, *next;
* check for that, but we need to tell the
* process anyways.
*/
- else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn, page) < 0)
+ else if (kill_proc(tk->tsk, tk->addr, trapno,
+ pfn, page, flags) < 0)
printk(KERN_ERR
"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
struct task_struct *tsk;
struct anon_vma *av;
- read_lock(&tasklist_lock);
av = page_lock_anon_vma(page);
if (av == NULL) /* Not actually mapped anymore */
- goto out;
+ return;
+
+ read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
- page_unlock_anon_vma(av);
-out:
read_unlock(&tasklist_lock);
+ page_unlock_anon_vma(av);
}
/*
struct prio_tree_iter iter;
struct address_space *mapping = page->mapping;
- /*
- * A note on the locking order between the two locks.
- * We don't rely on this particular order.
- * If you have some other code that needs a different order
- * feel free to switch them around. Or add a reverse link
- * from mm_struct to task_struct, then this could be all
- * done without taking tasklist_lock and looping over all tasks.
- */
-
+ mutex_lock(&mapping->i_mmap_mutex);
read_lock(&tasklist_lock);
- spin_lock(&mapping->i_mmap_lock);
for_each_process(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
- spin_unlock(&mapping->i_mmap_lock);
read_unlock(&tasklist_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
* when the page is reread or dropped. If an
* application assumes it will always get error on
* fsync, but does other operations on the fd before
- * and the page is dropped inbetween then the error
+ * and the page is dropped between then the error
* will not be properly reported.
*
* This can already happen even without hwpoisoned
* The table matches them in order and calls the right handler.
*
* This is quite tricky because we can access page at any time
- * in its live cycle, so all accesses have to be extremly careful.
+ * in its live cycle, so all accesses have to be extremely careful.
*
* This is not complete. More states could be added.
* For any missing state don't attempt recovery.
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno)
+ int trapno, int flags)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
int ret;
int kill = 1;
struct page *hpage = compound_head(p);
+ struct page *ppage;
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
}
}
+ /*
+ * ppage: poisoned page
+ * if p is regular page(4k page)
+ * ppage == real poisoned page;
+ * else p is hugetlb or THP, ppage == head page.
+ */
+ ppage = hpage;
+
if (PageTransHuge(hpage)) {
/*
* Verify that this isn't a hugetlbfs head page, the check for
BUG_ON(!PageHWPoison(p));
return SWAP_FAIL;
}
+ /* THP is split, so ppage should be the real poisoned page. */
+ ppage = p;
}
}
* there's nothing that can be done.
*/
if (kill)
- collect_procs(hpage, &tokill);
+ collect_procs(ppage, &tokill);
+
+ if (hpage != ppage)
+ lock_page(ppage);
- ret = try_to_unmap(hpage, ttu);
+ ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pfn, page_mapcount(ppage));
+
+ if (hpage != ppage)
+ unlock_page(ppage);
/*
* Now that the dirty bit has been propagated to the
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
- ret != SWAP_SUCCESS, p, pfn);
+ kill_procs(&tokill, !!PageDirty(ppage), trapno,
+ ret != SWAP_SUCCESS, p, pfn, flags);
return ret;
}
ClearPageHWPoison(hpage + i);
}
-int __memory_failure(unsigned long pfn, int trapno, int flags)
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: fine tune action taken
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks hold.
+ */
+int memory_failure(unsigned long pfn, int trapno, int flags)
{
struct page_state *ps;
struct page *p;
* Check "just unpoisoned", "filter hit", and
* "race with other subpage."
*/
- lock_page_nosync(hpage);
+ lock_page(hpage);
if (!PageHWPoison(hpage)
|| (hwpoison_filter(p) && TestClearPageHWPoison(p))
|| (p != hpage && TestSetPageHWPoison(hpage))) {
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageLRU(p) && !PageHuge(p))
- shake_page(p, 0);
- if (!PageLRU(p) && !PageHuge(p)) {
- /*
- * shake_page could have turned it free.
- */
- if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy, 2nd try", DELAYED);
- return 0;
+ if (!PageHuge(p) && !PageTransTail(p)) {
+ if (!PageLRU(p))
+ shake_page(p, 0);
+ if (!PageLRU(p)) {
+ /*
+ * shake_page could have turned it free.
+ */
+ if (is_free_buddy_page(p)) {
+ action_result(pfn, "free buddy, 2nd try",
+ DELAYED);
+ return 0;
+ }
+ action_result(pfn, "non LRU", IGNORED);
+ put_page(p);
+ return -EBUSY;
}
- action_result(pfn, "non LRU", IGNORED);
- put_page(p);
- return -EBUSY;
}
/*
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
- lock_page_nosync(hpage);
+ lock_page(hpage);
/*
* unpoison always clear PG_hwpoison inside page lock
* For error on the tail page, we should set PG_hwpoison
* on the head page to show that the hugepage is hwpoisoned
*/
- if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
action_result(pfn, "hugepage already hardware poisoned",
IGNORED);
unlock_page(hpage);
/*
* Now take care of user space mappings.
- * Abort on fail: __remove_from_page_cache() assumes unmapped page.
+ * Abort on fail: __delete_from_page_cache() assumes unmapped page.
*/
- if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
+ if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
res = -EBUSY;
goto out;
unlock_page(hpage);
return res;
}
-EXPORT_SYMBOL_GPL(__memory_failure);
+EXPORT_SYMBOL_GPL(memory_failure);
+
+#define MEMORY_FAILURE_FIFO_ORDER 4
+#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
+
+struct memory_failure_entry {
+ unsigned long pfn;
+ int trapno;
+ int flags;
+};
+
+struct memory_failure_cpu {
+ DECLARE_KFIFO(fifo, struct memory_failure_entry,
+ MEMORY_FAILURE_FIFO_SIZE);
+ spinlock_t lock;
+ struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
/**
- * memory_failure - Handle memory failure of a page.
+ * memory_failure_queue - Schedule handling memory failure of a page.
* @pfn: Page Number of the corrupted page
* @trapno: Trap number reported in the signal to user space.
+ * @flags: Flags for memory failure handling
*
- * This function is called by the low level machine check code
- * of an architecture when it detects hardware memory corruption
- * of a page. It tries its best to recover, which includes
- * dropping pages, killing processes etc.
+ * This function is called by the low level hardware error handler
+ * when it detects hardware memory corruption of a page. It schedules
+ * the recovering of error page, including dropping pages, killing
+ * processes etc.
*
* The function is primarily of use for corruptions that
* happen outside the current execution context (e.g. when
* detected by a background scrubber)
*
- * Must run in process context (e.g. a work queue) with interrupts
- * enabled and no spinlocks hold.
+ * Can run in IRQ context.
*/
-void memory_failure(unsigned long pfn, int trapno)
+void memory_failure_queue(unsigned long pfn, int trapno, int flags)
+{
+ struct memory_failure_cpu *mf_cpu;
+ unsigned long proc_flags;
+ struct memory_failure_entry entry = {
+ .pfn = pfn,
+ .trapno = trapno,
+ .flags = flags,
+ };
+
+ mf_cpu = &get_cpu_var(memory_failure_cpu);
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ if (kfifo_put(&mf_cpu->fifo, &entry))
+ schedule_work_on(smp_processor_id(), &mf_cpu->work);
+ else
+ pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
+ pfn);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ put_cpu_var(memory_failure_cpu);
+}
+EXPORT_SYMBOL_GPL(memory_failure_queue);
+
+static void memory_failure_work_func(struct work_struct *work)
{
- __memory_failure(pfn, trapno, 0);
+ struct memory_failure_cpu *mf_cpu;
+ struct memory_failure_entry entry = { 0, };
+ unsigned long proc_flags;
+ int gotten;
+
+ mf_cpu = &__get_cpu_var(memory_failure_cpu);
+ for (;;) {
+ spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+ gotten = kfifo_get(&mf_cpu->fifo, &entry);
+ spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+ if (!gotten)
+ break;
+ memory_failure(entry.pfn, entry.trapno, entry.flags);
+ }
}
+static int __init memory_failure_init(void)
+{
+ struct memory_failure_cpu *mf_cpu;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ spin_lock_init(&mf_cpu->lock);
+ INIT_KFIFO(mf_cpu->fifo);
+ INIT_WORK(&mf_cpu->work, memory_failure_work_func);
+ }
+
+ return 0;
+}
+core_initcall(memory_failure_init);
+
/**
* unpoison_memory - Unpoison a previously poisoned page
* @pfn: Page number of the to be unpoisoned page
* to the end.
*/
if (PageHuge(page)) {
- pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
return 0;
}
if (TestClearPageHWPoison(p))
return 0;
}
- lock_page_nosync(page);
+ lock_page(page);
/*
* This test is racy because PG_hwpoison is set outside of page lock.
* That's acceptable because that won't trigger kernel panic. Instead,
if (PageHWPoison(hpage)) {
put_page(hpage);
- pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+ pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
return -EBUSY;
}
list_for_each_entry_safe(page1, page2, &pagelist, lru)
put_page(page1);
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
- pfn, ret, page->flags);
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
return ret;
}
if (!PageLRU(page)) {
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
- pfn, page->flags);
+ pfn, page->flags);
return -EIO;
}
*/
ret = invalidate_inode_page(page);
unlock_page(page);
-
/*
- * Drop count because page migration doesn't like raised
- * counts. The page could get re-allocated, but if it becomes
- * LRU the isolation will just fail.
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- put_page(page);
if (ret == 1) {
+ put_page(page);
ret = 0;
pr_info("soft_offline: %#lx: invalidated\n", pfn);
goto done;
* handles a large number of cases for us.
*/
ret = isolate_lru_page(page);
+ /*
+ * Drop page reference which is came from get_any_page()
+ * successful isolate_lru_page() already took another one.
+ */
+ put_page(page);
if (!ret) {
LIST_HEAD(pagelist);
-
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- 0, true);
+ 0, MIGRATE_SYNC);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
- pfn, ret, page_count(page), page->flags);
+ pfn, ret, page_count(page), page->flags);
}
if (ret)
return ret;
/* keep elevated page count for bad page */
return ret;
}
-
-/*
- * The caller must hold current->mm->mmap_sem in read mode.
- */
-int is_hwpoison_address(unsigned long addr)
-{
- pgd_t *pgdp;
- pud_t pud, *pudp;
- pmd_t pmd, *pmdp;
- pte_t pte, *ptep;
- swp_entry_t entry;
-
- pgdp = pgd_offset(current->mm, addr);
- if (!pgd_present(*pgdp))
- return 0;
- pudp = pud_offset(pgdp, addr);
- pud = *pudp;
- if (!pud_present(pud) || pud_large(pud))
- return 0;
- pmdp = pmd_offset(pudp, addr);
- pmd = *pmdp;
- if (!pmd_present(pmd) || pmd_large(pmd))
- return 0;
- ptep = pte_offset_map(pmdp, addr);
- pte = *ptep;
- pte_unmap(ptep);
- if (!is_swap_pte(pte))
- return 0;
- entry = pte_to_swp_entry(pte);
- return is_hwpoison_entry(entry);
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_address);