#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
+#include <linux/ptrace.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
#endif /* CONFIG_NUMA */
/*
- * If this is a system OOM (not a memcg OOM) and the task selected to be
- * killed is not already running at high (RT) priorities, speed up the
- * recovery by boosting the dying task to the lowest FIFO priority.
- * That helps with the recovery and avoids interfering with RT tasks.
- */
-static void boost_dying_task_prio(struct task_struct *p,
- struct mem_cgroup *mem)
-{
- struct sched_param param = { .sched_priority = 1 };
-
- if (mem)
- return;
-
- if (!rt_task(p))
- sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m);
-}
-
-/*
* The process p may have detached its own ->mm while exiting or through
* use_mm(), but one or more of its subthreads may still have a valid
* pointer. Return p, or any of its subthreads with a valid ->mm, with
return 0;
/*
- * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
- * need to be executed for something that cannot be killed.
+ * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
+ * so the entire heuristic doesn't need to be executed for something
+ * that cannot be killed.
*/
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ if (atomic_read(&p->mm->oom_disable_count)) {
task_unlock(p);
return 0;
}
/*
* The baseline for the badness score is the proportion of RAM that each
- * task's rss and swap space use.
+ * task's rss, pagetable and swap space use.
*/
- points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
- totalpages;
+ points = get_mm_rss(p->mm) + p->mm->nr_ptes;
+ points += get_mm_counter(p->mm, MM_SWAPENTS);
+
+ points *= 1000;
+ points /= totalpages;
task_unlock(p);
/*
unsigned long totalpages, struct mem_cgroup *mem,
const nodemask_t *nodemask)
{
- struct task_struct *p;
+ struct task_struct *g, *p;
struct task_struct *chosen = NULL;
*ppoints = 0;
- for_each_process(p) {
+ do_each_thread(g, p) {
unsigned int points;
+ if (!p->mm)
+ continue;
if (oom_unkillable_task(p, mem, nodemask))
continue;
if (test_tsk_thread_flag(p, TIF_MEMDIE))
return ERR_PTR(-1UL);
- /*
- * This is in the process of releasing memory so wait for it
- * to finish before killing some other task by mistake.
- *
- * However, if p is the current task, we allow the 'kill' to
- * go ahead if it is exiting: this will simply set TIF_MEMDIE,
- * which will allow it to gain access to memory reserves in
- * the process of exiting and releasing its resources.
- * Otherwise we could get an easy OOM deadlock.
- */
- if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
- if (p != current)
- return ERR_PTR(-1UL);
-
- chosen = p;
- *ppoints = 1000;
+ if (p->flags & PF_EXITING) {
+ /*
+ * If p is the current task and is in the process of
+ * releasing memory, we allow the "kill" to set
+ * TIF_MEMDIE, which will allow it to gain access to
+ * memory reserves. Otherwise, it may stall forever.
+ *
+ * The loop isn't broken here, however, in case other
+ * threads are found to have already been oom killed.
+ */
+ if (p == current) {
+ chosen = p;
+ *ppoints = 1000;
+ } else {
+ /*
+ * If this task is not being ptraced on exit,
+ * then wait for it to finish before killing
+ * some other task unnecessarily.
+ */
+ if (!(task_ptrace(p->group_leader) &
+ PT_TRACE_EXIT))
+ return ERR_PTR(-1UL);
+ }
}
points = oom_badness(p, mem, nodemask, totalpages);
chosen = p;
*ppoints = points;
}
- }
+ } while_each_thread(g, p);
return chosen;
}
#define K(x) ((x) << (PAGE_SHIFT-10))
static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
{
+ struct task_struct *q;
+ struct mm_struct *mm;
+
p = find_lock_task_mm(p);
if (!p)
return 1;
+ /* mm cannot be safely dereferenced after task_unlock(p) */
+ mm = p->mm;
+
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(p), p->comm, K(p->mm->total_vm),
K(get_mm_counter(p->mm, MM_ANONPAGES)),
K(get_mm_counter(p->mm, MM_FILEPAGES)));
task_unlock(p);
+ /*
+ * Kill all processes sharing p->mm in other thread groups, if any.
+ * They don't get access to memory reserves or a higher scheduler
+ * priority, though, to avoid depletion of all memory or task
+ * starvation. This prevents mm->mmap_sem livelock when an oom killed
+ * task cannot exit because it requires the semaphore and its contended
+ * by another thread trying to allocate memory itself. That thread will
+ * now get access to memory reserves since it has a pending fatal
+ * signal.
+ */
+ for_each_process(q)
+ if (q->mm == mm && !same_thread_group(q, p)) {
+ task_lock(q); /* Protect ->comm from prctl() */
+ pr_err("Kill process %d (%s) sharing same memory\n",
+ task_pid_nr(q), q->comm);
+ task_unlock(q);
+ force_sig(SIGKILL, q);
+ }
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
- /*
- * We give our sacrificial lamb high priority and access to
- * all the memory it needs. That way it should be able to
- * exit() and clear out its resources quickly...
- */
- boost_dying_task_prio(p, mem);
-
return 0;
}
#undef K
*/
if (p->flags & PF_EXITING) {
set_tsk_thread_flag(p, TIF_MEMDIE);
- boost_dying_task_prio(p, mem);
return 0;
}
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
+ if (child->mm == p->mm)
+ continue;
/*
* oom_badness() returns 0 if the thread is unkillable
*/
*/
if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
- boost_dying_task_prio(current, NULL);
return;
}
read_lock(&tasklist_lock);
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
- (current->signal->oom_adj != OOM_DISABLE)) {
+ current->mm && !atomic_read(¤t->mm->oom_disable_count)) {
/*
* oom_kill_process() needs tasklist_lock held. If it returns
* non-zero, current could not be killed so we must fallback to