#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
+#include <linux/ptrace.h>
#include <asm/futex.h>
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED 0x01
+#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
+
+/*
* Priority Inheritance state:
*/
struct futex_pi_state {
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list: priority-sorted list of tasks waiting on this futex
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
* @key: the key the futex is hashed on
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
- * The order of wakup is always to make the first condition true, then
+ * The order of wakeup is always to make the first condition true, then
* the second.
*
* PI futexes are typically woken before they are removed from the hash list via
u32 bitset;
};
+static const struct futex_q futex_q_init = {
+ /* list gets initialized in queue_me()*/
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY
+};
+
/*
* Hash buckets are shared by all the futex_keys that hash to the same
* location. Each key may have multiple futex_q structures, one for each task
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- atomic_inc(&key->shared.inode->i_count);
+ ihold(key->shared.inode);
break;
case FUT_OFF_MMSHARED:
atomic_inc(&key->private.mm->mm_count);
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: VERIFY_READ,
+ * VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page;
- int err;
+ struct page *page, *page_head;
+ int err, ro = 0;
/*
* The futex address must be "naturally" aligned.
again:
err = get_user_pages_fast(address, 1, 1, &page);
+ /*
+ * If write access is not required (eg. FUTEX_WAIT), try
+ * and get read-only access.
+ */
+ if (err == -EFAULT && rw == VERIFY_READ) {
+ err = get_user_pages_fast(address, 1, 0, &page);
+ ro = 1;
+ }
if (err < 0)
return err;
+ else
+ err = 0;
- page = compound_head(page);
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ page_head = page;
+ if (unlikely(PageTail(page))) {
+ put_page(page);
+ /* serialize against __split_huge_page_splitting() */
+ local_irq_disable();
+ if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+ page_head = compound_head(page);
+ /*
+ * page_head is valid pointer but we must pin
+ * it before taking the PG_lock and/or
+ * PG_compound_lock. The moment we re-enable
+ * irqs __split_huge_page_splitting() can
+ * return and the head page can be freed from
+ * under us. We can't take the PG_lock and/or
+ * PG_compound_lock on a page that could be
+ * freed from under us.
+ */
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+ local_irq_enable();
+ } else {
+ local_irq_enable();
+ goto again;
+ }
+ }
+#else
+ page_head = compound_head(page);
+ if (page != page_head) {
+ get_page(page_head);
put_page(page);
- goto again;
+ }
+#endif
+
+ lock_page(page_head);
+
+ /*
+ * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * page; but it might be the ZERO_PAGE or in the gate area or
+ * in a special mapping (all cases which we are happy to fail);
+ * or it may have been a good file page when get_user_pages_fast
+ * found it, but truncated or holepunched or subjected to
+ * invalidate_complete_page2 before we got the page lock (also
+ * cases which we are happy to fail). And we hold a reference,
+ * so refcount care in invalidate_complete_page's remove_mapping
+ * prevents drop_caches from setting mapping to NULL beneath us.
+ *
+ * The case we do have to guard against is when memory pressure made
+ * shmem_writepage move it from filecache to swapcache beneath us:
+ * an unlikely race, but we do need to retry for page_head->mapping.
+ */
+ if (!page_head->mapping) {
+ int shmem_swizzled = PageSwapCache(page_head);
+ unlock_page(page_head);
+ put_page(page_head);
+ if (shmem_swizzled)
+ goto again;
+ return -EFAULT;
}
/*
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (PageAnon(page_head)) {
+ /*
+ * A RO anonymous page will never change and thus doesn't make
+ * sense for futex operations.
+ */
+ if (ro) {
+ err = -EFAULT;
+ goto out;
+ }
+
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page->mapping->host;
- key->shared.pgoff = page->index;
+ key->shared.inode = page_head->mapping->host;
+ key->shared.pgoff = page_head->index;
}
get_futex_key_refs(key);
- unlock_page(page);
- put_page(page);
- return 0;
+out:
+ unlock_page(page_head);
+ put_page(page_head);
+ return err;
}
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
{
drop_futex_key_refs(key);
}
* Slow path to fixup the fault we just took in the atomic write
* access to @uaddr.
*
- * We have no generic implementation of a non destructive write to the
+ * We have no generic implementation of a non-destructive write to the
* user address. We know that we faulted in the atomic pagefault
* disabled section so we can as well avoid the #PF overhead by
* calling get_user_pages() right away.
int ret;
down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, (unsigned long)uaddr,
- 1, 1, 0, NULL, NULL);
+ ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+ FAULT_FLAG_WRITE);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
return NULL;
}
-static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+ u32 uval, u32 newval)
{
- u32 curval;
+ int ret;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
pagefault_enable();
- return curval;
+ return ret;
}
static int get_futex_value_locked(u32 *dest, u32 __user *from)
*/
pi_state = this->pi_state;
/*
- * Userspace might have messed up non PI and PI futexes
+ * Userspace might have messed up non-PI and PI futexes
*/
if (unlikely(!pi_state))
return -EINVAL;
struct task_struct *task, int set_waiters)
{
int lock_taken, ret, ownerdied = 0;
- u32 uval, newval, curval;
+ u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
ret = lock_taken = 0;
* (by doing a 0 -> TID atomic cmpxchg), while holding all
* the locks. It will most likely not succeed.
*/
- newval = task_pid_vnr(task);
+ newval = vpid;
if (set_waiters)
newval |= FUTEX_WAITERS;
- curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
return -EFAULT;
/*
* Detect deadlocks.
*/
- if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+ if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
/*
*/
if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
/* Keep the OWNER_DIED bit */
- newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+ newval = (curval & ~FUTEX_TID_MASK) | vpid;
ownerdied = 0;
lock_taken = 1;
}
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
if (unlikely(curval != uval))
goto retry;
return ret;
}
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+ || WARN_ON(plist_node_empty(&q->list)))
+ return;
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
- * a non futex wake up happens on another CPU then the task
- * might exit and p would dereference a non existing task
+ * a non-futex wake up happens on another CPU then the task
+ * might exit and p would dereference a non-existing task
* struct. Prevent this by holding a reference on p across the
* wake up.
*/
get_task_struct(p);
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
* q->lock_ptr = NULL is written, without taking any locks. A
{
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
- u32 curval, newval;
+ u32 uninitialized_var(curval), newval;
if (!pi_state)
return -EINVAL;
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
- * This happens when we have stolen the lock and the original
- * pending owner did not enqueue itself back on the rt_mutex.
- * Thats not a tragedy. We know that way, that a lock waiter
- * is on the fly. We make the futex_q waiter the pending owner.
+ * It is possible that the next waiter (the one that brought
+ * this owner to the kernel) timed out and is no longer
+ * waiting on the lock.
*/
if (!new_owner)
new_owner = this->task;
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
ret = -EFAULT;
else if (curval != uval)
ret = -EINVAL;
static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
{
- u32 oldval;
+ u32 uninitialized_var(oldval);
/*
* There is no waiter, so we unlock the futex. The owner died
* bit has not to be preserved here. We are the owner:
*/
- oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
-
- if (oldval == -EFAULT)
- return oldval;
+ if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+ return -EFAULT;
if (oldval != uval)
return -EAGAIN;
/*
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
}
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
out:
return ret;
}
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out_put_key1;
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
double_unlock_hb(hb1, hb2);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
return ret;
}
plist_del(&q->list, &hb1->chain);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb2->lock;
-#endif
}
get_futex_key_refs(key2);
q->key = *key2;
get_futex_key_refs(key);
q->key = *key;
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
q->lock_ptr = &hb->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
wake_up_state(q->task, TASK_NORMAL);
}
/**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
- * uaddr1: source futex user address
- * uaddr2: target futex user address
- * nr_wake: number of waiters to wake (must be 1 for requeue_pi)
- * nr_requeue: number of waiters to requeue (0-INT_MAX)
- * requeue_pi: if we are attempting to requeue from a non-pi futex to a
- * pi futex (pi to pi requeue is not supported)
+ * @uaddr1: source futex user address
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @uaddr2: target futex user address
+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @cmpval: @uaddr1 expected value (or %NULL)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * pi futex (pi to pi requeue is not supported)
*
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
* >=0 - on success, the number of tasks requeued or woken
* <0 - on error
*/
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
- int nr_wake, int nr_requeue, u32 *cmpval,
- int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int drop_count = 0, task_count = 0, ret;
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ requeue_pi ? VERIFY_WRITE : VERIFY_READ);
if (unlikely(ret != 0))
goto out_put_key1;
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
if (curval != *cmpval) {
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry;
case -EAGAIN:
/* The owner was exiting, try again. */
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
cond_resched();
goto retry;
default:
drop_futex_key_refs(&key1);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
if (pi_state != NULL)
free_pi_state(pi_state);
/* The key must be already stored in q->key. */
static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+ __acquires(&hb->lock)
{
struct futex_hash_bucket *hb;
- get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;
static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
{
spin_unlock(&hb->lock);
- drop_futex_key_refs(&q->key);
}
/**
* an example).
*/
static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
{
int prio;
prio = min(current->normal_prio, MAX_RT_PRIO);
plist_node_init(&q->list, prio);
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(q->pi_state);
* and dropped here.
*/
static void unqueue_me_pi(struct futex_q *q)
+ __releases(q->lock_ptr)
{
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
-
- drop_futex_key_refs(&q->key);
}
/*
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner, int fshared)
+ struct task_struct *newowner)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
struct task_struct *oldowner = pi_state->owner;
- u32 uval, curval, newval;
+ u32 uval, uninitialized_var(curval), newval;
int ret;
/* Owner died? */
/*
* We are here either because we stole the rtmutex from the
- * pending owner or we are the pending owner which failed to
- * get the rtmutex. We have to replace the pending owner TID
- * in the user space variable. This must be atomic as we have
- * to preserve the owner died bit here.
+ * previous highest priority waiter or we are the highest priority
+ * waiter but failed to get the rtmutex the first time.
+ * We have to replace the newowner TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
* because we can fault here. Imagine swapped out pages or a fork
while (1) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
goto handle_fault;
if (curval == uval)
break;
/*
* To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the pending
- * owner itself or the task which stole the rtmutex) the
+ * lock here. That gives the other task (either the highest priority
+ * waiter itself or the task which stole the rtmutex) the
* chance to try the fixup of the pi_state. So once we are
* back from handling the fault we need to check the pi_state
* after reacquiring the hash bucket lock and before trying to
goto retry;
}
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED 0x01
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
-
static long futex_wait_restart(struct restart_block *restart);
/**
* fixup_owner() - Post lock pi_state and corner case management
* @uaddr: user address of the futex
- * @fshared: whether the futex is shared (1) or not (0)
* @q: futex_q (contains pi_state and access to the rt_mutex)
* @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
*
* 0 - success, lock not taken
* <0 - on error (-EFAULT)
*/
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
- int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
struct task_struct *owner;
int ret = 0;
* did a lock-steal - fix up the PI-state in that case:
*/
if (q->pi_state->owner != current)
- ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+ ret = fixup_pi_state_owner(uaddr, q, current);
goto out;
}
/*
* pi_state is incorrect, some other task did a lock steal and
* we returned due to timeout or signal without taking the
- * rt_mutex. Too late. We can access the rt_mutex_owner without
- * locking, as the other task is now blocked on the hash bucket
- * lock. Fix the state up.
+ * rt_mutex. Too late.
*/
+ raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+ if (!owner)
+ owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+ raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+ ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
/*
* Paranoia check. If we did not take the lock, then we should not be
- * the owner, nor the pending owner, of the rt_mutex.
+ * the owner of the rt_mutex.
*/
if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
* futex_wait_setup() - Prepare to wait on a futex
* @uaddr: the futex userspace address
* @val: the expected value
- * @fshared: whether the futex is shared (1) or not (0)
+ * @flags: futex flags (FLAGS_SHARED, etc.)
* @q: the associated futex_q
* @hb: storage for hash_bucket pointer to be returned to caller
*
*
* Returns:
* 0 - uaddr contains val and hb has been locked
- * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb)
{
u32 uval;
*
* The basic logical guarantee of a futex is that it blocks ONLY
* if cond(var) is known to be true at the time of blocking, for
- * any cond. If we queued after testing *uaddr, that would open
- * a race condition where we could block indefinitely with
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
* cond(var) false, which would violate the guarantee.
*
- * A consequence is that futex_wait() can return zero and absorb
- * a wakeup when *uaddr != val on entry to the syscall. This is
- * rare, but normal.
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
*/
retry:
- q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
if (unlikely(ret != 0))
return ret;
if (ret)
goto out;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
goto retry;
}
out:
if (ret)
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
return ret;
}
-static int futex_wait(u32 __user *uaddr, int fshared,
- u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct restart_block *restart;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int ret;
if (!bitset)
return -EINVAL;
-
- q.pi_state = NULL;
q.bitset = bitset;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
}
retry:
- /* Prepare to wait on uaddr. */
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ /*
+ * Prepare to wait on uaddr. On success, holds hb lock and increments
+ * q.key refs.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out;
/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
+ /* unqueue_me() drops q.key ref */
if (!unqueue_me(&q))
- goto out_put_key;
+ goto out;
ret = -ETIMEDOUT;
if (to && !to->task)
- goto out_put_key;
+ goto out;
/*
* We expect signal_pending(current), but we might be the
* victim of a spurious wakeup as well.
*/
- if (!signal_pending(current)) {
- put_futex_key(fshared, &q.key);
+ if (!signal_pending(current))
goto retry;
- }
ret = -ERESTARTSYS;
if (!abs_time)
- goto out_put_key;
+ goto out;
restart = ¤t_thread_info()->restart_block;
restart->fn = futex_wait_restart;
- restart->futex.uaddr = (u32 *)uaddr;
+ restart->futex.uaddr = uaddr;
restart->futex.val = val;
restart->futex.time = abs_time->tv64;
restart->futex.bitset = bitset;
- restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
- if (fshared)
- restart->futex.flags |= FLAGS_SHARED;
- if (clockrt)
- restart->futex.flags |= FLAGS_CLOCKRT;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
ret = -ERESTART_RESTARTBLOCK;
-out_put_key:
- put_futex_key(fshared, &q.key);
out:
if (to) {
hrtimer_cancel(&to->timer);
static long futex_wait_restart(struct restart_block *restart)
{
- u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
- int fshared = 0;
+ u32 __user *uaddr = restart->futex.uaddr;
ktime_t t, *tp = NULL;
if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
tp = &t;
}
restart->fn = do_no_restart_syscall;
- if (restart->futex.flags & FLAGS_SHARED)
- fshared = 1;
- return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
- restart->futex.bitset,
- restart->futex.flags & FLAGS_CLOCKRT);
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
}
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
- int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+ ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int res, ret;
if (refill_pi_state_cache())
hrtimer_set_expires(&to->timer, *time);
}
- q.pi_state = NULL;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
retry:
- q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
* exit to complete.
*/
queue_unlock(&q, hb);
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
cond_resched();
goto retry;
default:
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr, fshared, &q, !ret);
+ res = fixup_owner(uaddr, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it acquired
* the lock, clear our -ETIMEDOUT or -EINTR.
queue_unlock(&q, hb);
out_put_key:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out:
if (to)
destroy_hrtimer_on_stack(&to->timer);
if (ret)
goto out_put_key;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
goto retry;
}
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- u32 uval;
struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
+ u32 uval, vpid = task_pid_vnr(current);
int ret;
retry:
/*
* We release only a lock we actually own:
*/
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+ if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
* again. If it succeeds then we can return without waking
* anyone else up:
*/
- if (!(uval & FUTEX_OWNER_DIED))
- uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
-
-
- if (unlikely(uval == -EFAULT))
+ if (!(uval & FUTEX_OWNER_DIED) &&
+ cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
goto pi_faulted;
/*
* Rare case: we managed to release the lock atomically,
* no need to wake anyone else up:
*/
- if (unlikely(uval == task_pid_vnr(current)))
+ if (unlikely(uval == vpid))
goto out_unlock;
/*
out_unlock:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
out:
return ret;
pi_faulted:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
if (!ret)
* We were woken prior to requeue by a timeout or a signal.
* Unqueue the futex_q and determine which it was.
*/
- plist_del(&q->list, &q->list.plist);
+ plist_del(&q->list, &hb->chain);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
/**
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initially wait on (non-pi)
- * @fshared: whether the futexes are shared (1) or not (0). They must be
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
* the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* 0 - On success
* <0 - On error
*/
-static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 val, ktime_t *abs_time, u32 bitset,
- int clockrt, u32 __user *uaddr2)
+ u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
struct rt_mutex *pi_mutex = NULL;
struct futex_hash_bucket *hb;
- union futex_key key2;
- struct futex_q q;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
int res, ret;
if (!bitset)
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
debug_rt_mutex_init_waiter(&rt_waiter);
rt_waiter.task = NULL;
- key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
- q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = &rt_waiter;
q.requeue_pi_key = &key2;
- /* Prepare to wait on uaddr. */
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ /*
+ * Prepare to wait on uaddr. On success, increments q.key (key1) ref
+ * count.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out_key2;
* In order for us to be here, we know our q.key == key2, and since
* we took the hb->lock above, we also know that futex_requeue() has
* completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquition by the requeue code.
+ * race with the atomic proxy lock acquisition by the requeue code. The
+ * futex_requeue dropped our key1 reference and incremented our key2
+ * reference count.
*/
/* Check if the requeue code acquired the second futex for us. */
*/
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
- ret = fixup_pi_state_owner(uaddr2, &q, current,
- fshared);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
spin_unlock(q.lock_ptr);
}
} else {
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr2, fshared, &q, !ret);
+ res = fixup_owner(uaddr2, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it
* acquired the lock, clear -ETIMEDOUT or -EINTR.
}
out_put_keys:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out_key2:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out:
if (to) {
{
struct robust_list_head __user *head;
unsigned long ret;
- const struct cred *cred = current_cred(), *pcred;
+ struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
+ rcu_read_lock();
+
+ ret = -ESRCH;
if (!pid)
- head = current->robust_list;
+ p = current;
else {
- struct task_struct *p;
-
- ret = -ESRCH;
- rcu_read_lock();
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
- ret = -EPERM;
- pcred = __task_cred(p);
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid &&
- !capable(CAP_SYS_PTRACE))
- goto err_unlock;
- head = p->robust_list;
- rcu_read_unlock();
}
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
*/
int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
- u32 uval, nval, mval;
+ u32 uval, uninitialized_var(nval), mval;
retry:
if (get_user(uval, uaddr))
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
- nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
- if (nval == -EFAULT)
- return -1;
-
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+ }
if (nval != uval)
goto retry;
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int uninitialized_var(next_pi);
unsigned long futex_offset;
int rc;
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int clockrt, ret = -ENOSYS;
- int cmd = op & FUTEX_CMD_MASK;
- int fshared = 0;
+ int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+ unsigned int flags = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
- fshared = 1;
+ flags |= FLAGS_SHARED;
- clockrt = op & FUTEX_CLOCK_REALTIME;
- if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
- return -ENOSYS;
+ if (op & FUTEX_CLOCK_REALTIME) {
+ flags |= FLAGS_CLOCKRT;
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_UNLOCK_PI:
+ case FUTEX_TRYLOCK_PI:
+ case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_CMP_REQUEUE_PI:
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+ }
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAIT_BITSET:
- ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
+ ret = futex_wait(uaddr, flags, val, timeout, val3);
break;
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAKE_BITSET:
- ret = futex_wake(uaddr, fshared, val, val3);
+ ret = futex_wake(uaddr, flags, val, val3);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 0);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
break;
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
+ ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
+ ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_unlock_pi(uaddr, fshared);
+ ret = futex_unlock_pi(uaddr, flags);
break;
case FUTEX_TRYLOCK_PI:
- if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+ ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
break;
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
- ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
- clockrt, uaddr2);
+ ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
break;
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 1);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
break;
default:
ret = -ENOSYS;
* of the complex code paths. Also we want to prevent
* registration of robust lists in that case. NULL is
* guaranteed to fault and we get -EFAULT on functional
- * implementation, the non functional ones will return
+ * implementation, the non-functional ones will return
* -ENOSYS.
*/
- curval = cmpxchg_futex_value_locked(NULL, 0, 0);
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
+ plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}