[v2,3/8] futex: make PI support optional

Message ID	20170606232450.30278-4-nicolas.pitre@linaro.org
State	New
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: best guess record for domain of linux-kernel-owner@vger.kernel.org designates 209.132.180.67 as permitted sender) client-ip=209.132.180.67; From: Nicolas Pitre <nicolas.pitre@linaro.org> To: Ingo Molnar <mingo@redhat.com>, Peter Zijlstra <peterz@infradead.org> Cc: linux-kernel@vger.kernel.org Subject: [PATCH v2 3/8] futex: make PI support optional Date: Tue, 6 Jun 2017 19:24:45 -0400 Message-Id: <20170606232450.30278-4-nicolas.pitre@linaro.org> In-Reply-To: <20170606232450.30278-1-nicolas.pitre@linaro.org> References: <20170606232450.30278-1-nicolas.pitre@linaro.org> Sender: linux-kernel-owner@vger.kernel.org Precedence: bulk
Series	scheduler tinification \| expand [v2,0/8] scheduler tinification [v2,1/8] cpuset/sched: cpuset makes sense for SMP only [v2,2/8] sched: omit stop_sched_class when !SMP [v2,3/8] futex: make PI support optional [v2,4/8] sched/deadline: move dl related code out of sched/core.c [v2,5/8] sched/rt: move rt related code out of sched/core.c [v2,6/8] sched/deadline: make it configurable [v2,7/8] rtmutex: compatibility wrappers when no RT support is configured [v2,8/8] sched/rt: make it configurable

diff --git a/include/linux/futex.h b/include/linux/futex.h index 7c5b694864..f36bfd26f9 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -54,7 +54,6 @@ union futex_key { #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); -extern void exit_pi_state_list(struct task_struct *curr); #ifdef CONFIG_HAVE_FUTEX_CMPXCHG #define futex_cmpxchg_enabled 1 #else @@ -64,8 +63,14 @@ extern int futex_cmpxchg_enabled; static inline void exit_robust_list(struct task_struct *curr) { } +#endif + +#ifdef CONFIG_FUTEX_PI +extern void exit_pi_state_list(struct task_struct *curr); +#else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif + #endif diff --git a/init/Kconfig b/init/Kconfig index b9aed60cac..ad91724f75 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1624,12 +1624,17 @@ config BASE_FULL config FUTEX bool "Enable futex support" if EXPERT default y - select RT_MUTEXES + imply RT_MUTEXES help Disabling this option will cause the kernel to be built without support for "fast userspace mutexes". The resulting kernel may not run glibc-based applications correctly. +config FUTEX_PI + bool + depends on FUTEX && RT_MUTEXES + default y + config HAVE_FUTEX_CMPXCHG bool depends on FUTEX diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf..c82ea0098f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -12,17 +12,9 @@ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * - * PI-futex support started by Ingo Molnar and Thomas Gleixner - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> - * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * - * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> - * Copyright (C) IBM Corporation, 2009 - * Thanks to Thomas Gleixner for conceptual design and careful reviews. - * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -70,8 +62,6 @@ #include <asm/futex.h> -#include "locking/rtmutex_common.h" - /* * READ this before attempting to hack on futexes! * @@ -193,26 +183,7 @@ int __read_mostly futex_cmpxchg_enabled; #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 -/* - * Priority Inheritance state: - */ -struct futex_pi_state { - /* - * list of 'owned' pi_state instances - these have to be - * cleaned up in do_exit() if the task exits prematurely: - */ - struct list_head list; - - /* - * The PI object: - */ - struct rt_mutex pi_mutex; - - struct task_struct *owner; - atomic_t refcount; - - union futex_key key; -}; +struct futex_pi_state; /** * struct futex_q - The hashed futex queue entry, one per waiting task @@ -733,25 +704,6 @@ static int fault_in_user_writeable(u32 __user *uaddr) return ret < 0 ? ret : 0; } -/** - * futex_top_waiter() - Return the highest priority waiter on a futex - * @hb: the hash bucket the futex_q's reside in - * @key: the futex key (to distinguish it from other futex futex_q's) - * - * Must be called with the hb lock held. - */ -static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, - union futex_key *key) -{ - struct futex_q *this; - - plist_for_each_entry(this, &hb->chain, list) { - if (match_futex(&this->key, key)) - return this; - } - return NULL; -} - static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) { @@ -779,1114 +731,395 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) /* * PI code: */ -static int refill_pi_state_cache(void) -{ - struct futex_pi_state *pi_state; - - if (likely(current->pi_state_cache)) - return 0; - - pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); - - if (!pi_state) - return -ENOMEM; - - INIT_LIST_HEAD(&pi_state->list); - /* pi_mutex gets initialized later */ - pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); - pi_state->key = FUTEX_KEY_INIT; - - current->pi_state_cache = pi_state; +#ifdef CONFIG_FUTEX_PI +#include "futex_pi.c" +#else +#define get_pi_state(...) +#define put_pi_state(...) +#define refill_pi_state_cache() false +#define lookup_pi_state(...) -ENOSYS +#define rt_mutex_start_proxy_lock(...) -ENOSYS +#define requeue_pi_wake_futex(...) +#define futex_proxy_trylock_atomic(...) -ENOSYS +#define futex_lock_pi(...) -ENOSYS +#define futex_unlock_pi(...) -ENOSYS +#define futex_wait_requeue_pi(...) -ENOSYS +#endif - return 0; -} -static struct futex_pi_state *alloc_pi_state(void) +/** + * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +static void __unqueue_futex(struct futex_q *q) { - struct futex_pi_state *pi_state = current->pi_state_cache; - - WARN_ON(!pi_state); - current->pi_state_cache = NULL; + struct futex_hash_bucket *hb; - return pi_state; -} + if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) + || WARN_ON(plist_node_empty(&q->list))) + return; -static void get_pi_state(struct futex_pi_state *pi_state) -{ - WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); } /* - * Drops a reference to the pi_state object and frees or caches it - * when the last reference is gone. - * - * Must be called with the hb lock held. + * The hash bucket lock must be held when this is called. + * Afterwards, the futex_q must not be accessed. Callers + * must ensure to later call wake_up_q() for the actual + * wakeups to occur. */ -static void put_pi_state(struct futex_pi_state *pi_state) +static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) { - if (!pi_state) - return; + struct task_struct *p = q->task; - if (!atomic_dec_and_test(&pi_state->refcount)) + if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return; /* - * If pi_state->owner is NULL, the owner is most probably dying - * and has cleaned up the pi_state already + * Queue the task for later wakeup for after we've released + * the hb->lock. wake_q_add() grabs reference to p. */ - if (pi_state->owner) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); - - rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); - } - - if (current->pi_state_cache) - kfree(pi_state); - else { - /* - * pi_state->list is already empty. - * clear pi_state->owner. - * refcount is at 0 - put it back to 1. - */ - pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); - current->pi_state_cache = pi_state; - } + wake_q_add(wake_q, p); + __unqueue_futex(q); + /* + * The waiting task can free the futex_q as soon as q->lock_ptr = NULL + * is written, without taking any locks. This is possible in the event + * of a spurious wakeup, for example. A memory barrier is required here + * to prevent the following store to lock_ptr from getting ahead of the + * plist_del in __unqueue_futex(). + */ + smp_store_release(&q->lock_ptr, NULL); } /* - * Look up the task based on what TID userspace gave us. - * We dont trust it. + * Express the locking dependencies for lockdep: */ -static struct task_struct *futex_find_get_task(pid_t pid) +static inline void +double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { - struct task_struct *p; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) - get_task_struct(p); - - rcu_read_unlock(); + if (hb1 <= hb2) { + spin_lock(&hb1->lock); + if (hb1 < hb2) + spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); + } else { /* hb1 > hb2 */ + spin_lock(&hb2->lock); + spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); + } +} - return p; +static inline void +double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +{ + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); } /* - * This task is holding PI mutexes at exit time => bad. - * Kernel cleans up PI-state, but userspace is likely hosed. - * (Robust-futex cleanup is separate and might save the day for userspace.) + * Wake up waiters matching bitset queued on this futex (uaddr). */ -void exit_pi_state_list(struct task_struct *curr) +static int +futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { - struct list_head *next, *head = &curr->pi_state_list; - struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; + struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; + int ret; + DEFINE_WAKE_Q(wake_q); - if (!futex_cmpxchg_enabled) - return; - /* - * We are a ZOMBIE and nobody can enqueue itself on - * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselves: - */ - raw_spin_lock_irq(&curr->pi_lock); - while (!list_empty(head)) { + if (!bitset) + return -EINVAL; - next = head->next; - pi_state = list_entry(next, struct futex_pi_state, list); - key = pi_state->key; - hb = hash_futex(&key); - raw_spin_unlock_irq(&curr->pi_lock); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); + if (unlikely(ret != 0)) + goto out; - spin_lock(&hb->lock); + hb = hash_futex(&key); - raw_spin_lock_irq(&curr->pi_lock); - /* - * We dropped the pi-lock, so re-check whether this - * task still owns the PI-state: - */ - if (head->next != next) { - spin_unlock(&hb->lock); - continue; - } + /* Make sure we really have tasks to wakeup */ + if (!hb_waiters_pending(hb)) + goto out_put_key; - WARN_ON(pi_state->owner != curr); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - pi_state->owner = NULL; - raw_spin_unlock_irq(&curr->pi_lock); + spin_lock(&hb->lock); - get_pi_state(pi_state); - spin_unlock(&hb->lock); + plist_for_each_entry_safe(this, next, &hb->chain, list) { + if (match_futex (&this->key, &key)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + break; + } - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); + /* Check if one of the bits is set in both bitsets */ + if (!(this->bitset & bitset)) + continue; - raw_spin_lock_irq(&curr->pi_lock); + mark_wake_futex(&wake_q, this); + if (++ret >= nr_wake) + break; + } } - raw_spin_unlock_irq(&curr->pi_lock); -} -/* - * We need to check the following states: - * - * Waiter | pi_state | pi->owner | uTID | uODIED | ? - * - * [1] NULL | --- | --- | 0 | 0/1 | Valid - * [2] NULL | --- | --- | >0 | 0/1 | Valid - * - * [3] Found | NULL | -- | Any | 0/1 | Invalid - * - * [4] Found | Found | NULL | 0 | 1 | Valid - * [5] Found | Found | NULL | >0 | 1 | Invalid - * - * [6] Found | Found | task | 0 | 1 | Valid - * - * [7] Found | Found | NULL | Any | 0 | Invalid - * - * [8] Found | Found | task | ==taskTID | 0/1 | Valid - * [9] Found | Found | task | 0 | 0 | Invalid - * [10] Found | Found | task | !=taskTID | 0/1 | Invalid - * - * [1] Indicates that the kernel can acquire the futex atomically. We - * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. - * - * [2] Valid, if TID does not belong to a kernel thread. If no matching - * thread is found then it indicates that the owner TID has died. - * - * [3] Invalid. The waiter is queued on a non PI futex - * - * [4] Valid state after exit_robust_list(), which sets the user space - * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. - * - * [5] The user space value got manipulated between exit_robust_list() - * and exit_pi_state_list() - * - * [6] Valid state after exit_pi_state_list() which sets the new owner in - * the pi_state but cannot access the user space value. - * - * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. - * - * [8] Owner and user space value match - * - * [9] There is no transient state which sets the user space TID to 0 - * except exit_robust_list(), but this is indicated by the - * FUTEX_OWNER_DIED bit. See [4] - * - * [10] There is no transient state which leaves owner and user space - * TID out of sync. - * - * - * Serialization and lifetime rules: - * - * hb->lock: - * - * hb -> futex_q, relation - * futex_q -> pi_state, relation - * - * (cannot be raw because hb can contain arbitrary amount - * of futex_q's) - * - * pi_mutex->wait_lock: - * - * {uval, pi_state} - * - * (and pi_mutex 'obviously') - * - * p->pi_lock: - * - * p->pi_state_list -> pi_state->list, relation - * - * pi_state->refcount: - * - * pi_state lifetime - * - * - * Lock order: - * - * hb->lock - * pi_mutex->wait_lock - * p->pi_lock - * - */ + spin_unlock(&hb->lock); + wake_up_q(&wake_q); +out_put_key: + put_futex_key(&key); +out: + return ret; +} /* - * Validate that the existing waiter has a pi_state and sanity check - * the pi_state against the user space value. If correct, attach to - * it. + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: */ -static int attach_to_pi_state(u32 __user *uaddr, u32 uval, - struct futex_pi_state *pi_state, - struct futex_pi_state **ps) +static int +futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, + int nr_wake, int nr_wake2, int op) { - pid_t pid = uval & FUTEX_TID_MASK; - u32 uval2; - int ret; - - /* - * Userspace might have messed up non-PI and PI futexes [3] - */ - if (unlikely(!pi_state)) - return -EINVAL; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + struct futex_hash_bucket *hb1, *hb2; + struct futex_q *this, *next; + int ret, op_ret; + DEFINE_WAKE_Q(wake_q); - /* - * We get here with hb->lock held, and having found a - * futex_top_waiter(). This means that futex_lock_pi() of said futex_q - * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), - * which in turn means that futex_lock_pi() still has a reference on - * our pi_state. - * - * The waiter holding a reference on @pi_state also protects against - * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() - * and futex_wait_requeue_pi() as it cannot go to 0 and consequently - * free pi_state before we can take a reference ourselves. - */ - WARN_ON(!atomic_read(&pi_state->refcount)); +retry: + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); + if (unlikely(ret != 0)) + goto out_put_key1; - /* - * Now that we have a pi_state, we can acquire wait_lock - * and do the state validation. - */ - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); - /* - * Since {uval, pi_state} is serialized by wait_lock, and our current - * uval was read without holding it, it can have changed. Verify it - * still is what we expect it to be, otherwise retry the entire - * operation. - */ - if (get_futex_value_locked(&uval2, uaddr)) - goto out_efault; +retry_private: + double_lock_hb(hb1, hb2); + op_ret = futex_atomic_op_inuser(op, uaddr2); + if (unlikely(op_ret < 0)) { - if (uval != uval2) - goto out_eagain; + double_unlock_hb(hb1, hb2); - /* - * Handle the owner died case: - */ - if (uval & FUTEX_OWNER_DIED) { +#ifndef CONFIG_MMU /* - * exit_pi_state_list sets owner to NULL and wakes the - * topmost waiter. The task which acquires the - * pi_state->rt_mutex will fixup owner. + * we don't get EFAULT from MMU faults if we don't have an MMU, + * but we might get them from range checking */ - if (!pi_state->owner) { - /* - * No pi state owner, but the user space TID - * is not 0. Inconsistent state. [5] - */ - if (pid) - goto out_einval; - /* - * Take a ref on the state and return success. [4] - */ - goto out_attach; - } + ret = op_ret; + goto out_put_keys; +#endif - /* - * If TID is 0, then either the dying owner has not - * yet executed exit_pi_state_list() or some waiter - * acquired the rtmutex in the pi state, but did not - * yet fixup the TID in user space. - * - * Take a ref on the state and return success. [6] - */ - if (!pid) - goto out_attach; - } else { - /* - * If the owner died bit is not set, then the pi_state - * must have an owner. [7] - */ - if (!pi_state->owner) - goto out_einval; - } + if (unlikely(op_ret != -EFAULT)) { + ret = op_ret; + goto out_put_keys; + } - /* - * Bail out if user space manipulated the futex value. If pi - * state exists then the owner TID must be the same as the - * user space TID. [9/10] - */ - if (pid != task_pid_vnr(pi_state->owner)) - goto out_einval; + ret = fault_in_user_writeable(uaddr2); + if (ret) + goto out_put_keys; -out_attach: - get_pi_state(pi_state); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - *ps = pi_state; - return 0; + if (!(flags & FLAGS_SHARED)) + goto retry_private; -out_einval: - ret = -EINVAL; - goto out_error; + put_futex_key(&key2); + put_futex_key(&key1); + goto retry; + } -out_eagain: - ret = -EAGAIN; - goto out_error; + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (match_futex (&this->key, &key1)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } + mark_wake_futex(&wake_q, this); + if (++ret >= nr_wake) + break; + } + } -out_efault: - ret = -EFAULT; - goto out_error; + if (op_ret > 0) { + op_ret = 0; + plist_for_each_entry_safe(this, next, &hb2->chain, list) { + if (match_futex (&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } + mark_wake_futex(&wake_q, this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } -out_error: - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); +out_unlock: + double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); +out_put_keys: + put_futex_key(&key2); +out_put_key1: + put_futex_key(&key1); +out: return ret; } -/* - * Lookup the task for the TID provided from user space and attach to - * it after doing proper sanity checks. +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q */ -static int attach_to_pi_owner(u32 uval, union futex_key *key, - struct futex_pi_state **ps) +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) { - pid_t pid = uval & FUTEX_TID_MASK; - struct futex_pi_state *pi_state; - struct task_struct *p; - - /* - * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when TID = 0 [1] - */ - if (!pid) - return -ESRCH; - p = futex_find_get_task(pid); - if (!p) - return -ESRCH; - - if (unlikely(p->flags & PF_KTHREAD)) { - put_task_struct(p); - return -EPERM; - } /* - * We need to look at the task state flags to figure out, - * whether the task is exiting. To protect against the do_exit - * change of the task flags, we do this protected by - * p->pi_lock: + * If key1 and key2 hash to the same bucket, no need to + * requeue. */ - raw_spin_lock_irq(&p->pi_lock); - if (unlikely(p->flags & PF_EXITING)) { - /* - * The task is on the way out. When PF_EXITPIDONE is - * set, we know that the task has finished the - * cleanup: - */ - int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; - - raw_spin_unlock_irq(&p->pi_lock); - put_task_struct(p); - return ret; + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + hb_waiters_dec(hb1); + hb_waiters_inc(hb2); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; } - - /* - * No existing pi state. First waiter. [2] - * - * This creates pi_state, we have hb->lock held, this means nothing can - * observe this state, wait_lock is irrelevant. - */ - pi_state = alloc_pi_state(); - - /* - * Initialize the pi_mutex in locked state and make @p - * the owner of it: - */ - rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); - - /* Store the key for possible exit cleanups: */ - pi_state->key = *key; - - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &p->pi_state_list); - pi_state->owner = p; - raw_spin_unlock_irq(&p->pi_lock); - - put_task_struct(p); - - *ps = pi_state; - - return 0; -} - -static int lookup_pi_state(u32 __user *uaddr, u32 uval, - struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) -{ - struct futex_q *top_waiter = futex_top_waiter(hb, key); - - /* - * If there is a waiter on that futex, validate it and - * attach to the pi_state when the validation succeeds. - */ - if (top_waiter) - return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); - - /* - * We are the first waiter - try to look up the owner based on - * @uval and attach to it. - */ - return attach_to_pi_owner(uval, key, ps); -} - -static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) -{ - u32 uninitialized_var(curval); - - if (unlikely(should_fail_futex(true))) - return -EFAULT; - - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; - - /* If user space value changed, let the caller retry */ - return curval != uval ? -EAGAIN : 0; + get_futex_key_refs(key2); + q->key = *key2; } /** - * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex - * @uaddr: the pi futex user address - * @hb: the pi futex hash bucket - * @key: the futex key associated with uaddr and hb - * @ps: the pi_state pointer where we store the result of the - * lookup - * @task: the task to perform the atomic lock work for. This will - * be "current" except in the case of requeue pi. - * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * @uaddr1: source futex user address + * @flags: futex flags (FLAGS_SHARED, etc.) + * @uaddr2: target futex user address + * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * @nr_requeue: number of waiters to requeue (0-INT_MAX) + * @cmpval: @uaddr1 expected value (or %NULL) + * @requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) * - * Return: - * 0 - ready to wait; - * 1 - acquired the lock; - * <0 - error + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. * - * The hb->lock and futex_key refs shall be held by the caller. + * Return: + * >=0 - on success, the number of tasks requeued or woken; + * <0 - on error */ -static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, - union futex_key *key, - struct futex_pi_state **ps, - struct task_struct *task, int set_waiters) +static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + u32 __user *uaddr2, int nr_wake, int nr_requeue, + u32 *cmpval, int requeue_pi) { - u32 uval, newval, vpid = task_pid_vnr(task); - struct futex_q *top_waiter; - int ret; - - /* - * Read the user space value first so we can validate a few - * things before proceeding further. - */ - if (get_futex_value_locked(&uval, uaddr)) - return -EFAULT; - - if (unlikely(should_fail_futex(true))) - return -EFAULT; - - /* - * Detect deadlocks. - */ - if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) - return -EDEADLK; - - if ((unlikely(should_fail_futex(true)))) - return -EDEADLK; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int drop_count = 0, task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; + struct futex_hash_bucket *hb1, *hb2; + struct futex_q *this, *next; + DEFINE_WAKE_Q(wake_q); /* - * Lookup existing state first. If it exists, try to attach to - * its pi_state. + * When PI not supported, return -ENOSYS if requeue_pi is true, + * otherwise this will let the compiler assume requeue_pi is false + * which shoud optimize away all the conditional code further down. */ - top_waiter = futex_top_waiter(hb, key); - if (top_waiter) - return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); + if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) + return -ENOSYS; - /* - * No waiter and user TID is 0. We are here because the - * waiters or the owner died bit is set or called from - * requeue_cmp_pi or for whatever reason something took the - * syscall. - */ - if (!(uval & FUTEX_TID_MASK)) { + if (requeue_pi) { /* - * We take over the futex. No other waiters and the user space - * TID is 0. We preserve the owner died bit. + * Requeue PI only works on two distinct uaddrs. This + * check is only valid for private futexes. See below. */ - newval = uval & FUTEX_OWNER_DIED; - newval |= vpid; - - /* The futex requeue_pi code can enforce the waiters bit */ - if (set_waiters) - newval |= FUTEX_WAITERS; - - ret = lock_pi_update_atomic(uaddr, uval, newval); - /* If the take over worked, return 1 */ - return ret < 0 ? ret : 1; - } - - /* - * First waiter. Set the waiters bit before attaching ourself to - * the owner. If owner tries to unlock, it will be forced into - * the kernel and blocked on hb->lock. - */ - newval = uval | FUTEX_WAITERS; - ret = lock_pi_update_atomic(uaddr, uval, newval); - if (ret) - return ret; - /* - * If the update of the user space value succeeded, we try to - * attach to the owner. If that fails, no harm done, we only - * set the FUTEX_WAITERS bit in the user space variable. - */ - return attach_to_pi_owner(uval, key, ps); -} - -/** - * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket - * @q: The futex_q to unqueue - * - * The q->lock_ptr must not be NULL and must be held by the caller. - */ -static void __unqueue_futex(struct futex_q *q) -{ - struct futex_hash_bucket *hb; - - if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) - || WARN_ON(plist_node_empty(&q->list))) - return; - - hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); - plist_del(&q->list, &hb->chain); - hb_waiters_dec(hb); -} - -/* - * The hash bucket lock must be held when this is called. - * Afterwards, the futex_q must not be accessed. Callers - * must ensure to later call wake_up_q() for the actual - * wakeups to occur. - */ -static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) -{ - struct task_struct *p = q->task; - - if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) - return; - - /* - * Queue the task for later wakeup for after we've released - * the hb->lock. wake_q_add() grabs reference to p. - */ - wake_q_add(wake_q, p); - __unqueue_futex(q); - /* - * The waiting task can free the futex_q as soon as q->lock_ptr = NULL - * is written, without taking any locks. This is possible in the event - * of a spurious wakeup, for example. A memory barrier is required here - * to prevent the following store to lock_ptr from getting ahead of the - * plist_del in __unqueue_futex(). - */ - smp_store_release(&q->lock_ptr, NULL); -} - -/* - * Caller must hold a reference on @pi_state. - */ -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) -{ - u32 uninitialized_var(curval), newval; - struct task_struct *new_owner; - bool postunlock = false; - DEFINE_WAKE_Q(wake_q); - int ret = 0; + if (uaddr1 == uaddr2) + return -EINVAL; - new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); - if (WARN_ON_ONCE(!new_owner)) { /* - * As per the comment in futex_unlock_pi() this should not happen. - * - * When this happens, give up our locks and try again, giving - * the futex_lock_pi() instance time to complete, either by - * waiting on the rtmutex or removing itself from the futex - * queue. + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. */ - ret = -EAGAIN; - goto out_unlock; - } - - /* - * We pass it to the next owner. The WAITERS bit is always kept - * enabled while there is PI state around. We cleanup the owner - * died bit, because we are the owner. - */ - newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - - if (unlikely(should_fail_futex(true))) - ret = -EFAULT; - - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { - ret = -EFAULT; - - } else if (curval != uval) { + if (refill_pi_state_cache()) + return -ENOMEM; /* - * If a unconditional UNLOCK_PI operation (user space did not - * try the TID->0 transition) raced with a waiter setting the - * FUTEX_WAITERS flag between get_user() and locking the hash - * bucket lock, retry the operation. + * requeue_pi must wake as many tasks as it can, up to nr_wake + * + nr_requeue, since it acquires the rt_mutex prior to + * returning to userspace, so as to not leave the rt_mutex with + * waiters and no owner. However, second and third wake-ups + * cannot be predicted as they involve race conditions with the + * first wake and a fault while looking up the pi_state. Both + * pthread_cond_signal() and pthread_cond_broadcast() should + * use nr_wake=1. */ - if ((FUTEX_TID_MASK & curval) == uval) - ret = -EAGAIN; - else - ret = -EINVAL; + if (nr_wake != 1) + return -EINVAL; } - if (ret) - goto out_unlock; +retry: + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + requeue_pi ? VERIFY_WRITE : VERIFY_READ); + if (unlikely(ret != 0)) + goto out_put_key1; /* - * This is a point of no return; once we modify the uval there is no - * going back and subsequent operations must not fail. + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: */ + if (requeue_pi && match_futex(&key1, &key2)) { + ret = -EINVAL; + goto out_put_keys; + } - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); - raw_spin_lock(&new_owner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &new_owner->pi_state_list); - pi_state->owner = new_owner; - raw_spin_unlock(&new_owner->pi_lock); +retry_private: + hb_waiters_inc(hb2); + double_lock_hb(hb1, hb2); - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + if (likely(cmpval != NULL)) { + u32 curval; -out_unlock: - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + ret = get_futex_value_locked(&curval, uaddr1); - if (postunlock) - rt_mutex_postunlock(&wake_q); + if (unlikely(ret)) { + double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); - return ret; -} - -/* - * Express the locking dependencies for lockdep: - */ -static inline void -double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -{ - if (hb1 <= hb2) { - spin_lock(&hb1->lock); - if (hb1 < hb2) - spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); - } else { /* hb1 > hb2 */ - spin_lock(&hb2->lock); - spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); - } -} - -static inline void -double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -{ - spin_unlock(&hb1->lock); - if (hb1 != hb2) - spin_unlock(&hb2->lock); -} - -/* - * Wake up waiters matching bitset queued on this futex (uaddr). - */ -static int -futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) -{ - struct futex_hash_bucket *hb; - struct futex_q *this, *next; - union futex_key key = FUTEX_KEY_INIT; - int ret; - DEFINE_WAKE_Q(wake_q); - - if (!bitset) - return -EINVAL; - - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); - if (unlikely(ret != 0)) - goto out; - - hb = hash_futex(&key); - - /* Make sure we really have tasks to wakeup */ - if (!hb_waiters_pending(hb)) - goto out_put_key; - - spin_lock(&hb->lock); - - plist_for_each_entry_safe(this, next, &hb->chain, list) { - if (match_futex (&this->key, &key)) { - if (this->pi_state || this->rt_waiter) { - ret = -EINVAL; - break; - } + ret = get_user(curval, uaddr1); + if (ret) + goto out_put_keys; - /* Check if one of the bits is set in both bitsets */ - if (!(this->bitset & bitset)) - continue; + if (!(flags & FLAGS_SHARED)) + goto retry_private; - mark_wake_futex(&wake_q, this); - if (++ret >= nr_wake) - break; + put_futex_key(&key2); + put_futex_key(&key1); + goto retry; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; } } - spin_unlock(&hb->lock); - wake_up_q(&wake_q); -out_put_key: - put_futex_key(&key); -out: - return ret; -} - -/* - * Wake up all waiters hashed on the physical page that is mapped - * to this virtual address: - */ -static int -futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, - int nr_wake, int nr_wake2, int op) -{ - union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb1, *hb2; - struct futex_q *this, *next; - int ret, op_ret; - DEFINE_WAKE_Q(wake_q); - -retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); - if (unlikely(ret != 0)) - goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out_put_key1; - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); - -retry_private: - double_lock_hb(hb1, hb2); - op_ret = futex_atomic_op_inuser(op, uaddr2); - if (unlikely(op_ret < 0)) { - - double_unlock_hb(hb1, hb2); - -#ifndef CONFIG_MMU + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { /* - * we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. */ - ret = op_ret; - goto out_put_keys; -#endif - - if (unlikely(op_ret != -EFAULT)) { - ret = op_ret; - goto out_put_keys; - } - - ret = fault_in_user_writeable(uaddr2); - if (ret) - goto out_put_keys; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - put_futex_key(&key2); - put_futex_key(&key1); - goto retry; - } - - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (match_futex (&this->key, &key1)) { - if (this->pi_state || this->rt_waiter) { - ret = -EINVAL; - goto out_unlock; - } - mark_wake_futex(&wake_q, this); - if (++ret >= nr_wake) - break; - } - } - - if (op_ret > 0) { - op_ret = 0; - plist_for_each_entry_safe(this, next, &hb2->chain, list) { - if (match_futex (&this->key, &key2)) { - if (this->pi_state || this->rt_waiter) { - ret = -EINVAL; - goto out_unlock; - } - mark_wake_futex(&wake_q, this); - if (++op_ret >= nr_wake2) - break; - } - } - ret += op_ret; - } - -out_unlock: - double_unlock_hb(hb1, hb2); - wake_up_q(&wake_q); -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: - return ret; -} - -/** - * requeue_futex() - Requeue a futex_q from one hb to another - * @q: the futex_q to requeue - * @hb1: the source hash_bucket - * @hb2: the target hash_bucket - * @key2: the new key for the requeued futex_q - */ -static inline -void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, union futex_key *key2) -{ - - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(&hb1->chain != &hb2->chain)) { - plist_del(&q->list, &hb1->chain); - hb_waiters_dec(hb1); - hb_waiters_inc(hb2); - plist_add(&q->list, &hb2->chain); - q->lock_ptr = &hb2->lock; - } - get_futex_key_refs(key2); - q->key = *key2; -} - -/** - * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue - * @q: the futex_q - * @key: the key of the requeue target futex - * @hb: the hash_bucket of the requeue target futex - * - * During futex_requeue, with requeue_pi=1, it is possible to acquire the - * target futex if it is uncontended or via a lock steal. Set the futex_q key - * to the requeue target futex so the waiter can detect the wakeup on the right - * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock - * to protect access to the pi_state to fixup the owner later. Must be called - * with both q->lock_ptr and hb->lock held. - */ -static inline -void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, - struct futex_hash_bucket *hb) -{ - get_futex_key_refs(key); - q->key = *key; - - __unqueue_futex(q); - - WARN_ON(!q->rt_waiter); - q->rt_waiter = NULL; - - q->lock_ptr = &hb->lock; - - wake_up_state(q->task, TASK_NORMAL); -} - -/** - * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter - * @pifutex: the user address of the to futex - * @hb1: the from futex hash bucket, must be locked by the caller - * @hb2: the to futex hash bucket, must be locked by the caller - * @key1: the from futex key - * @key2: the to futex key - * @ps: address to store the pi_state pointer - * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) - * - * Try and get the lock on behalf of the top waiter if we can do it atomically. - * Wake the top waiter if we succeed. If the caller specified set_waiters, - * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. - * hb1 and hb2 must be held by the caller. - * - * Return: - * 0 - failed to acquire the lock atomically; - * >0 - acquired the lock, return value is vpid of the top_waiter - * <0 - error - */ -static int futex_proxy_trylock_atomic(u32 __user *pifutex, - struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, - union futex_key *key1, union futex_key *key2, - struct futex_pi_state **ps, int set_waiters) -{ - struct futex_q *top_waiter = NULL; - u32 curval; - int ret, vpid; - - if (get_futex_value_locked(&curval, pifutex)) - return -EFAULT; - - if (unlikely(should_fail_futex(true))) - return -EFAULT; - - /* - * Find the top_waiter and determine if there are additional waiters. - * If the caller intends to requeue more than 1 waiter to pifutex, - * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, - * as we have means to handle the possible fault. If not, don't set - * the bit unecessarily as it will force the subsequent unlock to enter - * the kernel. - */ - top_waiter = futex_top_waiter(hb1, key1); - - /* There are no waiters, nothing for us to do. */ - if (!top_waiter) - return 0; - - /* Ensure we requeue to the expected futex. */ - if (!match_futex(top_waiter->requeue_pi_key, key2)) - return -EINVAL; - - /* - * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in - * the contended case or if set_waiters is 1. The pi_state is returned - * in ps in contended cases. - */ - vpid = task_pid_vnr(top_waiter->task); - ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, - set_waiters); - if (ret == 1) { - requeue_pi_wake_futex(top_waiter, key2, hb2); - return vpid; - } - return ret; -} - -/** - * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 - * @uaddr1: source futex user address - * @flags: futex flags (FLAGS_SHARED, etc.) - * @uaddr2: target futex user address - * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) - * @nr_requeue: number of waiters to requeue (0-INT_MAX) - * @cmpval: @uaddr1 expected value (or %NULL) - * @requeue_pi: if we are attempting to requeue from a non-pi futex to a - * pi futex (pi to pi requeue is not supported) - * - * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire - * uaddr2 atomically on behalf of the top waiter. - * - * Return: - * >=0 - on success, the number of tasks requeued or woken; - * <0 - on error - */ -static int futex_requeue(u32 __user *uaddr1, unsigned int flags, - u32 __user *uaddr2, int nr_wake, int nr_requeue, - u32 *cmpval, int requeue_pi) -{ - union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - int drop_count = 0, task_count = 0, ret; - struct futex_pi_state *pi_state = NULL; - struct futex_hash_bucket *hb1, *hb2; - struct futex_q *this, *next; - DEFINE_WAKE_Q(wake_q); - - if (requeue_pi) { - /* - * Requeue PI only works on two distinct uaddrs. This - * check is only valid for private futexes. See below. - */ - if (uaddr1 == uaddr2) - return -EINVAL; - - /* - * requeue_pi requires a pi_state, try to allocate it now - * without any locks in case it fails. - */ - if (refill_pi_state_cache()) - return -ENOMEM; - /* - * requeue_pi must wake as many tasks as it can, up to nr_wake - * + nr_requeue, since it acquires the rt_mutex prior to - * returning to userspace, so as to not leave the rt_mutex with - * waiters and no owner. However, second and third wake-ups - * cannot be predicted as they involve race conditions with the - * first wake and a fault while looking up the pi_state. Both - * pthread_cond_signal() and pthread_cond_broadcast() should - * use nr_wake=1. - */ - if (nr_wake != 1) - return -EINVAL; - } - -retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); - if (unlikely(ret != 0)) - goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, - requeue_pi ? VERIFY_WRITE : VERIFY_READ); - if (unlikely(ret != 0)) - goto out_put_key1; - - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. We need to compare the keys: - */ - if (requeue_pi && match_futex(&key1, &key2)) { - ret = -EINVAL; - goto out_put_keys; - } - - hb1 = hash_futex(&key1); - hb2 = hash_futex(&key2); - -retry_private: - hb_waiters_inc(hb2); - double_lock_hb(hb1, hb2); - - if (likely(cmpval != NULL)) { - u32 curval; - - ret = get_futex_value_locked(&curval, uaddr1); - - if (unlikely(ret)) { - double_unlock_hb(hb1, hb2); - hb_waiters_dec(hb2); - - ret = get_user(curval, uaddr1); - if (ret) - goto out_put_keys; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - put_futex_key(&key2); - put_futex_key(&key1); - goto retry; - } - if (curval != *cmpval) { - ret = -EAGAIN; - goto out_unlock; - } - } - - if (requeue_pi && (task_count - nr_wake < nr_requeue)) { - /* - * Attempt to acquire uaddr2 and wake the top waiter. If we - * intend to requeue waiters, force setting the FUTEX_WAITERS - * bit. We force this here where we are able to easily handle - * faults rather in the requeue loop below. - */ - ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, nr_requeue); + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, nr_requeue); /* * At this point the top_waiter has either taken uaddr2 or is @@ -1912,1085 +1145,415 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, * * If that call succeeds then we have pi_state and an * initial refcount on it. - */ - ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); - } - - switch (ret) { - case 0: - /* We hold a reference on the pi state. */ - break; - - /* If the above failed, then pi_state is NULL */ - case -EFAULT: - double_unlock_hb(hb1, hb2); - hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); - ret = fault_in_user_writeable(uaddr2); - if (!ret) - goto retry; - goto out; - case -EAGAIN: - /* - * Two reasons for this: - * - Owner is exiting and we just wait for the - * exit to complete. - * - The user space value changed. - */ - double_unlock_hb(hb1, hb2); - hb_waiters_dec(hb2); - put_futex_key(&key2); - put_futex_key(&key1); - cond_resched(); - goto retry; - default: - goto out_unlock; - } - } - - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (task_count - nr_wake >= nr_requeue) - break; - - if (!match_futex(&this->key, &key1)) - continue; - - /* - * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always - * be paired with each other and no other futex ops. - * - * We should never be requeueing a futex_q with a pi_state, - * which is awaiting a futex_unlock_pi(). - */ - if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter) || - this->pi_state) { - ret = -EINVAL; - break; - } - - /* - * Wake nr_wake waiters. For requeue_pi, if we acquired the - * lock, we already woke the top_waiter. If not, it will be - * woken by futex_unlock_pi(). - */ - if (++task_count <= nr_wake && !requeue_pi) { - mark_wake_futex(&wake_q, this); - continue; - } - - /* Ensure we requeue to the expected futex for requeue_pi. */ - if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { - ret = -EINVAL; - break; - } - - /* - * Requeue nr_requeue waiters and possibly one more in the case - * of requeue_pi if we couldn't acquire the lock atomically. - */ - if (requeue_pi) { - /* - * Prepare the waiter to take the rt_mutex. Take a - * refcount on the pi_state and store the pointer in - * the futex_q object of the waiter. - */ - get_pi_state(pi_state); - this->pi_state = pi_state; - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task); - if (ret == 1) { - /* - * We got the lock. We do neither drop the - * refcount on pi_state nor clear - * this->pi_state because the waiter needs the - * pi_state for cleaning up the user space - * value. It will drop the refcount after - * doing so. - */ - requeue_pi_wake_futex(this, &key2, hb2); - drop_count++; - continue; - } else if (ret) { - /* - * rt_mutex_start_proxy_lock() detected a - * potential deadlock when we tried to queue - * that waiter. Drop the pi_state reference - * which we took above and remove the pointer - * to the state from the waiters futex_q - * object. - */ - this->pi_state = NULL; - put_pi_state(pi_state); - /* - * We stop queueing more waiters and let user - * space deal with the mess. - */ - break; - } - } - requeue_futex(this, hb1, hb2, &key2); - drop_count++; - } - - /* - * We took an extra initial reference to the pi_state either - * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We - * need to drop it here again. - */ - put_pi_state(pi_state); - -out_unlock: - double_unlock_hb(hb1, hb2); - wake_up_q(&wake_q); - hb_waiters_dec(hb2); - - /* - * drop_futex_key_refs() must be called outside the spinlocks. During - * the requeue we moved futex_q's from the hash bucket at key1 to the - * one at key2 and updated their key pointer. We no longer need to - * hold the references to key1. - */ - while (--drop_count >= 0) - drop_futex_key_refs(&key1); - -out_put_keys: - put_futex_key(&key2); -out_put_key1: - put_futex_key(&key1); -out: - return ret ? ret : task_count; -} - -/* The key must be already stored in q->key. */ -static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) - __acquires(&hb->lock) -{ - struct futex_hash_bucket *hb; - - hb = hash_futex(&q->key); - - /* - * Increment the counter before taking the lock so that - * a potential waker won't miss a to-be-slept task that is - * waiting for the spinlock. This is safe as all queue_lock() - * users end up calling queue_me(). Similarly, for housekeeping, - * decrement the counter at queue_unlock() when some error has - * occurred and we don't end up adding the task to the list. - */ - hb_waiters_inc(hb); - - q->lock_ptr = &hb->lock; - - spin_lock(&hb->lock); /* implies smp_mb(); (A) */ - return hb; -} - -static inline void -queue_unlock(struct futex_hash_bucket *hb) - __releases(&hb->lock) -{ - spin_unlock(&hb->lock); - hb_waiters_dec(hb); -} - -static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) -{ - int prio; - - /* - * The priority used to register this element is - * - either the real thread-priority for the real-time threads - * (i.e. threads with a priority lower than MAX_RT_PRIO) - * - or MAX_RT_PRIO for non-RT threads. - * Thus, all RT-threads are woken first in priority order, and - * the others are woken last, in FIFO order. - */ - prio = min(current->normal_prio, MAX_RT_PRIO); - - plist_node_init(&q->list, prio); - plist_add(&q->list, &hb->chain); - q->task = current; -} - -/** - * queue_me() - Enqueue the futex_q on the futex_hash_bucket - * @q: The futex_q to enqueue - * @hb: The destination hash bucket - * - * The hb->lock must be held by the caller, and is released here. A call to - * queue_me() is typically paired with exactly one call to unqueue_me(). The - * exceptions involve the PI related operations, which may use unqueue_me_pi() - * or nothing if the unqueue is done as part of the wake process and the unqueue - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for - * an example). - */ -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) -{ - __queue_me(q, hb); - spin_unlock(&hb->lock); -} - -/** - * unqueue_me() - Remove the futex_q from its futex_hash_bucket - * @q: The futex_q to unqueue - * - * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must - * be paired with exactly one earlier call to queue_me(). - * - * Return: - * 1 - if the futex_q was still queued (and we removed unqueued it); - * 0 - if the futex_q was already removed by the waking thread - */ -static int unqueue_me(struct futex_q *q) -{ - spinlock_t *lock_ptr; - int ret = 0; - - /* In the common case we don't take the spinlock, which is nice. */ -retry: - /* - * q->lock_ptr can change between this read and the following spin_lock. - * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and - * optimizing lock_ptr out of the logic below. - */ - lock_ptr = READ_ONCE(q->lock_ptr); - if (lock_ptr != NULL) { - spin_lock(lock_ptr); - /* - * q->lock_ptr can change between reading it and - * spin_lock(), causing us to take the wrong lock. This - * corrects the race condition. - * - * Reasoning goes like this: if we have the wrong lock, - * q->lock_ptr must have changed (maybe several times) - * between reading it and the spin_lock(). It can - * change again after the spin_lock() but only if it was - * already changed before the spin_lock(). It cannot, - * however, change back to the original value. Therefore - * we can detect whether we acquired the correct lock. - */ - if (unlikely(lock_ptr != q->lock_ptr)) { - spin_unlock(lock_ptr); - goto retry; - } - __unqueue_futex(q); - - BUG_ON(q->pi_state); - - spin_unlock(lock_ptr); - ret = 1; - } - - drop_futex_key_refs(&q->key); - return ret; -} - -/* - * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry - * and dropped here. - */ -static void unqueue_me_pi(struct futex_q *q) - __releases(q->lock_ptr) -{ - __unqueue_futex(q); - - BUG_ON(!q->pi_state); - put_pi_state(q->pi_state); - q->pi_state = NULL; - - spin_unlock(q->lock_ptr); -} - -/* - * Fixup the pi_state owner with the new owner. - * - * Must be called with hash bucket lock held and mm->sem held for non - * private futexes. - */ -static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) -{ - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; - struct futex_pi_state *pi_state = q->pi_state; - u32 uval, uninitialized_var(curval), newval; - struct task_struct *oldowner; - int ret; - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - - oldowner = pi_state->owner; - /* Owner died? */ - if (!pi_state->owner) - newtid |= FUTEX_OWNER_DIED; - - /* - * We are here either because we stole the rtmutex from the - * previous highest priority waiter or we are the highest priority - * waiter but have failed to get the rtmutex the first time. - * - * We have to replace the newowner TID in the user space variable. - * This must be atomic as we have to preserve the owner died bit here. - * - * Note: We write the user space value _before_ changing the pi_state - * because we can fault here. Imagine swapped out pages or a fork - * that marked all the anonymous memory readonly for cow. - * - * Modifying pi_state _before_ the user space value would leave the - * pi_state in an inconsistent state when we fault here, because we - * need to drop the locks to handle the fault. This might be observed - * in the PID check in lookup_pi_state. - */ -retry: - if (get_futex_value_locked(&uval, uaddr)) - goto handle_fault; - - for (;;) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; - - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - goto handle_fault; - if (curval == uval) - break; - uval = curval; - } - - /* - * We fixed up user space. Now we need to fix the pi_state - * itself. - */ - if (pi_state->owner != NULL) { - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); - } - - pi_state->owner = newowner; - - raw_spin_lock(&newowner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock(&newowner->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - - return 0; - - /* - * To handle the page fault we need to drop the locks here. That gives - * the other task (either the highest priority waiter itself or the - * task which stole the rtmutex) the chance to try the fixup of the - * pi_state. So once we are back from handling the fault we need to - * check the pi_state after reacquiring the locks and before trying to - * do another fixup. When the fixup has been done already we simply - * return. - * - * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely - * drop hb->lock since the caller owns the hb -> futex_q relation. - * Dropping the pi_mutex->wait_lock requires the state revalidate. - */ -handle_fault: - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(q->lock_ptr); - - ret = fault_in_user_writeable(uaddr); - - spin_lock(q->lock_ptr); - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - - /* - * Check if someone else fixed it for us: - */ - if (pi_state->owner != oldowner) { - ret = 0; - goto out_unlock; - } - - if (ret) - goto out_unlock; - - goto retry; - -out_unlock: - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return ret; -} - -static long futex_wait_restart(struct restart_block *restart); - -/** - * fixup_owner() - Post lock pi_state and corner case management - * @uaddr: user address of the futex - * @q: futex_q (contains pi_state and access to the rt_mutex) - * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) - * - * After attempting to lock an rt_mutex, this function is called to cleanup - * the pi_state owner as well as handle race conditions that may allow us to - * acquire the lock. Must be called with the hb lock held. - * - * Return: - * 1 - success, lock taken; - * 0 - success, lock not taken; - * <0 - on error (-EFAULT) - */ -static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) -{ - int ret = 0; - - if (locked) { - /* - * Got the lock. We might not be the anticipated owner if we - * did a lock-steal - fix up the PI-state in that case: - * - * We can safely read pi_state->owner without holding wait_lock - * because we now own the rt_mutex, only the owner will attempt - * to change it. - */ - if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current); - goto out; - } - - /* - * Paranoia check. If we did not take the lock, then we should not be - * the owner of the rt_mutex. - */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { - printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " - "pi-state %p\n", ret, - q->pi_state->pi_mutex.owner, - q->pi_state->owner); - } - -out: - return ret ? ret : locked; -} - -/** - * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal - * @hb: the futex hash bucket, must be locked by the caller - * @q: the futex_q to queue up on - * @timeout: the prepared hrtimer_sleeper, or null for no timeout - */ -static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout) -{ - /* - * The task state is guaranteed to be set before another task can - * wake it. set_current_state() is implemented using smp_store_mb() and - * queue_me() calls spin_unlock() upon completion, both serializing - * access to the hash list and forcing another memory barrier. - */ - set_current_state(TASK_INTERRUPTIBLE); - queue_me(q, hb); - - /* Arm the timer */ - if (timeout) - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - - /* - * If we have been removed from the hash list, then another task - * has tried to wake us, and we can skip the call to schedule(). - */ - if (likely(!plist_node_empty(&q->list))) { - /* - * If the timer has already expired, current will already be - * flagged for rescheduling. Only call schedule if there - * is no timeout, or if it has yet to expire. - */ - if (!timeout || timeout->task) - freezable_schedule(); - } - __set_current_state(TASK_RUNNING); -} - -/** - * futex_wait_setup() - Prepare to wait on a futex - * @uaddr: the futex userspace address - * @val: the expected value - * @flags: futex flags (FLAGS_SHARED, etc.) - * @q: the associated futex_q - * @hb: storage for hash_bucket pointer to be returned to caller - * - * Setup the futex_q and locate the hash_bucket. Get the futex value and - * compare it with the expected value. Handle atomic faults internally. - * Return with the hb lock held and a q.key reference on success, and unlocked - * with no q.key reference on failure. - * - * Return: - * 0 - uaddr contains val and hb has been locked; - * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked - */ -static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb) -{ - u32 uval; - int ret; - - /* - * Access the page AFTER the hash-bucket is locked. - * Order is important: - * - * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); - * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } - * - * The basic logical guarantee of a futex is that it blocks ONLY - * if cond(var) is known to be true at the time of blocking, for - * any cond. If we locked the hash-bucket after testing *uaddr, that - * would open a race condition where we could block indefinitely with - * cond(var) false, which would violate the guarantee. - * - * On the other hand, we insert q and release the hash-bucket only - * after testing *uaddr. This guarantees that futex_wait() will NOT - * absorb a wakeup if *uaddr does not match the desired values - * while the syscall executes. - */ -retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); - if (unlikely(ret != 0)) - return ret; - -retry_private: - *hb = queue_lock(q); - - ret = get_futex_value_locked(&uval, uaddr); - - if (ret) { - queue_unlock(*hb); - - ret = get_user(uval, uaddr); - if (ret) - goto out; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - put_futex_key(&q->key); - goto retry; - } - - if (uval != val) { - queue_unlock(*hb); - ret = -EWOULDBLOCK; - } - -out: - if (ret) - put_futex_key(&q->key); - return ret; -} - -static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) -{ - struct hrtimer_sleeper timeout, *to = NULL; - struct restart_block *restart; - struct futex_hash_bucket *hb; - struct futex_q q = futex_q_init; - int ret; - - if (!bitset) - return -EINVAL; - q.bitset = bitset; - - if (abs_time) { - to = &timeout; - - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } - -retry: - /* - * Prepare to wait on uaddr. On success, holds hb lock and increments - * q.key refs. - */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); - if (ret) - goto out; - - /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to); - - /* If we were woken (and unqueued), we succeeded, whatever. */ - ret = 0; - /* unqueue_me() drops q.key ref */ - if (!unqueue_me(&q)) - goto out; - ret = -ETIMEDOUT; - if (to && !to->task) - goto out; - - /* - * We expect signal_pending(current), but we might be the - * victim of a spurious wakeup as well. - */ - if (!signal_pending(current)) - goto retry; - - ret = -ERESTARTSYS; - if (!abs_time) - goto out; - - restart = &current->restart_block; - restart->fn = futex_wait_restart; - restart->futex.uaddr = uaddr; - restart->futex.val = val; - restart->futex.time = *abs_time; - restart->futex.bitset = bitset; - restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; - - ret = -ERESTART_RESTARTBLOCK; - -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); - } - return ret; -} - - -static long futex_wait_restart(struct restart_block *restart) -{ - u32 __user *uaddr = restart->futex.uaddr; - ktime_t t, *tp = NULL; - - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t = restart->futex.time; - tp = &t; - } - restart->fn = do_no_restart_syscall; - - return (long)futex_wait(uaddr, restart->futex.flags, - restart->futex.val, tp, restart->futex.bitset); -} - - -/* - * Userspace tried a 0 -> TID atomic transition of the futex value - * and failed. The kernel side here does the whole locking operation: - * if there are waiters then it will block as a consequence of relying - * on rt-mutexes, it does PI, etc. (Due to races the kernel might see - * a 0 value of the futex too.). - * - * Also serves as futex trylock_pi()'ing, and due semantics. - */ -static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, - ktime_t *time, int trylock) -{ - struct hrtimer_sleeper timeout, *to = NULL; - struct futex_pi_state *pi_state = NULL; - struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; - struct futex_q q = futex_q_init; - int res, ret; - - if (refill_pi_state_cache()) - return -ENOMEM; - - if (time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires(&to->timer, *time); - } - -retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out; - -retry_private: - hb = queue_lock(&q); + */ + ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); + } - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); - if (unlikely(ret)) { - /* - * Atomic work succeeded and we got the lock, - * or failed. Either way, we do _not_ block. - */ switch (ret) { - case 1: - /* We got the lock. */ - ret = 0; - goto out_unlock_put_key; + case 0: + /* We hold a reference on the pi state. */ + break; + + /* If the above failed, then pi_state is NULL */ case -EFAULT: - goto uaddr_faulted; + double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); + put_futex_key(&key2); + put_futex_key(&key1); + ret = fault_in_user_writeable(uaddr2); + if (!ret) + goto retry; + goto out; case -EAGAIN: /* * Two reasons for this: - * - Task is exiting and we just wait for the + * - Owner is exiting and we just wait for the * exit to complete. * - The user space value changed. */ - queue_unlock(hb); - put_futex_key(&q.key); + double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); + put_futex_key(&key2); + put_futex_key(&key1); cond_resched(); goto retry; default: - goto out_unlock_put_key; + goto out_unlock; } } - WARN_ON(!q.pi_state); - - /* - * Only actually queue now that the atomic ops are done: - */ - __queue_me(&q, hb); + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (task_count - nr_wake >= nr_requeue) + break; - if (trylock) { - ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); - /* Fixup the trylock return value: */ - ret = ret ? 0 : -EWOULDBLOCK; - goto no_block; - } + if (!match_futex(&this->key, &key1)) + continue; - rt_mutex_init_waiter(&rt_waiter); + /* + * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). + */ + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter) || + this->pi_state) { + ret = -EINVAL; + break; + } - /* - * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not - * hold it while doing rt_mutex_start_proxy(), because then it will - * include hb->lock in the blocking chain, even through we'll not in - * fact hold it while blocking. This will lead it to report -EDEADLK - * and BUG when futex_unlock_pi() interleaves with this. - * - * Therefore acquire wait_lock while holding hb->lock, but drop the - * latter before calling rt_mutex_start_proxy_lock(). This still fully - * serializes against futex_unlock_pi() as that does the exact same - * lock handoff sequence. - */ - raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); - spin_unlock(q.lock_ptr); - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); - raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. If not, it will be + * woken by futex_unlock_pi(). + */ + if (++task_count <= nr_wake && !requeue_pi) { + mark_wake_futex(&wake_q, this); + continue; + } - if (ret) { - if (ret == 1) - ret = 0; + /* Ensure we requeue to the expected futex for requeue_pi. */ + if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) { + ret = -EINVAL; + break; + } - spin_lock(q.lock_ptr); - goto no_block; + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + */ + if (requeue_pi) { + /* + * Prepare the waiter to take the rt_mutex. Take a + * refcount on the pi_state and store the pointer in + * the futex_q object of the waiter. + */ + get_pi_state(pi_state); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task); + if (ret == 1) { + /* + * We got the lock. We do neither drop the + * refcount on pi_state nor clear + * this->pi_state because the waiter needs the + * pi_state for cleaning up the user space + * value. It will drop the refcount after + * doing so. + */ + requeue_pi_wake_futex(this, &key2, hb2); + drop_count++; + continue; + } else if (ret) { + /* + * rt_mutex_start_proxy_lock() detected a + * potential deadlock when we tried to queue + * that waiter. Drop the pi_state reference + * which we took above and remove the pointer + * to the state from the waiters futex_q + * object. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + /* + * We stop queueing more waiters and let user + * space deal with the mess. + */ + break; + } + } + requeue_futex(this, hb1, hb2, &key2); + drop_count++; } - - if (unlikely(to)) - hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); - - ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); - - spin_lock(q.lock_ptr); /* - * If we failed to acquire the lock (signal/timeout), we must - * first acquire the hb->lock before removing the lock from the - * rt_mutex waitqueue, such that we can keep the hb and rt_mutex - * wait lists consistent. - * - * In particular; it is important that futex_unlock_pi() can not - * observe this inconsistency. + * We took an extra initial reference to the pi_state either + * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We + * need to drop it here again. */ - if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) - ret = 0; + put_pi_state(pi_state); -no_block: - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_owner(uaddr, &q, !ret); - /* - * If fixup_owner() returned an error, proprogate that. If it acquired - * the lock, clear our -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? res : 0; +out_unlock: + double_unlock_hb(hb1, hb2); + wake_up_q(&wake_q); + hb_waiters_dec(hb2); /* - * If fixup_owner() faulted and was unable to handle the fault, unlock - * it and return the fault to userspace. + * drop_futex_key_refs() must be called outside the spinlocks. During + * the requeue we moved futex_q's from the hash bucket at key1 to the + * one at key2 and updated their key pointer. We no longer need to + * hold the references to key1. */ - if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - - /* Unqueue and drop the lock */ - unqueue_me_pi(&q); - - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } - - goto out_put_key; - -out_unlock_put_key: - queue_unlock(hb); + while (--drop_count >= 0) + drop_futex_key_refs(&key1); -out_put_key: - put_futex_key(&q.key); +out_put_keys: + put_futex_key(&key2); +out_put_key1: + put_futex_key(&key1); out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); - } - return ret != -EINTR ? ret : -ERESTARTNOINTR; - -uaddr_faulted: - queue_unlock(hb); - - ret = fault_in_user_writeable(uaddr); - if (ret) - goto out_put_key; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - put_futex_key(&q.key); - goto retry; + return ret ? ret : task_count; } -/* - * Userspace attempted a TID -> 0 atomic transition, and failed. - * This is the in-kernel slowpath: we look up the PI state (if any), - * and do the rt-mutex unlock. - */ -static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) +/* The key must be already stored in q->key. */ +static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) + __acquires(&hb->lock) { - u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); - union futex_key key = FUTEX_KEY_INIT; struct futex_hash_bucket *hb; - struct futex_q *top_waiter; - int ret; -retry: - if (get_user(uval, uaddr)) - return -EFAULT; + hb = hash_futex(&q->key); + /* - * We release only a lock we actually own: + * Increment the counter before taking the lock so that + * a potential waker won't miss a to-be-slept task that is + * waiting for the spinlock. This is safe as all queue_lock() + * users end up calling queue_me(). Similarly, for housekeeping, + * decrement the counter at queue_unlock() when some error has + * occurred and we don't end up adding the task to the list. */ - if ((uval & FUTEX_TID_MASK) != vpid) - return -EPERM; + hb_waiters_inc(hb); - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); - if (ret) - return ret; + q->lock_ptr = &hb->lock; - hb = hash_futex(&key); - spin_lock(&hb->lock); + spin_lock(&hb->lock); /* implies smp_mb(); (A) */ + return hb; +} - /* - * Check waiters first. We do not trust user space values at - * all and we at least want to know if user space fiddled - * with the futex value instead of blindly unlocking. - */ - top_waiter = futex_top_waiter(hb, &key); - if (top_waiter) { - struct futex_pi_state *pi_state = top_waiter->pi_state; +static inline void +queue_unlock(struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + spin_unlock(&hb->lock); + hb_waiters_dec(hb); +} - ret = -EINVAL; - if (!pi_state) - goto out_unlock; +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +{ + int prio; - /* - * If current does not own the pi_state then the futex is - * inconsistent and user space fiddled with the futex value. - */ - if (pi_state->owner != current) - goto out_unlock; + /* + * The priority used to register this element is + * - either the real thread-priority for the real-time threads + * (i.e. threads with a priority lower than MAX_RT_PRIO) + * - or MAX_RT_PRIO for non-RT threads. + * Thus, all RT-threads are woken first in priority order, and + * the others are woken last, in FIFO order. + */ + prio = min(current->normal_prio, MAX_RT_PRIO); - get_pi_state(pi_state); - /* - * By taking wait_lock while still holding hb->lock, we ensure - * there is no point where we hold neither; and therefore - * wake_futex_pi() must observe a state consistent with what we - * observed. - */ - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); + plist_node_init(&q->list, prio); + plist_add(&q->list, &hb->chain); + q->task = current; +} - ret = wake_futex_pi(uaddr, uval, pi_state); +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + __queue_me(q, hb); + spin_unlock(&hb->lock); +} - put_pi_state(pi_state); +/** + * unqueue_me() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must + * be paired with exactly one earlier call to queue_me(). + * + * Return: + * 1 - if the futex_q was still queued (and we removed unqueued it); + * 0 - if the futex_q was already removed by the waking thread + */ +static int unqueue_me(struct futex_q *q) +{ + spinlock_t *lock_ptr; + int ret = 0; + /* In the common case we don't take the spinlock, which is nice. */ +retry: + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); + if (lock_ptr != NULL) { + spin_lock(lock_ptr); /* - * Success, we're done! No tricky corner cases. - */ - if (!ret) - goto out_putkey; - /* - * The atomic access to the futex value generated a - * pagefault, so retry the user-access and the wakeup: - */ - if (ret == -EFAULT) - goto pi_faulted; - /* - * A unconditional UNLOCK_PI op raced against a waiter - * setting the FUTEX_WAITERS bit. Try again. + * q->lock_ptr can change between reading it and + * spin_lock(), causing us to take the wrong lock. This + * corrects the race condition. + * + * Reasoning goes like this: if we have the wrong lock, + * q->lock_ptr must have changed (maybe several times) + * between reading it and the spin_lock(). It can + * change again after the spin_lock() but only if it was + * already changed before the spin_lock(). It cannot, + * however, change back to the original value. Therefore + * we can detect whether we acquired the correct lock. */ - if (ret == -EAGAIN) { - put_futex_key(&key); + if (unlikely(lock_ptr != q->lock_ptr)) { + spin_unlock(lock_ptr); goto retry; } - /* - * wake_futex_pi has detected invalid state. Tell user - * space. - */ - goto out_putkey; - } + __unqueue_futex(q); - /* - * We have no kernel internal state, i.e. no waiters in the - * kernel. Waiters which are about to queue themselves are stuck - * on hb->lock. So we can safely ignore them. We do neither - * preserve the WAITERS bit not the OWNER_DIED one. We are the - * owner. - */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { - spin_unlock(&hb->lock); - goto pi_faulted; - } + BUG_ON(q->pi_state); - /* - * If uval has changed, let user space handle it. - */ - ret = (curval == uval) ? 0 : -EAGAIN; + spin_unlock(lock_ptr); + ret = 1; + } -out_unlock: - spin_unlock(&hb->lock); -out_putkey: - put_futex_key(&key); + drop_futex_key_refs(&q->key); return ret; +} -pi_faulted: - put_futex_key(&key); +static long futex_wait_restart(struct restart_block *restart); - ret = fault_in_user_writeable(uaddr); - if (!ret) - goto retry; +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + */ +static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout) +{ + /* + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using smp_store_mb() and + * queue_me() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. + */ + set_current_state(TASK_INTERRUPTIBLE); + queue_me(q, hb); - return ret; + /* Arm the timer */ + if (timeout) + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + + /* + * If we have been removed from the hash list, then another task + * has tried to wake us, and we can skip the call to schedule(). + */ + if (likely(!plist_node_empty(&q->list))) { + /* + * If the timer has already expired, current will already be + * flagged for rescheduling. Only call schedule if there + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) + freezable_schedule(); + } + __set_current_state(TASK_RUNNING); } /** - * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex - * @hb: the hash_bucket futex_q was original enqueued on - * @q: the futex_q woken while waiting to be requeued - * @key2: the futex_key of the requeue target futex - * @timeout: the timeout associated with the wait (NULL if none) + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @flags: futex flags (FLAGS_SHARED, etc.) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller * - * Detect if the task was woken on the initial futex as opposed to the requeue - * target futex. If so, determine if it was a timeout or a signal that caused - * the wakeup and return the appropriate error code to the caller. Must be - * called with the hb lock held. + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. * * Return: - * 0 = no early wakeup detected; - * <0 = -ETIMEDOUT or -ERESTARTNOINTR + * 0 - uaddr contains val and hb has been locked; + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ -static inline -int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, - struct futex_q *q, union futex_key *key2, - struct hrtimer_sleeper *timeout) +static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) { - int ret = 0; + u32 uval; + int ret; /* - * With the hb lock held, we avoid races while we process the wakeup. - * We only need to hold hb (and not hb2) to ensure atomicity as the - * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. - * It can't be requeued from uaddr2 to something else since we don't - * support a PI aware source futex for requeue. + * Access the page AFTER the hash-bucket is locked. + * Order is important: + * + * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); + * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } + * + * The basic logical guarantee of a futex is that it blocks ONLY + * if cond(var) is known to be true at the time of blocking, for + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with + * cond(var) false, which would violate the guarantee. + * + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. */ - if (!match_futex(&q->key, key2)) { - WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); - /* - * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. - */ - plist_del(&q->list, &hb->chain); - hb_waiters_dec(hb); +retry: + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); + if (unlikely(ret != 0)) + return ret; + +retry_private: + *hb = queue_lock(q); + + ret = get_futex_value_locked(&uval, uaddr); + + if (ret) { + queue_unlock(*hb); + + ret = get_user(uval, uaddr); + if (ret) + goto out; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + put_futex_key(&q->key); + goto retry; + } - /* Handle spurious wakeups gracefully */ + if (uval != val) { + queue_unlock(*hb); ret = -EWOULDBLOCK; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - else if (signal_pending(current)) - ret = -ERESTARTNOINTR; } + +out: + if (ret) + put_futex_key(&q->key); return ret; } -/** - * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 - * @uaddr: the futex we initially wait on (non-pi) - * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be - * the same type, no requeueing from private to shared, etc. - * @val: the expected value of uaddr - * @abs_time: absolute timeout - * @bitset: 32 bit wakeup bitset set by userspace, defaults to all - * @uaddr2: the pi futex we will take prior to returning to user-space - * - * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake - * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to - * userspace. This ensures the rt_mutex maintains an owner when it has waiters; - * without one, the pi logic would not know which task to boost/deboost, if - * there was a need to. - * - * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following-- - * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() - * 2) wakeup on uaddr2 after a requeue - * 3) signal - * 4) timeout - * - * If 3, cleanup and return -ERESTARTNOINTR. - * - * If 2, we may then block on trying to take the rt_mutex and return via: - * 5) successful lock - * 6) signal - * 7) timeout - * 8) other lock acquisition failure - * - * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). - * - * If 4 or 7, we cleanup and return with -ETIMEDOUT. - * - * Return: - * 0 - On success; - * <0 - On error - */ -static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, - u32 val, ktime_t *abs_time, u32 bitset, - u32 __user *uaddr2) +static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to = NULL; - struct futex_pi_state *pi_state = NULL; - struct rt_mutex_waiter rt_waiter; + struct restart_block *restart; struct futex_hash_bucket *hb; - union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; - int res, ret; - - if (uaddr == uaddr2) - return -EINVAL; + int ret; if (!bitset) return -EINVAL; + q.bitset = bitset; if (abs_time) { to = &timeout; + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); @@ -2999,139 +1562,47 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, current->timer_slack_ns); } +retry: /* - * The waiter is allocated on our stack, manipulated by the requeue - * code while we sleep on uaddr. - */ - rt_mutex_init_waiter(&rt_waiter); - - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); - if (unlikely(ret != 0)) - goto out; - - q.bitset = bitset; - q.rt_waiter = &rt_waiter; - q.requeue_pi_key = &key2; - - /* - * Prepare to wait on uaddr. On success, increments q.key (key1) ref - * count. + * Prepare to wait on uaddr. On success, holds hb lock and increments + * q.key refs. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out_key2; - - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. We need to compare the keys: - */ - if (match_futex(&q.key, &key2)) { - queue_unlock(hb); - ret = -EINVAL; - goto out_put_keys; - } + goto out; - /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + /* queue_me and wait for wakeup, timeout, or a signal. */ futex_wait_queue_me(hb, &q, to); - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out_put_keys; + /* If we were woken (and unqueued), we succeeded, whatever. */ + ret = 0; + /* unqueue_me() drops q.key ref */ + if (!unqueue_me(&q)) + goto out; + ret = -ETIMEDOUT; + if (to && !to->task) + goto out; /* - * In order for us to be here, we know our q.key == key2, and since - * we took the hb->lock above, we also know that futex_requeue() has - * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. The - * futex_requeue dropped our key1 reference and incremented our key2 - * reference count. + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. */ + if (!signal_pending(current)) + goto retry; - /* Check if the requeue code acquired the second futex for us. */ - if (!q.rt_waiter) { - /* - * Got the lock. We might not be the anticipated owner if we - * did a lock-steal - fix up the PI-state in that case. - */ - if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); - ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - /* - * Drop the reference to the pi state which - * the requeue_pi() code acquired for us. - */ - put_pi_state(q.pi_state); - spin_unlock(q.lock_ptr); - } - } else { - struct rt_mutex *pi_mutex; - - /* - * We have been woken up by futex_unlock_pi(), a timeout, or a - * signal. futex_unlock_pi() will not destroy the lock_ptr nor - * the pi_state. - */ - WARN_ON(!q.pi_state); - pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); - - spin_lock(q.lock_ptr); - if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) - ret = 0; - - debug_rt_mutex_free_waiter(&rt_waiter); - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_owner(uaddr2, &q, !ret); - /* - * If fixup_owner() returned an error, proprogate that. If it - * acquired the lock, clear -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? res : 0; - - /* - * If fixup_pi_state_owner() faulted and was unable to handle - * the fault, unlock the rt_mutex and return the fault to - * userspace. - */ - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - - /* Unqueue and drop the lock. */ - unqueue_me_pi(&q); - } - - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } + ret = -ERESTARTSYS; + if (!abs_time) + goto out; - if (ret == -EINTR) { - /* - * We've already been requeued, but cannot restart by calling - * futex_lock_pi() directly. We could restart this syscall, but - * it would detect that the user space "val" changed and return - * -EWOULDBLOCK. Save the overhead of the restart and return - * -EWOULDBLOCK directly. - */ - ret = -EWOULDBLOCK; - } + restart = &current->restart_block; + restart->fn = futex_wait_restart; + restart->futex.uaddr = uaddr; + restart->futex.val = val; + restart->futex.time = *abs_time; + restart->futex.bitset = bitset; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; -out_put_keys: - put_futex_key(&q.key); -out_key2: - put_futex_key(&key2); + ret = -ERESTART_RESTARTBLOCK; out: if (to) { @@ -3141,6 +1612,22 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, return ret; } + +static long futex_wait_restart(struct restart_block *restart) +{ + u32 __user *uaddr = restart->futex.uaddr; + ktime_t t, *tp = NULL; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t = restart->futex.time; + tp = &t; + } + restart->fn = do_no_restart_syscall; + + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); +} + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. diff --git a/kernel/futex_pi.c b/kernel/futex_pi.c new file mode 100644 index 0000000000..a10c962aa2 --- /dev/null +++ b/kernel/futex_pi.c @@ -0,0 +1,1563 @@ +/* + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "locking/rtmutex_common.h" + +/* from futex.c: */ +static void __unqueue_futex(struct futex_q *q); +static inline struct futex_hash_bucket *queue_lock(struct futex_q *q); +static inline void queue_unlock(struct futex_hash_bucket *hb); +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb); +static void futex_wait_queue_me(struct futex_hash_bucket *hb, + struct futex_q *q, struct hrtimer_sleeper *timeout); +static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb); + +/* + * Priority Inheritance state: + */ +struct futex_pi_state { + /* + * list of 'owned' pi_state instances - these have to be + * cleaned up in do_exit() if the task exits prematurely: + */ + struct list_head list; + + /* + * The PI object: + */ + struct rt_mutex pi_mutex; + + struct task_struct *owner; + atomic_t refcount; + + union futex_key key; +}; + +static int refill_pi_state_cache(void) +{ + struct futex_pi_state *pi_state; + + if (likely(current->pi_state_cache)) + return 0; + + pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); + + if (!pi_state) + return -ENOMEM; + + INIT_LIST_HEAD(&pi_state->list); + /* pi_mutex gets initialized later */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + pi_state->key = FUTEX_KEY_INIT; + + current->pi_state_cache = pi_state; + + return 0; +} + +static struct futex_pi_state *alloc_pi_state(void) +{ + struct futex_pi_state *pi_state = current->pi_state_cache; + + WARN_ON(!pi_state); + current->pi_state_cache = NULL; + + return pi_state; +} + +static void get_pi_state(struct futex_pi_state *pi_state) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); +} + +/* + * Drops a reference to the pi_state object and frees or caches it + * when the last reference is gone. + * + * Must be called with the hb lock held. + */ +static void put_pi_state(struct futex_pi_state *pi_state) +{ + if (!pi_state) + return; + + if (!atomic_dec_and_test(&pi_state->refcount)) + return; + + /* + * If pi_state->owner is NULL, the owner is most probably dying + * and has cleaned up the pi_state already + */ + if (pi_state->owner) { + raw_spin_lock_irq(&pi_state->owner->pi_lock); + list_del_init(&pi_state->list); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); + + rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + } + + if (current->pi_state_cache) + kfree(pi_state); + else { + /* + * pi_state->list is already empty. + * clear pi_state->owner. + * refcount is at 0 - put it back to 1. + */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + current->pi_state_cache = pi_state; + } +} + +/* + * Look up the task based on what TID userspace gave us. + * We dont trust it. + */ +static struct task_struct *futex_find_get_task(pid_t pid) +{ + struct task_struct *p; + + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (p) + get_task_struct(p); + + rcu_read_unlock(); + + return p; +} + +/* + * This task is holding PI mutexes at exit time => bad. + * Kernel cleans up PI-state, but userspace is likely hosed. + * (Robust-futex cleanup is separate and might save the day for userspace.) + */ +void exit_pi_state_list(struct task_struct *curr) +{ + struct list_head *next, *head = &curr->pi_state_list; + struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; + union futex_key key = FUTEX_KEY_INIT; + + if (!futex_cmpxchg_enabled) + return; + /* + * We are a ZOMBIE and nobody can enqueue itself on + * pi_state_list anymore, but we have to be careful + * versus waiters unqueueing themselves: + */ + raw_spin_lock_irq(&curr->pi_lock); + while (!list_empty(head)) { + + next = head->next; + pi_state = list_entry(next, struct futex_pi_state, list); + key = pi_state->key; + hb = hash_futex(&key); + raw_spin_unlock_irq(&curr->pi_lock); + + spin_lock(&hb->lock); + + raw_spin_lock_irq(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ + if (head->next != next) { + spin_unlock(&hb->lock); + continue; + } + + WARN_ON(pi_state->owner != curr); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + pi_state->owner = NULL; + raw_spin_unlock_irq(&curr->pi_lock); + + get_pi_state(pi_state); + spin_unlock(&hb->lock); + + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + + raw_spin_lock_irq(&curr->pi_lock); + } + raw_spin_unlock_irq(&curr->pi_lock); +} + +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, + union futex_key *key) +{ + struct futex_q *this; + + plist_for_each_entry(this, &hb->chain, list) { + if (match_futex(&this->key, key)) + return this; + } + return NULL; +} + +/* + * We need to check the following states: + * + * Waiter | pi_state | pi->owner | uTID | uODIED | ? + * + * [1] NULL | --- | --- | 0 | 0/1 | Valid + * [2] NULL | --- | --- | >0 | 0/1 | Valid + * + * [3] Found | NULL | -- | Any | 0/1 | Invalid + * + * [4] Found | Found | NULL | 0 | 1 | Valid + * [5] Found | Found | NULL | >0 | 1 | Invalid + * + * [6] Found | Found | task | 0 | 1 | Valid + * + * [7] Found | Found | NULL | Any | 0 | Invalid + * + * [8] Found | Found | task | ==taskTID | 0/1 | Valid + * [9] Found | Found | task | 0 | 0 | Invalid + * [10] Found | Found | task | !=taskTID | 0/1 | Invalid + * + * [1] Indicates that the kernel can acquire the futex atomically. We + * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * + * [2] Valid, if TID does not belong to a kernel thread. If no matching + * thread is found then it indicates that the owner TID has died. + * + * [3] Invalid. The waiter is queued on a non PI futex + * + * [4] Valid state after exit_robust_list(), which sets the user space + * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. + * + * [5] The user space value got manipulated between exit_robust_list() + * and exit_pi_state_list() + * + * [6] Valid state after exit_pi_state_list() which sets the new owner in + * the pi_state but cannot access the user space value. + * + * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. + * + * [8] Owner and user space value match + * + * [9] There is no transient state which sets the user space TID to 0 + * except exit_robust_list(), but this is indicated by the + * FUTEX_OWNER_DIED bit. See [4] + * + * [10] There is no transient state which leaves owner and user space + * TID out of sync. + * + * + * Serialization and lifetime rules: + * + * hb->lock: + * + * hb -> futex_q, relation + * futex_q -> pi_state, relation + * + * (cannot be raw because hb can contain arbitrary amount + * of futex_q's) + * + * pi_mutex->wait_lock: + * + * {uval, pi_state} + * + * (and pi_mutex 'obviously') + * + * p->pi_lock: + * + * p->pi_state_list -> pi_state->list, relation + * + * pi_state->refcount: + * + * pi_state lifetime + * + * + * Lock order: + * + * hb->lock + * pi_mutex->wait_lock + * p->pi_lock + * + */ + +/* + * Validate that the existing waiter has a pi_state and sanity check + * the pi_state against the user space value. If correct, attach to + * it. + */ +static int attach_to_pi_state(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, + struct futex_pi_state **ps) +{ + pid_t pid = uval & FUTEX_TID_MASK; + u32 uval2; + int ret; + + /* + * Userspace might have messed up non-PI and PI futexes [3] + */ + if (unlikely(!pi_state)) + return -EINVAL; + + /* + * We get here with hb->lock held, and having found a + * futex_top_waiter(). This means that futex_lock_pi() of said futex_q + * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), + * which in turn means that futex_lock_pi() still has a reference on + * our pi_state. + * + * The waiter holding a reference on @pi_state also protects against + * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() + * and futex_wait_requeue_pi() as it cannot go to 0 and consequently + * free pi_state before we can take a reference ourselves. + */ + WARN_ON(!atomic_read(&pi_state->refcount)); + + /* + * Now that we have a pi_state, we can acquire wait_lock + * and do the state validation. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Since {uval, pi_state} is serialized by wait_lock, and our current + * uval was read without holding it, it can have changed. Verify it + * still is what we expect it to be, otherwise retry the entire + * operation. + */ + if (get_futex_value_locked(&uval2, uaddr)) + goto out_efault; + + if (uval != uval2) + goto out_eagain; + + /* + * Handle the owner died case: + */ + if (uval & FUTEX_OWNER_DIED) { + /* + * exit_pi_state_list sets owner to NULL and wakes the + * topmost waiter. The task which acquires the + * pi_state->rt_mutex will fixup owner. + */ + if (!pi_state->owner) { + /* + * No pi state owner, but the user space TID + * is not 0. Inconsistent state. [5] + */ + if (pid) + goto out_einval; + /* + * Take a ref on the state and return success. [4] + */ + goto out_attach; + } + + /* + * If TID is 0, then either the dying owner has not + * yet executed exit_pi_state_list() or some waiter + * acquired the rtmutex in the pi state, but did not + * yet fixup the TID in user space. + * + * Take a ref on the state and return success. [6] + */ + if (!pid) + goto out_attach; + } else { + /* + * If the owner died bit is not set, then the pi_state + * must have an owner. [7] + */ + if (!pi_state->owner) + goto out_einval; + } + + /* + * Bail out if user space manipulated the futex value. If pi + * state exists then the owner TID must be the same as the + * user space TID. [9/10] + */ + if (pid != task_pid_vnr(pi_state->owner)) + goto out_einval; + +out_attach: + get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + *ps = pi_state; + return 0; + +out_einval: + ret = -EINVAL; + goto out_error; + +out_eagain: + ret = -EAGAIN; + goto out_error; + +out_efault: + ret = -EFAULT; + goto out_error; + +out_error: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; +} + +/* + * Lookup the task for the TID provided from user space and attach to + * it after doing proper sanity checks. + */ +static int attach_to_pi_owner(u32 uval, union futex_key *key, + struct futex_pi_state **ps) +{ + pid_t pid = uval & FUTEX_TID_MASK; + struct futex_pi_state *pi_state; + struct task_struct *p; + + /* + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when TID = 0 [1] + */ + if (!pid) + return -ESRCH; + p = futex_find_get_task(pid); + if (!p) + return -ESRCH; + + if (unlikely(p->flags & PF_KTHREAD)) { + put_task_struct(p); + return -EPERM; + } + + /* + * We need to look at the task state flags to figure out, + * whether the task is exiting. To protect against the do_exit + * change of the task flags, we do this protected by + * p->pi_lock: + */ + raw_spin_lock_irq(&p->pi_lock); + if (unlikely(p->flags & PF_EXITING)) { + /* + * The task is on the way out. When PF_EXITPIDONE is + * set, we know that the task has finished the + * cleanup: + */ + int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; + + raw_spin_unlock_irq(&p->pi_lock); + put_task_struct(p); + return ret; + } + + /* + * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. + */ + pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make @p + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = *key; + + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + *ps = pi_state; + + return 0; +} + +static int lookup_pi_state(u32 __user *uaddr, u32 uval, + struct futex_hash_bucket *hb, + union futex_key *key, struct futex_pi_state **ps) +{ + struct futex_q *top_waiter = futex_top_waiter(hb, key); + + /* + * If there is a waiter on that futex, validate it and + * attach to the pi_state when the validation succeeds. + */ + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); + + /* + * We are the first waiter - try to look up the owner based on + * @uval and attach to it. + */ + return attach_to_pi_owner(uval, key, ps); +} + +static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) +{ + u32 uninitialized_var(curval); + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) + return -EFAULT; + + /* If user space value changed, let the caller retry */ + return curval != uval ? -EAGAIN : 0; +} + +/** + * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Return: + * 0 - ready to wait; + * 1 - acquired the lock; + * <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. + */ +static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task, int set_waiters) +{ + u32 uval, newval, vpid = task_pid_vnr(task); + struct futex_q *top_waiter; + int ret; + + /* + * Read the user space value first so we can validate a few + * things before proceeding further. + */ + if (get_futex_value_locked(&uval, uaddr)) + return -EFAULT; + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + /* + * Detect deadlocks. + */ + if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) + return -EDEADLK; + + if ((unlikely(should_fail_futex(true)))) + return -EDEADLK; + + /* + * Lookup existing state first. If it exists, try to attach to + * its pi_state. + */ + top_waiter = futex_top_waiter(hb, key); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); + + /* + * No waiter and user TID is 0. We are here because the + * waiters or the owner died bit is set or called from + * requeue_cmp_pi or for whatever reason something took the + * syscall. + */ + if (!(uval & FUTEX_TID_MASK)) { + /* + * We take over the futex. No other waiters and the user space + * TID is 0. We preserve the owner died bit. + */ + newval = uval & FUTEX_OWNER_DIED; + newval |= vpid; + + /* The futex requeue_pi code can enforce the waiters bit */ + if (set_waiters) + newval |= FUTEX_WAITERS; + + ret = lock_pi_update_atomic(uaddr, uval, newval); + /* If the take over worked, return 1 */ + return ret < 0 ? ret : 1; + } + + /* + * First waiter. Set the waiters bit before attaching ourself to + * the owner. If owner tries to unlock, it will be forced into + * the kernel and blocked on hb->lock. + */ + newval = uval | FUTEX_WAITERS; + ret = lock_pi_update_atomic(uaddr, uval, newval); + if (ret) + return ret; + /* + * If the update of the user space value succeeded, we try to + * attach to the owner. If that fails, no harm done, we only + * set the FUTEX_WAITERS bit in the user space variable. + */ + return attach_to_pi_owner(uval, key, ps); +} + +/* + * Caller must hold a reference on @pi_state. + */ +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) +{ + u32 uninitialized_var(curval), newval; + struct task_struct *new_owner; + bool postunlock = false; + DEFINE_WAKE_Q(wake_q); + int ret = 0; + + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + if (WARN_ON_ONCE(!new_owner)) { + /* + * As per the comment in futex_unlock_pi() this should not happen. + * + * When this happens, give up our locks and try again, giving + * the futex_lock_pi() instance time to complete, either by + * waiting on the rtmutex or removing itself from the futex + * queue. + */ + ret = -EAGAIN; + goto out_unlock; + } + + /* + * We pass it to the next owner. The WAITERS bit is always kept + * enabled while there is PI state around. We cleanup the owner + * died bit, because we are the owner. + */ + newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + + if (unlikely(should_fail_futex(true))) + ret = -EFAULT; + + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { + ret = -EFAULT; + + } else if (curval != uval) { + /* + * If a unconditional UNLOCK_PI operation (user space did not + * try the TID->0 transition) raced with a waiter setting the + * FUTEX_WAITERS flag between get_user() and locking the hash + * bucket lock, retry the operation. + */ + if ((FUTEX_TID_MASK & curval) == uval) + ret = -EAGAIN; + else + ret = -EINVAL; + } + + if (ret) + goto out_unlock; + + /* + * This is a point of no return; once we modify the uval there is no + * going back and subsequent operations must not fail. + */ + + raw_spin_lock(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + raw_spin_unlock(&pi_state->owner->pi_lock); + + raw_spin_lock(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + raw_spin_unlock(&new_owner->pi_lock); + + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + + if (postunlock) + rt_mutex_postunlock(&wake_q); + + return ret; +} + +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * @q: the futex_q + * @key: the key of the requeue target futex + * @hb: the hash_bucket of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. Set the futex_q key + * to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock + * to protect access to the pi_state to fixup the owner later. Must be called + * with both q->lock_ptr and hb->lock held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, + struct futex_hash_bucket *hb) +{ + get_futex_key_refs(key); + q->key = *key; + + __unqueue_futex(q); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + q->lock_ptr = &hb->lock; + + wake_up_state(q->task, TASK_NORMAL); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. + * + * Return: + * 0 - failed to acquire the lock atomically; + * >0 - acquired the lock, return value is vpid of the top_waiter + * <0 - error + */ +static int futex_proxy_trylock_atomic(u32 __user *pifutex, + struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, + union futex_key *key1, union futex_key *key2, + struct futex_pi_state **ps, int set_waiters) +{ + struct futex_q *top_waiter = NULL; + u32 curval; + int ret, vpid; + + if (get_futex_value_locked(&curval, pifutex)) + return -EFAULT; + + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. If not, don't set + * the bit unecessarily as it will force the subsequent unlock to enter + * the kernel. + */ + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* Ensure we requeue to the expected futex. */ + if (!match_futex(top_waiter->requeue_pi_key, key2)) + return -EINVAL; + + /* + * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in + * the contended case or if set_waiters is 1. The pi_state is returned + * in ps in contended cases. + */ + vpid = task_pid_vnr(top_waiter->task); + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + set_waiters); + if (ret == 1) { + requeue_pi_wake_futex(top_waiter, key2, hb2); + return vpid; + } + return ret; +} + +/* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry + * and dropped here. + */ +static void unqueue_me_pi(struct futex_q *q) + __releases(q->lock_ptr) +{ + __unqueue_futex(q); + + BUG_ON(!q->pi_state); + put_pi_state(q->pi_state); + q->pi_state = NULL; + + spin_unlock(q->lock_ptr); +} + +/* + * Fixup the pi_state owner with the new owner. + * + * Must be called with hash bucket lock held and mm->sem held for non + * private futexes. + */ +static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *newowner) +{ + u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + struct futex_pi_state *pi_state = q->pi_state; + u32 uval, uninitialized_var(curval), newval; + struct task_struct *oldowner; + int ret; + + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + oldowner = pi_state->owner; + /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + /* + * We are here either because we stole the rtmutex from the + * previous highest priority waiter or we are the highest priority + * waiter but have failed to get the rtmutex the first time. + * + * We have to replace the newowner TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the pi_state + * because we can fault here. Imagine swapped out pages or a fork + * that marked all the anonymous memory readonly for cow. + * + * Modifying pi_state _before_ the user space value would leave the + * pi_state in an inconsistent state when we fault here, because we + * need to drop the locks to handle the fault. This might be observed + * in the PID check in lookup_pi_state. + */ +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; + + for (;;) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + goto handle_fault; + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ + if (pi_state->owner != NULL) { + raw_spin_lock(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + raw_spin_unlock(&pi_state->owner->pi_lock); + } + + pi_state->owner = newowner; + + raw_spin_lock(&newowner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &newowner->pi_state_list); + raw_spin_unlock(&newowner->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + + return 0; + + /* + * To handle the page fault we need to drop the locks here. That gives + * the other task (either the highest priority waiter itself or the + * task which stole the rtmutex) the chance to try the fixup of the + * pi_state. So once we are back from handling the fault we need to + * check the pi_state after reacquiring the locks and before trying to + * do another fixup. When the fixup has been done already we simply + * return. + * + * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely + * drop hb->lock since the caller owns the hb -> futex_q relation. + * Dropping the pi_mutex->wait_lock requires the state revalidate. + */ +handle_fault: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(q->lock_ptr); + + ret = fault_in_user_writeable(uaddr); + + spin_lock(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) { + ret = 0; + goto out_unlock; + } + + if (ret) + goto out_unlock; + + goto retry; + +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; +} + +/** + * fixup_owner() - Post lock pi_state and corner case management + * @uaddr: user address of the futex + * @q: futex_q (contains pi_state and access to the rt_mutex) + * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Return: + * 1 - success, lock taken; + * 0 - success, lock not taken; + * <0 - on error (-EFAULT) + */ +static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) +{ + int ret = 0; + + if (locked) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case: + * + * We can safely read pi_state->owner without holding wait_lock + * because we now own the rt_mutex, only the owner will attempt + * to change it. + */ + if (q->pi_state->owner != current) + ret = fixup_pi_state_owner(uaddr, q, current); + goto out; + } + + /* + * Paranoia check. If we did not take the lock, then we should not be + * the owner of the rt_mutex. + */ + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { + printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " + "pi-state %p\n", ret, + q->pi_state->pi_mutex.owner, + q->pi_state->owner); + } + +out: + return ret ? ret : locked; +} + +/* + * Userspace tried a 0 -> TID atomic transition of the futex value + * and failed. The kernel side here does the whole locking operation: + * if there are waiters then it will block as a consequence of relying + * on rt-mutexes, it does PI, etc. (Due to races the kernel might see + * a 0 value of the futex too.). + * + * Also serves as futex trylock_pi()'ing, and due semantics. + */ +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, + ktime_t *time, int trylock) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex_waiter rt_waiter; + struct futex_hash_bucket *hb; + struct futex_q q = futex_q_init; + int res, ret; + + if (refill_pi_state_cache()) + return -ENOMEM; + + if (time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, + HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires(&to->timer, *time); + } + +retry: + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); + if (unlikely(ret != 0)) + goto out; + +retry_private: + hb = queue_lock(&q); + + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); + if (unlikely(ret)) { + /* + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. + */ + switch (ret) { + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; + case -EAGAIN: + /* + * Two reasons for this: + * - Task is exiting and we just wait for the + * exit to complete. + * - The user space value changed. + */ + queue_unlock(hb); + put_futex_key(&q.key); + cond_resched(); + goto retry; + default: + goto out_unlock_put_key; + } + } + + WARN_ON(!q.pi_state); + + /* + * Only actually queue now that the atomic ops are done: + */ + __queue_me(&q, hb); + + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; + } + + rt_mutex_init_waiter(&rt_waiter); + + /* + * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not + * hold it while doing rt_mutex_start_proxy(), because then it will + * include hb->lock in the blocking chain, even through we'll not in + * fact hold it while blocking. This will lead it to report -EDEADLK + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the + * latter before calling rt_mutex_start_proxy_lock(). This still fully + * serializes against futex_unlock_pi() as that does the exact same + * lock handoff sequence. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + + if (ret) { + if (ret == 1) + ret = 0; + + spin_lock(q.lock_ptr); + goto no_block; + } + + + if (unlikely(to)) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + + spin_lock(q.lock_ptr); + /* + * If we failed to acquire the lock (signal/timeout), we must + * first acquire the hb->lock before removing the lock from the + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex + * wait lists consistent. + * + * In particular; it is important that futex_unlock_pi() can not + * observe this inconsistency. + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; + +no_block: + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* + * If fixup_owner() faulted and was unable to handle the fault, unlock + * it and return the fault to userspace. + */ + if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } + + /* Unqueue and drop the lock */ + unqueue_me_pi(&q); + + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + + goto out_put_key; + +out_unlock_put_key: + queue_unlock(hb); + +out_put_key: + put_futex_key(&q.key); +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret != -EINTR ? ret : -ERESTARTNOINTR; + +uaddr_faulted: + queue_unlock(hb); + + ret = fault_in_user_writeable(uaddr); + if (ret) + goto out_put_key; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + put_futex_key(&q.key); + goto retry; +} + +/* + * Userspace attempted a TID -> 0 atomic transition, and failed. + * This is the in-kernel slowpath: we look up the PI state (if any), + * and do the rt-mutex unlock. + */ +static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) +{ + u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); + union futex_key key = FUTEX_KEY_INIT; + struct futex_hash_bucket *hb; + struct futex_q *top_waiter; + int ret; + +retry: + if (get_user(uval, uaddr)) + return -EFAULT; + /* + * We release only a lock we actually own: + */ + if ((uval & FUTEX_TID_MASK) != vpid) + return -EPERM; + + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); + if (ret) + return ret; + + hb = hash_futex(&key); + spin_lock(&hb->lock); + + /* + * Check waiters first. We do not trust user space values at + * all and we at least want to know if user space fiddled + * with the futex value instead of blindly unlocking. + */ + top_waiter = futex_top_waiter(hb, &key); + if (top_waiter) { + struct futex_pi_state *pi_state = top_waiter->pi_state; + + ret = -EINVAL; + if (!pi_state) + goto out_unlock; + + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + goto out_unlock; + + get_pi_state(pi_state); + /* + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we + * observed. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + ret = wake_futex_pi(uaddr, uval, pi_state); + + put_pi_state(pi_state); + + /* + * Success, we're done! No tricky corner cases. + */ + if (!ret) + goto out_putkey; + /* + * The atomic access to the futex value generated a + * pagefault, so retry the user-access and the wakeup: + */ + if (ret == -EFAULT) + goto pi_faulted; + /* + * A unconditional UNLOCK_PI op raced against a waiter + * setting the FUTEX_WAITERS bit. Try again. + */ + if (ret == -EAGAIN) { + put_futex_key(&key); + goto retry; + } + /* + * wake_futex_pi has detected invalid state. Tell user + * space. + */ + goto out_putkey; + } + + /* + * We have no kernel internal state, i.e. no waiters in the + * kernel. Waiters which are about to queue themselves are stuck + * on hb->lock. So we can safely ignore them. We do neither + * preserve the WAITERS bit not the OWNER_DIED one. We are the + * owner. + */ + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + spin_unlock(&hb->lock); + goto pi_faulted; + } + + /* + * If uval has changed, let user space handle it. + */ + ret = (curval == uval) ? 0 : -EAGAIN; + +out_unlock: + spin_unlock(&hb->lock); +out_putkey: + put_futex_key(&key); + return ret; + +pi_faulted: + put_futex_key(&key); + + ret = fault_in_user_writeable(uaddr); + if (!ret) + goto retry; + + return ret; +} + +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb: the hash_bucket futex_q was original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Return: + * 0 = no early wakeup detected; + * <0 = -ETIMEDOUT or -ERESTARTNOINTR + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); + + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else if (signal_pending(current)) + ret = -ERESTARTNOINTR; + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initially wait on (non-pi) + * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following-- + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue + * 3) signal + * 4) timeout + * + * If 3, cleanup and return -ERESTARTNOINTR. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Return: + * 0 - On success; + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + u32 val, ktime_t *abs_time, u32 bitset, + u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex_waiter rt_waiter; + struct futex_hash_bucket *hb; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; + int res, ret; + + if (uaddr == uaddr2) + return -EINVAL; + + if (!bitset) + return -EINVAL; + + if (abs_time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + rt_mutex_init_waiter(&rt_waiter); + + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); + if (unlikely(ret != 0)) + goto out; + + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + q.requeue_pi_key = &key2; + + /* + * Prepare to wait on uaddr. On success, increments q.key (key1) ref + * count. + */ + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + if (ret) + goto out_key2; + + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (match_futex(&q.key, &key2)) { + queue_unlock(hb); + ret = -EINVAL; + goto out_put_keys; + } + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquisition by the requeue code. The + * futex_requeue dropped our key1 reference and incremented our key2 + * reference count. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } + /* + * Drop the reference to the pi state which + * the requeue_pi() code acquired for us. + */ + put_pi_state(q.pi_state); + spin_unlock(q.lock_ptr); + } + } else { + struct rt_mutex *pi_mutex; + + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. + */ + WARN_ON(!q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + + spin_lock(q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + + debug_rt_mutex_free_waiter(&rt_waiter); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it + * acquired the lock, clear -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* + * If fixup_pi_state_owner() faulted and was unable to handle + * the fault, unlock the rt_mutex and return the fault to + * userspace. + */ + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + + if (ret == -EINTR) { + /* + * We've already been requeued, but cannot restart by calling + * futex_lock_pi() directly. We could restart this syscall, but + * it would detect that the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart and return + * -EWOULDBLOCK directly. + */ + ret = -EWOULDBLOCK; + } + +out_put_keys: + put_futex_key(&q.key); +out_key2: + put_futex_key(&key2); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +}

[v2,3/8] futex: make PI support optional

Commit Message

Patch