diff --git a/kernel/futex.c b/kernel/futex.c
index bd18f60e4c6c635d938cce50fd226150800b359f..03c518e9747e579869e13b7e2bc91ad83e3e8da7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -325,6 +325,12 @@ static inline bool should_fail_futex(bool fshared)
 }
 #endif /* CONFIG_FAIL_FUTEX */
 
+#ifdef CONFIG_COMPAT
+static void compat_exit_robust_list(struct task_struct *curr);
+#else
+static inline void compat_exit_robust_list(struct task_struct *curr) { }
+#endif
+
 static inline void futex_get_mm(union futex_key *key)
 {
        mmgrab(key->private.mm);
@@ -890,7 +896,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
  * Kernel cleans up PI-state, but userspace is likely hosed.
  * (Robust-futex cleanup is separate and might save the day for userspace.)
  */
-void exit_pi_state_list(struct task_struct *curr)
+static void exit_pi_state_list(struct task_struct *curr)
 {
        struct list_head *next, *head = &curr->pi_state_list;
        struct futex_pi_state *pi_state;
@@ -960,7 +966,8 @@ void exit_pi_state_list(struct task_struct *curr)
        }
        raw_spin_unlock_irq(&curr->pi_lock);
 }
-
+#else
+static inline void exit_pi_state_list(struct task_struct *curr) { }
 #endif
 
 /*
@@ -1169,16 +1176,47 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
        return ret;
 }
 
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @ret:       owner's current futex lock status
+ * @exiting:   Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+       if (ret != -EBUSY) {
+               WARN_ON_ONCE(exiting);
+               return;
+       }
+
+       if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+               return;
+
+       mutex_lock(&exiting->futex_exit_mutex);
+       /*
+        * No point in doing state checking here. If the waiter got here
+        * while the task was in exec()->futex_exec_release() then it can
+        * have any FUTEX_STATE_* value when the waiter has acquired the
+        * mutex: FUTEX_STATE_OK if still running, EXITING or DEAD if it
+        * reached exit() already. Highly unlikely and not a problem. Just
+        * one more round through the futex maze.
+        */
+       mutex_unlock(&exiting->futex_exit_mutex);
+
+       put_task_struct(exiting);
+}
+
 static int handle_exit_race(u32 __user *uaddr, u32 uval,
                            struct task_struct *tsk)
 {
        u32 uval2;
 
        /*
-        * If PF_EXITPIDONE is not yet set, then try again.
+        * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+        * caller that the alleged owner is busy.
         */
-       if (tsk && !(tsk->flags & PF_EXITPIDONE))
-               return -EAGAIN;
+       if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+               return -EBUSY;
 
        /*
         * Reread the user space value to handle the following situation:
@@ -1196,8 +1234,9 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
         *    *uaddr = 0xC0000000;           tsk = get_task(PID);
         *   }                               if (!tsk->flags & PF_EXITING) {
         *  ...                                attach();
-        *  tsk->flags |= PF_EXITPIDONE;     } else {
-        *                                     if (!(tsk->flags & PF_EXITPIDONE))
+        *  tsk->futex_state =               } else {
+        *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
+        *                                        FUTEX_STATE_DEAD)
         *                                       return -EAGAIN;
         *                                     return -ESRCH; <--- FAIL
         *                                   }
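
For reference, the futex_state values tested above come from the small exit-state machine this series introduces; as declared in include/linux/futex.h in this series, it is roughly:

	enum {
		FUTEX_STATE_OK,		/* normal operation */
		FUTEX_STATE_EXITING,	/* exit/exec cleanup in progress;
					 * tsk->futex_exit_mutex is held */
		FUTEX_STATE_DEAD,	/* cleanup done; attach fails with -ESRCH */
	};
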
@@ -1228,7 +1267,8 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
  * it after doing proper sanity checks.
  */
 static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
-                             struct futex_pi_state **ps)
+                             struct futex_pi_state **ps,
+                             struct task_struct **exiting)
 {
        pid_t pid = uval & FUTEX_TID_MASK;
        struct futex_pi_state *pi_state;
@@ -1253,22 +1293,33 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
        }
 
        /*
-        * We need to look at the task state flags to figure out,
-        * whether the task is exiting. To protect against the do_exit
-        * change of the task flags, we do this protected by
-        * p->pi_lock:
+        * We need to look at the task state to figure out whether the
+        * task is exiting. To protect against the change of the task state
+        * in futex_exit_release(), we do this protected by p->pi_lock:
         */
        raw_spin_lock_irq(&p->pi_lock);
-       if (unlikely(p->flags & PF_EXITING)) {
+       if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
                /*
-                * The task is on the way out. When PF_EXITPIDONE is
-                * set, we know that the task has finished the
-                * cleanup:
+                * The task is on the way out. When the futex state is
+                * FUTEX_STATE_DEAD, we know that the task has finished
+                * the cleanup:
                 */
                int ret = handle_exit_race(uaddr, uval, p);
 
                raw_spin_unlock_irq(&p->pi_lock);
-               put_task_struct(p);
+               /*
+                * If the owner task is between FUTEX_STATE_EXITING and
+                * FUTEX_STATE_DEAD then store the task pointer and keep
+                * the reference on the task struct. The calling code will
+                * drop all locks, wait for the task to reach
+                * FUTEX_STATE_DEAD and then drop the refcount. This is
+                * required to prevent a live lock when the current task
+                * preempted the exiting task between the two states.
+                */
+               if (ret == -EBUSY)
+                       *exiting = p;
+               else
+                       put_task_struct(p);
                return ret;
        }
 
@@ -1307,7 +1358,8 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 
 static int lookup_pi_state(u32 __user *uaddr, u32 uval,
                           struct futex_hash_bucket *hb,
-                          union futex_key *key, struct futex_pi_state **ps)
+                          union futex_key *key, struct futex_pi_state **ps,
+                          struct task_struct **exiting)
 {
        struct futex_q *top_waiter = futex_top_waiter(hb, key);
 
@@ -1322,7 +1374,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
         * We are the first waiter - try to look up the owner based on
         * @uval and attach to it.
         */
-       return attach_to_pi_owner(uaddr, uval, key, ps);
+       return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
 }
 
 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1350,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
  *                     lookup
  * @task:              the task to perform the atomic lock work for.  This will
  *                     be "current" except in the case of requeue pi.
+ * @exiting:           Pointer to store the task pointer of the owner task
+ *                     which is in the middle of exiting
  * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Return:
@@ -1358,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
  *  - <0 - error
  *
  * The hb->lock and futex_key refs shall be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
  */
 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                union futex_key *key,
                                struct futex_pi_state **ps,
-                               struct task_struct *task, int set_waiters)
+                               struct task_struct *task,
+                               struct task_struct **exiting,
+                               int set_waiters)
 {
        u32 uval, newval, vpid = task_pid_vnr(task);
        struct futex_q *top_waiter;
@@ -1432,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
         * attach to the owner. If that fails, no harm done, we only
         * set the FUTEX_WAITERS bit in the user space variable.
         */
-       return attach_to_pi_owner(uaddr, newval, key, ps);
+       return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
 }
 
 /**
@@ -1480,7 +1540,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
 
        /*
         * Queue the task for later wakeup for after we've released
-        * the hb->lock. wake_q_add() grabs reference to p.
+        * the hb->lock.
         */
        wake_q_add_safe(wake_q, p);
 }
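
On the deleted remark: wake_q_add() takes its own reference on the task, whereas wake_q_add_safe() consumes one the caller already holds; mark_wake_futex() obtains that reference earlier, outside this hunk, via get_task_struct(q->task). In sketch form:

	struct task_struct *p = get_task_struct(q->task); /* caller's ref */
	...
	wake_q_add_safe(wake_q, p);	/* consumes the ref, even if p was
					 * already queued by another waker */
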
@@ -1850,6 +1910,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  * @key1:              the from futex key
  * @key2:              the to futex key
  * @ps:                        address to store the pi_state pointer
+ * @exiting:           Pointer to store the task pointer of the owner task
+ *                     which is in the middle of exiting
  * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Try and get the lock on behalf of the top waiter if we can do it atomically.
@@ -1857,16 +1919,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
  * hb1 and hb2 must be held by the caller.
  *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
  * Return:
  *  -  0 - failed to acquire the lock atomically;
  *  - >0 - acquired the lock, return value is vpid of the top_waiter
  *  - <0 - error
  */
-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
-                                struct futex_hash_bucket *hb1,
-                                struct futex_hash_bucket *hb2,
-                                union futex_key *key1, union futex_key *key2,
-                                struct futex_pi_state **ps, int set_waiters)
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+                          struct futex_hash_bucket *hb2, union futex_key *key1,
+                          union futex_key *key2, struct futex_pi_state **ps,
+                          struct task_struct **exiting, int set_waiters)
 {
        struct futex_q *top_waiter = NULL;
        u32 curval;
@@ -1903,7 +1969,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
         */
        vpid = task_pid_vnr(top_waiter->task);
        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
-                                  set_waiters);
+                                  exiting, set_waiters);
        if (ret == 1) {
                requeue_pi_wake_futex(top_waiter, key2, hb2);
                return vpid;
@@ -2032,6 +2098,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
        }
 
        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+               struct task_struct *exiting = NULL;
+
                /*
                 * Attempt to acquire uaddr2 and wake the top waiter. If we
                 * intend to requeue waiters, force setting the FUTEX_WAITERS
@@ -2039,7 +2107,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
                 * faults rather than in the requeue loop below.
                 */
                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
-                                                &key2, &pi_state, nr_requeue);
+                                                &key2, &pi_state,
+                                                &exiting, nr_requeue);
 
                /*
                 * At this point the top_waiter has either taken uaddr2 or is
@@ -2066,7 +2135,8 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
                         * If that call succeeds then we have pi_state and an
                         * initial refcount on it.
                         */
-                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
+                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
+                                             &pi_state, &exiting);
                }
 
                switch (ret) {
@@ -2084,17 +2154,24 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
                        if (!ret)
                                goto retry;
                        goto out;
+               case -EBUSY:
                case -EAGAIN:
                        /*
                         * Two reasons for this:
-                        * - Owner is exiting and we just wait for the
+                        * - EBUSY: Owner is exiting and we just wait for the
                         *   exit to complete.
-                        * - The user space value changed.
+                        * - EAGAIN: The user space value changed.
                         */
                        double_unlock_hb(hb1, hb2);
                        hb_waiters_dec(hb2);
                        put_futex_key(&key2);
                        put_futex_key(&key1);
+                       /*
+                        * Handle the case where the owner is in the middle of
+                        * exiting. Wait for the exit to complete otherwise
+                        * this task might loop forever, aka. live lock.
+                        */
+                       wait_for_owner_exiting(ret, exiting);
                        cond_resched();
                        goto retry;
                default:
@@ -2801,6 +2878,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 {
        struct hrtimer_sleeper timeout, *to;
        struct futex_pi_state *pi_state = NULL;
+       struct task_struct *exiting = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct futex_hash_bucket *hb;
        struct futex_q q = futex_q_init;
@@ -2822,7 +2900,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 retry_private:
        hb = queue_lock(&q);
 
-       ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+       ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+                                  &exiting, 0);
        if (unlikely(ret)) {
                /*
                 * Atomic work succeeded and we got the lock,
@@ -2835,15 +2914,22 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
                        goto out_unlock_put_key;
                case -EFAULT:
                        goto uaddr_faulted;
+               case -EBUSY:
                case -EAGAIN:
                        /*
                         * Two reasons for this:
-                        * - Task is exiting and we just wait for the
+                        * - EBUSY: Task is exiting and we just wait for the
                         *   exit to complete.
-                        * - The user space value changed.
+                        * - EAGAIN: The user space value changed.
                         */
                        queue_unlock(hb);
                        put_futex_key(&q.key);
+                       /*
+                        * Handle the case where the owner is in the middle of
+                        * exiting. Wait for the exit to complete otherwise
+                        * this task might loop forever, aka. live lock.
+                        */
+                       wait_for_owner_exiting(ret, exiting);
                        cond_resched();
                        goto retry;
                default:
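
Both retry loops above gain the same -EBUSY leg for the same reason: a waiter that preempted the exiting owner would otherwise respin on the unchanged user space value forever. A compilable user-space model of the handshake, with a pthread mutex standing in for tsk->futex_exit_mutex (all names hypothetical):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <unistd.h>

	enum { STATE_OK, STATE_EXITING, STATE_DEAD };

	static pthread_mutex_t exit_mutex = PTHREAD_MUTEX_INITIALIZER;
	static atomic_int state = STATE_OK;

	static void *owner_exit(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&exit_mutex);	/* futex_cleanup_begin() */
		atomic_store(&state, STATE_EXITING);
		usleep(1000);				/* robust/PI cleanup runs here */
		atomic_store(&state, STATE_DEAD);
		pthread_mutex_unlock(&exit_mutex);	/* futex_cleanup_end() */
		return NULL;
	}

	static void *waiter(void *arg)
	{
		(void)arg;
		for (;;) {
			int s = atomic_load(&state);

			if (s == STATE_DEAD)	/* handle_exit_race(): -ESRCH path */
				return NULL;
			if (s == STATE_EXITING) {
				/* wait_for_owner_exiting(): block, don't spin */
				pthread_mutex_lock(&exit_mutex);
				pthread_mutex_unlock(&exit_mutex);
			}
			/* else: retry the atomic lock attempt (-EAGAIN path) */
		}
	}

	int main(void)
	{
		pthread_t o, w;

		pthread_create(&w, NULL, waiter, NULL);
		pthread_create(&o, NULL, owner_exit, NULL);
		pthread_join(o, NULL);
		pthread_join(w, NULL);
		puts("waiter released; no live lock");
		return 0;
	}
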
@@ -3452,11 +3538,16 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
        return ret;
 }
 
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING   true
+#define HANDLE_DEATH_LIST      false
+
 /*
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+                             bool pi, bool pending_op)
 {
        u32 uval, uninitialized_var(nval), mval;
        int err;
@@ -3469,6 +3560,42 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int p
        if (get_user(uval, uaddr))
                return -1;
 
+       /*
+        * Special case for regular (non PI) futexes. The unlock path in
+        * user space has two race scenarios:
+        *
+        * 1. The unlock path releases the user space futex value and
+        *    before it can execute the futex() syscall to wake up
+        *    waiters it is killed.
+        *
+        * 2. A woken up waiter is killed before it can acquire the
+        *    futex in user space.
+        *
+        * In both cases the TID validation below prevents a wakeup of
+        * potential waiters which can cause these waiters to block
+        * forever.
+        *
+        * In both cases the following conditions are met:
+        *
+        *      1) task->robust_list->list_op_pending != NULL
+        *         @pending_op == true
+        *      2) User space futex value == 0
+        *      3) Regular futex: @pi == false
+        *
+        * If these conditions are met, it is safe to attempt waking up a
+        * potential waiter without touching the user space futex value and
+        * trying to set the OWNER_DIED bit. The user space futex value is
+        * uncontended and the rest of the user space mutex state is
+        * consistent, so a woken waiter will just take over the
+        * uncontended futex. Setting the OWNER_DIED bit would create
+        * inconsistent state and malfunction of the user space owner died
+        * handling.
+        */
+       if (pending_op && !pi && !uval) {
+               futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+               return 0;
+       }
+
        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
                return 0;
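
The new special case corresponds to the user-space side of the robust-mutex unlock protocol. A hedged sketch of what a glibc-style unlock path does (the rmutex layout is hypothetical; struct robust_list_head and its list_op_pending field are from the UAPI header, and head is assumed to have been registered via sys_set_robust_list()):

	#include <linux/futex.h>	/* struct robust_list{,_head}, FUTEX_WAKE */
	#include <stdatomic.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	struct rmutex {
		struct robust_list list;	/* links into head->list when held */
		_Atomic unsigned int futex;	/* owner TID, 0 when free */
	};

	/* Per thread, registered at startup via sys_set_robust_list(). */
	extern struct robust_list_head *head;

	void rmutex_unlock(struct rmutex *m)
	{
		head->list_op_pending = &m->list;	/* (1) announce the unlock */
		atomic_store(&m->futex, 0);		/* (2) release in user space */
		/*
		 * If the task dies right here, handle_futex_death() runs with
		 * @pending_op == true, *uaddr == 0 and !pi: it wakes one waiter
		 * instead of setting OWNER_DIED on an uncontended futex.
		 */
		syscall(SYS_futex, &m->futex, FUTEX_WAKE, 1, NULL, NULL, 0);
		head->list_op_pending = NULL;		/* (3) operation complete */
	}
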
 
@@ -3547,7 +3674,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
-void exit_robust_list(struct task_struct *curr)
+static void exit_robust_list(struct task_struct *curr)
 {
        struct robust_list_head __user *head = curr->robust_list;
        struct robust_list __user *entry, *next_entry, *pending;
@@ -3588,10 +3715,11 @@ void exit_robust_list(struct task_struct *curr)
                 * A pending lock might already be on the list, so
                 * don't process it twice:
                 */
-               if (entry != pending)
+               if (entry != pending) {
                        if (handle_futex_death((void __user *)entry + futex_offset,
-                                               curr, pi))
+                                               curr, pi, HANDLE_DEATH_LIST))
                                return;
+               }
                if (rc)
                        return;
                entry = next_entry;
@@ -3605,9 +3733,118 @@ void exit_robust_list(struct task_struct *curr)
                cond_resched();
        }
 
-       if (pending)
+       if (pending) {
                handle_futex_death((void __user *)pending + futex_offset,
-                                  curr, pip);
+                                  curr, pip, HANDLE_DEATH_PENDING);
+       }
+}
+
+static void futex_cleanup(struct task_struct *tsk)
+{
+       if (unlikely(tsk->robust_list)) {
+               exit_robust_list(tsk);
+               tsk->robust_list = NULL;
+       }
+
+#ifdef CONFIG_COMPAT
+       if (unlikely(tsk->compat_robust_list)) {
+               compat_exit_robust_list(tsk);
+               tsk->compat_robust_list = NULL;
+       }
+#endif
+
+       if (unlikely(!list_empty(&tsk->pi_state_list)))
+               exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
+ * @tsk:       task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in do_exit().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+       /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+       if (tsk->futex_state == FUTEX_STATE_EXITING)
+               mutex_unlock(&tsk->futex_exit_mutex);
+       tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+       /*
+        * Prevent various race issues against a concurrent incoming waiter
+        * including live locks by forcing the waiter to block on
+        * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+        * attach_to_pi_owner().
+        */
+       mutex_lock(&tsk->futex_exit_mutex);
+
+       /*
+        * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+        *
+        * This ensures that all subsequent checks of tsk->futex_state in
+        * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+        * tsk->pi_lock held.
+        *
+        * It guarantees also that a pi_state which was queued right before
+        * the state change under tsk->pi_lock by a concurrent waiter must
+        * be observed in exit_pi_state_list().
+        */
+       raw_spin_lock_irq(&tsk->pi_lock);
+       tsk->futex_state = FUTEX_STATE_EXITING;
+       raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+       /*
+        * Lockless store. The only side effect is that an observer might
+        * take another loop until it becomes visible.
+        */
+       tsk->futex_state = state;
+       /*
+        * Drop the exit protection. This unblocks waiters which observed
+        * FUTEX_STATE_EXITING to reevaluate the state.
+        */
+       mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+       /*
+        * The state handling is done for consistency, but in the case of
+        * exec() there is no way to prevent further damage as the PID stays
+        * the same. But for the unlikely and arguably buggy case that a
+        * futex is held on exec(), this provides at least as much state
+        * consistency protection as is possible.
+        */
+       futex_cleanup_begin(tsk);
+       futex_cleanup(tsk);
+       /*
+        * Reset the state to FUTEX_STATE_OK. The task is alive and about
+        * exec a new binary.
+        */
+       futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+       futex_cleanup_begin(tsk);
+       futex_cleanup(tsk);
+       futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
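
The three entry points above are wired into the task lifecycle outside this file. Paraphrased from the rest of this series (the actual call sites live in kernel/fork.c and kernel/exit.c and are not part of this diff), roughly:

	/* kernel/fork.c -- paraphrased: */
	void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
	{
		futex_exit_release(tsk);	/* dying: end state FUTEX_STATE_DEAD */
		mm_release(tsk, mm);
	}

	void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
	{
		futex_exec_release(tsk);	/* exec(): reset to FUTEX_STATE_OK */
		mm_release(tsk, mm);
	}

	/* kernel/exit.c, do_exit(), recursive-fault path only: */
	futex_exit_recursive(tsk);
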
@@ -3737,7 +3974,7 @@ static void __user *futex_uaddr(struct robust_list __user *entry,
  *
  * We silently return on any sign of list-walking problem.
  */
-void compat_exit_robust_list(struct task_struct *curr)
+static void compat_exit_robust_list(struct task_struct *curr)
 {
        struct compat_robust_list_head __user *head = curr->compat_robust_list;
        struct robust_list __user *entry, *next_entry, *pending;
@@ -3784,7 +4021,8 @@ void compat_exit_robust_list(struct task_struct *curr)
                if (entry != pending) {
                        void __user *uaddr = futex_uaddr(entry, futex_offset);
 
-                       if (handle_futex_death(uaddr, curr, pi))
+                       if (handle_futex_death(uaddr, curr, pi,
+                                              HANDLE_DEATH_LIST))
                                return;
                }
                if (rc)
@@ -3803,7 +4041,7 @@ void compat_exit_robust_list(struct task_struct *curr)
        if (pending) {
                void __user *uaddr = futex_uaddr(pending, futex_offset);
 
-               handle_futex_death(uaddr, curr, pip);
+               handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
        }
 }