1 // SPDX-License-Identifier: GPL-2.0
2 /* kernel/locking/rwsem.c: R/W semaphores, public implementation
3  *
4  * Written by David Howells (dhowells@redhat.com).
5  * Derived from asm-i386/semaphore.h
6  *
7  * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8  * and Michel Lespinasse <walken@google.com>
9  *
10  * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11  * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12  *
13  * Rwsem count bit fields re-definition and rwsem rearchitecture by
14  * Waiman Long <longman@redhat.com> and
15  * Peter Zijlstra <peterz@infradead.org>.
16  */
17
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/sched/rt.h>
22 #include <linux/sched/task.h>
23 #include <linux/sched/debug.h>
24 #include <linux/sched/wake_q.h>
25 #include <linux/sched/signal.h>
26 #include <linux/export.h>
27 #include <linux/rwsem.h>
28 #include <linux/atomic.h>
29
30 #include "rwsem.h"
31 #include "lock_events.h"
32
33 /*
34  * The least significant 2 bits of the owner value have the following
35  * meanings when set.
36  *  - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers
37  *  - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned,
38  *    i.e. the owner(s) cannot be readily determined. It may be reader
39  *    owned, or the owning writer may be indeterminate.
40  *
41  * When a writer acquires a rwsem, it puts its task_struct pointer
42  * into the owner field. It is cleared after an unlock.
43  *
44  * When a reader acquires a rwsem, it will also put its task_struct
45  * pointer into the owner field with both the RWSEM_READER_OWNED and
46  * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will
47  * largely be left untouched. So for a free or reader-owned rwsem,
48  * the owner value may contain information about the last reader that
49  * acquired the rwsem. The anonymous bit is set because that particular
50  * reader may or may not still own the lock.
51  *
52  * That information may be helpful in debugging cases where the system
53  * seems to hang on a reader-owned rwsem, especially if only one reader
54  * is involved. Ideally we would like to track all the readers that own
55  * a rwsem, but the overhead is simply too big.
56  */
57 #define RWSEM_READER_OWNED      (1UL << 0)
58 #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
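
/*
 * For illustration: how an owner value decomposes under the scheme above,
 * assuming a hypothetical task_struct at address 0xffff888102340c00:
 *
 *   writer holds the lock:  owner == 0xffff888102340c00  (low 2 bits clear)
 *   reader holds the lock:  owner == 0xffff888102340c03  (bits 0 and 1 set)
 *   after that reader's unlock: owner stays 0xffff888102340c03 (unless
 *   CONFIG_DEBUG_RWSEMS clears the task pointer part), so the stale pointer
 *   only hints at the last reader and the anonymous bit warns that it may
 *   no longer own the lock.
 */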
59
60 #ifdef CONFIG_DEBUG_RWSEMS
61 # define DEBUG_RWSEMS_WARN_ON(c, sem)   do {                    \
62         if (!debug_locks_silent &&                              \
63             WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
64                 #c, atomic_long_read(&(sem)->count),            \
65                 (long)((sem)->owner), (long)current,            \
66                 list_empty(&(sem)->wait_list) ? "" : "not "))   \
67                         debug_locks_off();                      \
68         } while (0)
69 #else
70 # define DEBUG_RWSEMS_WARN_ON(c, sem)
71 #endif
72
73 /*
74  * The definition of the atomic counter in the semaphore:
75  *
76  * Bit  0   - writer locked bit
77  * Bit  1   - waiters present bit
78  * Bit  2   - lock handoff bit
79  * Bits 3-7 - reserved
80  * Bits 8-X - 24-bit (32-bit) or 56-bit (64-bit) reader count
81  *
82  * atomic_long_fetch_add() is used to obtain the reader lock, whereas
83  * atomic_long_cmpxchg() will be used to obtain the writer lock.
84  *
85  * There are three places where the lock handoff bit may be set or cleared.
86  * 1) rwsem_mark_wake() for readers.
87  * 2) rwsem_try_write_lock() for writers.
88  * 3) Error path of rwsem_down_write_slowpath().
89  *
90  * For all the above cases, wait_lock will be held. A writer must also
91  * be the first one in the wait_list to be eligible for setting the handoff
92  * bit. So concurrent setting/clearing of handoff bit is not possible.
93  */
94 #define RWSEM_WRITER_LOCKED     (1UL << 0)
95 #define RWSEM_FLAG_WAITERS      (1UL << 1)
96 #define RWSEM_FLAG_HANDOFF      (1UL << 2)
97
98 #define RWSEM_READER_SHIFT      8
99 #define RWSEM_READER_BIAS       (1UL << RWSEM_READER_SHIFT)
100 #define RWSEM_READER_MASK       (~(RWSEM_READER_BIAS - 1))
101 #define RWSEM_WRITER_MASK       RWSEM_WRITER_LOCKED
102 #define RWSEM_LOCK_MASK         (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
103 #define RWSEM_READ_FAILED_MASK  (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
104                                  RWSEM_FLAG_HANDOFF)
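
/*
 * For illustration, with RWSEM_READER_SHIFT == 8 a few possible count
 * values decompose as follows:
 *
 *   0x000 - unlocked (RWSEM_UNLOCKED_VALUE)
 *   0x001 - one writer holds the lock
 *   0x100 - one reader holds the lock
 *   0x302 - three readers hold the lock and waiters are queued
 *   0x007 - a writer holds the lock, waiters are queued and handoff is set
 */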
105
106 /*
107  * All writes to owner are protected by WRITE_ONCE() to make sure that
108  * store tearing can't happen as optimistic spinners may read and use
109  * the owner value concurrently without the lock. Reads from owner, however,
110  * may not need READ_ONCE() as long as the pointer value is only used
111  * for comparison and isn't being dereferenced.
112  */
113 static inline void rwsem_set_owner(struct rw_semaphore *sem)
114 {
115         WRITE_ONCE(sem->owner, current);
116 }
117
118 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
119 {
120         WRITE_ONCE(sem->owner, NULL);
121 }
122
123 /*
124  * The task_struct pointer of the last owning reader will be left in
125  * the owner field.
126  *
127  * Note that the owner value just indicates the task has owned the rwsem
128  * previously; it may not be the real owner or one of the real owners
129  * anymore by the time that field is examined, so take it with a grain of salt.
130  */
131 static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
132                                             struct task_struct *owner)
133 {
134         unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
135                                                  | RWSEM_ANONYMOUSLY_OWNED;
136
137         WRITE_ONCE(sem->owner, (struct task_struct *)val);
138 }
139
140 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
141 {
142         __rwsem_set_reader_owned(sem, current);
143 }
144
145 /*
146  * Return true if a rwsem waiter can spin on the rwsem's owner
147  * and steal the lock, i.e. the lock is not anonymously owned.
148  * N.B. !owner is considered spinnable.
149  */
150 static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
151 {
152         return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
153 }
154
155 /*
156  * Return true if rwsem is owned by an anonymous writer or readers.
157  */
158 static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
159 {
160         return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
161 }
162
163 #ifdef CONFIG_DEBUG_RWSEMS
164 /*
165  * With CONFIG_DEBUG_RWSEMS configured, this makes sure that if the owner
166  * field of a reader-owned rwsem contains a task pointer, that task is the
167  * real owner or one of the real owners. The only exception is when the
168  * unlock is done by up_read_non_owner().
169  */
170 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
171 {
172         unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
173                                                    | RWSEM_ANONYMOUSLY_OWNED;
174         if (READ_ONCE(sem->owner) == (struct task_struct *)val)
175                 cmpxchg_relaxed((unsigned long *)&sem->owner, val,
176                                 RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
177 }
178 #else
179 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
180 {
181 }
182 #endif
183
184 /*
185  * Guide to the rw_semaphore's count field.
186  *
187  * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
188  * by a writer.
189  *
190  * The lock is owned by readers when
191  * (1) the RWSEM_WRITER_LOCKED bit isn't set in count,
192  * (2) some of the reader bits are set in count, and
193  * (3) the owner field has the RWSEM_READER_OWNED bit set.
194  *
195  * Having some reader bits set is not enough to guarantee a reader-owned
196  * lock, as the readers may be in the process of backing out from the count
197  * and a writer may have just released the lock. So another writer may steal
198  * the lock immediately after that.
199  */
200
201 /*
202  * Initialize an rwsem:
203  */
204 void __init_rwsem(struct rw_semaphore *sem, const char *name,
205                   struct lock_class_key *key)
206 {
207 #ifdef CONFIG_DEBUG_LOCK_ALLOC
208         /*
209          * Make sure we are not reinitializing a held semaphore:
210          */
211         debug_check_no_locks_freed((void *)sem, sizeof(*sem));
212         lockdep_init_map(&sem->dep_map, name, key, 0);
213 #endif
214         atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
215         raw_spin_lock_init(&sem->wait_lock);
216         INIT_LIST_HEAD(&sem->wait_list);
217         sem->owner = NULL;
218 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
219         osq_lock_init(&sem->osq);
220 #endif
221 }
222 EXPORT_SYMBOL(__init_rwsem);
223
224 enum rwsem_waiter_type {
225         RWSEM_WAITING_FOR_WRITE,
226         RWSEM_WAITING_FOR_READ
227 };
228
229 struct rwsem_waiter {
230         struct list_head list;
231         struct task_struct *task;
232         enum rwsem_waiter_type type;
233         unsigned long timeout;
234 };
235 #define rwsem_first_waiter(sem) \
236         list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
237
238 enum rwsem_wake_type {
239         RWSEM_WAKE_ANY,         /* Wake whatever's at head of wait list */
240         RWSEM_WAKE_READERS,     /* Wake readers only */
241         RWSEM_WAKE_READ_OWNED   /* Waker thread holds the read lock */
242 };
243
244 enum writer_wait_state {
245         WRITER_NOT_FIRST,       /* Writer is not first in wait list */
246         WRITER_FIRST,           /* Writer is first in wait list     */
247         WRITER_HANDOFF          /* Writer is first & handoff needed */
248 };
249
250 /*
251  * The typical HZ value is either 250 or 1000. So set the minimum waiting
252  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
253  * queue before initiating the handoff protocol.
254  */
255 #define RWSEM_WAIT_TIMEOUT      DIV_ROUND_UP(HZ, 250)
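
/*
 * For illustration, DIV_ROUND_UP(HZ, 250) works out to:
 *   HZ = 250  -> 1 jiffy   = 4ms
 *   HZ = 1000 -> 4 jiffies = 4ms
 *   HZ = 100  -> 1 jiffy   = 10ms (already longer than 4ms)
 * so a waiter spends at least ~4ms (or one jiffy, whichever is longer) in
 * the queue before it may request a handoff.
 */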
256
257 /*
258  * handle the lock release when processes blocked on it can now run
259  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
260  *   have been set.
261  * - there must be someone on the queue
262  * - the wait_lock must be held by the caller
263  * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
264  *   to actually wake up the blocked task(s) and drop the reference count,
265  *   preferably when the wait_lock is released
266  * - woken process blocks are discarded from the list after having task zeroed
267  * - writers are only marked woken if wake_type is RWSEM_WAKE_ANY
268  */
269 static void rwsem_mark_wake(struct rw_semaphore *sem,
270                             enum rwsem_wake_type wake_type,
271                             struct wake_q_head *wake_q)
272 {
273         struct rwsem_waiter *waiter, *tmp;
274         long oldcount, woken = 0, adjustment = 0;
275         struct list_head wlist;
276
277         lockdep_assert_held(&sem->wait_lock);
278
279         /*
280          * Take a peek at the queue head waiter such that we can determine
281          * the wakeup(s) to perform.
282          */
283         waiter = rwsem_first_waiter(sem);
284
285         if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
286                 if (wake_type == RWSEM_WAKE_ANY) {
287                         /*
288                          * Mark writer at the front of the queue for wakeup.
289                          * Until the task is actually awoken later by
290                          * the caller, other writers are able to steal it.
291                          * Readers, on the other hand, will block as they
292                          * will notice the queued writer.
293                          */
294                         wake_q_add(wake_q, waiter->task);
295                         lockevent_inc(rwsem_wake_writer);
296                 }
297
298                 return;
299         }
300
301         /*
302          * Writers might steal the lock before we grant it to the next reader.
303          * We prefer to do the first reader grant before counting readers
304          * so we can bail out early if a writer stole the lock.
305          */
306         if (wake_type != RWSEM_WAKE_READ_OWNED) {
307                 adjustment = RWSEM_READER_BIAS;
308                 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
309                 if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
310                         /*
311                          * When we've been waiting "too" long (for writers
312                          * to give up the lock), request a HANDOFF to
313                          * force the issue.
314                          */
315                         if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
316                             time_after(jiffies, waiter->timeout)) {
317                                 adjustment -= RWSEM_FLAG_HANDOFF;
318                                 lockevent_inc(rwsem_rlock_handoff);
319                         }
320
321                         atomic_long_add(-adjustment, &sem->count);
322                         return;
323                 }
324                 /*
325                  * Set it to reader-owned to give spinners an early
326                  * indication that readers now have the lock.
327                  */
328                 __rwsem_set_reader_owned(sem, waiter->task);
329         }
330
331         /*
332          * Grant an infinite number of read locks to the readers at the front
333          * of the queue. We know that woken will be at least 1 as we accounted
334          * for above. Note we increment the 'active part' of the count by the
335          * number of readers before waking any processes up.
336          *
337          * We have to do wakeup in 2 passes to prevent the possibility that
338          * the reader count may be decremented before it is incremented. This
339          * is because the to-be-woken waiter may not have slept yet. So it
340          * may see waiter->task cleared, finish its critical section and
341          * do an unlock before the reader count increment.
342          *
343          * 1) Collect the read-waiters in a separate list, count them and
344          *    fully increment the reader count in rwsem.
345          * 2) For each waiter in the new list, clear waiter->task and
346          *    put them into wake_q to be woken up later.
347          */
348         list_for_each_entry(waiter, &sem->wait_list, list) {
349                 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
350                         break;
351
352                 woken++;
353         }
354         list_cut_before(&wlist, &sem->wait_list, &waiter->list);
355
356         adjustment = woken * RWSEM_READER_BIAS - adjustment;
357         lockevent_cond_inc(rwsem_wake_reader, woken);
358         if (list_empty(&sem->wait_list)) {
359                 /* hit end of list above */
360                 adjustment -= RWSEM_FLAG_WAITERS;
361         }
362
363         /*
364          * When we've woken a reader, we no longer need to force writers
365          * to give up the lock and we can clear HANDOFF.
366          */
367         if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
368                 adjustment -= RWSEM_FLAG_HANDOFF;
369
370         if (adjustment)
371                 atomic_long_add(adjustment, &sem->count);
372
373         /* 2nd pass */
374         list_for_each_entry_safe(waiter, tmp, &wlist, list) {
375                 struct task_struct *tsk;
376
377                 tsk = waiter->task;
378                 get_task_struct(tsk);
379
380                 /*
381                  * Ensure calling get_task_struct() before setting the reader
382                  * waiter to nil such that rwsem_down_read_slowpath() cannot
383                  * race with do_exit() by always holding a reference count
384                  * to the task to wakeup.
385                  */
386                 smp_store_release(&waiter->task, NULL);
387                 /*
388                  * Ensure issuing the wakeup (either by us or someone else)
389                  * after setting the reader waiter to nil.
390                  */
391                 wake_q_add_safe(wake_q, tsk);
392         }
393 }
394
395 /*
396  * This function must be called with the sem->wait_lock held to prevent
397  * race conditions between checking the rwsem wait list and setting the
398  * sem->count accordingly.
399  *
400  * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
401  * bit is set or the lock is acquired with handoff bit cleared.
402  */
403 static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem,
404                                         enum writer_wait_state wstate)
405 {
406         long new;
407
408         lockdep_assert_held(&sem->wait_lock);
409
410         do {
411                 bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
412
413                 if (has_handoff && wstate == WRITER_NOT_FIRST)
414                         return false;
415
416                 new = count;
417
418                 if (count & RWSEM_LOCK_MASK) {
419                         if (has_handoff || (wstate != WRITER_HANDOFF))
420                                 return false;
421
422                         new |= RWSEM_FLAG_HANDOFF;
423                 } else {
424                         new |= RWSEM_WRITER_LOCKED;
425                         new &= ~RWSEM_FLAG_HANDOFF;
426
427                         if (list_is_singular(&sem->wait_list))
428                                 new &= ~RWSEM_FLAG_WAITERS;
429                 }
430         } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
431
432         /*
433          * We have either acquired the lock with handoff bit cleared or
434          * set the handoff bit.
435          */
436         if (new & RWSEM_FLAG_HANDOFF)
437                 return false;
438
439         rwsem_set_owner(sem);
440         return true;
441 }
442
443 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
444 /*
445  * Try to acquire the write lock before the writer has been put on the wait queue.
446  */
447 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
448 {
449         long count = atomic_long_read(&sem->count);
450
451         while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
452                 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
453                                         count | RWSEM_WRITER_LOCKED)) {
454                         rwsem_set_owner(sem);
455                         lockevent_inc(rwsem_opt_wlock);
456                         return true;
457                 }
458         }
459         return false;
460 }
461
462 static inline bool owner_on_cpu(struct task_struct *owner)
463 {
464         /*
465          * Due to the lock holder preemption issue, we skip spinning if the
466          * task is not on a CPU or its CPU is preempted.
467          */
468         return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
469 }
470
471 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
472 {
473         struct task_struct *owner;
474         bool ret = true;
475
476         BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
477
478         if (need_resched())
479                 return false;
480
481         rcu_read_lock();
482         owner = READ_ONCE(sem->owner);
483         if (owner) {
484                 ret = is_rwsem_owner_spinnable(owner) &&
485                       owner_on_cpu(owner);
486         }
487         rcu_read_unlock();
488         return ret;
489 }
490
491 /*
492  * The rwsem_spin_on_owner() function returns the following 4 values
493  * depending on the lock owner state.
494  *   OWNER_NULL  : owner is currently NULL
495  *   OWNER_WRITER: when owner changes and is a writer
496  *   OWNER_READER: when owner changes and the new owner may be a reader.
497  *   OWNER_NONSPINNABLE:
498  *                 when optimistic spinning has to stop because either the
499  *                 owner stops running, is unknown, or its timeslice has
500  *                 been used up.
501  */
502 enum owner_state {
503         OWNER_NULL              = 1 << 0,
504         OWNER_WRITER            = 1 << 1,
505         OWNER_READER            = 1 << 2,
506         OWNER_NONSPINNABLE      = 1 << 3,
507 };
508 #define OWNER_SPINNABLE         (OWNER_NULL | OWNER_WRITER)
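
/*
 * Because the owner states above are distinct bits, a caller can test the
 * return value of rwsem_spin_on_owner() with a simple mask, e.g.:
 *   OWNER_WRITER & OWNER_SPINNABLE -> nonzero, keep spinning
 *   OWNER_READER & OWNER_SPINNABLE -> 0, stop spinning
 */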
509
510 static inline enum owner_state rwsem_owner_state(unsigned long owner)
511 {
512         if (!owner)
513                 return OWNER_NULL;
514
515         if (owner & RWSEM_ANONYMOUSLY_OWNED)
516                 return OWNER_NONSPINNABLE;
517
518         if (owner & RWSEM_READER_OWNED)
519                 return OWNER_READER;
520
521         return OWNER_WRITER;
522 }
523
524 static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem)
525 {
526         struct task_struct *tmp, *owner = READ_ONCE(sem->owner);
527         enum owner_state state = rwsem_owner_state((unsigned long)owner);
528
529         if (state != OWNER_WRITER)
530                 return state;
531
532         rcu_read_lock();
533         for (;;) {
534                 if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
535                         state = OWNER_NONSPINNABLE;
536                         break;
537                 }
538
539                 tmp = READ_ONCE(sem->owner);
540                 if (tmp != owner) {
541                         state = rwsem_owner_state((unsigned long)tmp);
542                         break;
543                 }
544
545                 /*
546                  * Ensure we emit the owner->on_cpu dereference _after_
547                  * checking that sem->owner still matches owner. If that check
548                  * fails, owner might point to free()d memory; if it still
549                  * matches, the rcu_read_lock() ensures the memory stays valid.
550                  */
551                 barrier();
552
553                 if (need_resched() || !owner_on_cpu(owner)) {
554                         state = OWNER_NONSPINNABLE;
555                         break;
556                 }
557
558                 cpu_relax();
559         }
560         rcu_read_unlock();
561
562         return state;
563 }
564
565 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
566 {
567         bool taken = false;
568
569         preempt_disable();
570
571         /* sem->wait_lock should not be held when doing optimistic spinning */
572         if (!rwsem_can_spin_on_owner(sem))
573                 goto done;
574
575         if (!osq_lock(&sem->osq))
576                 goto done;
577
578         /*
579          * Optimistically spin on the owner field and attempt to acquire the
580          * lock whenever the owner changes. Spinning will be stopped when:
581          *  1) the owning writer isn't running; or
582          *  2) readers own the lock as we can't determine if they are
583          *     actively running or not.
584          */
585         while (rwsem_spin_on_owner(sem) & OWNER_SPINNABLE) {
586                 /*
587                  * Try to acquire the lock
588                  */
589                 if (rwsem_try_write_lock_unqueued(sem)) {
590                         taken = true;
591                         break;
592                 }
593
594                 /*
595                  * When there's no owner, we might have been preempted between
596                  * the owner acquiring the lock and setting the owner field. If
597                  * we're an RT task, that will live-lock because we won't let
598                  * the owner complete.
599                  */
600                 if (!sem->owner && (need_resched() || rt_task(current)))
601                         break;
602
603                 /*
604                  * The cpu_relax() call is a compiler barrier which forces
605                  * everything in this loop to be re-loaded. We don't need
606                  * memory barriers as we'll eventually observe the right
607                  * values at the cost of a few extra spins.
608                  */
609                 cpu_relax();
610         }
611         osq_unlock(&sem->osq);
612 done:
613         preempt_enable();
614         lockevent_cond_inc(rwsem_opt_fail, !taken);
615         return taken;
616 }
617 #else
618 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
619 {
620         return false;
621 }
622 #endif
623
624 /*
625  * Wait for the read lock to be granted
626  */
627 static struct rw_semaphore __sched *
628 rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
629 {
630         long count, adjustment = -RWSEM_READER_BIAS;
631         struct rwsem_waiter waiter;
632         DEFINE_WAKE_Q(wake_q);
633
634         waiter.task = current;
635         waiter.type = RWSEM_WAITING_FOR_READ;
636         waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
637
638         raw_spin_lock_irq(&sem->wait_lock);
639         if (list_empty(&sem->wait_list)) {
640                 /*
641                  * In case the wait queue is empty and the lock isn't owned
642                  * by a writer or has the handoff bit set, this reader can
643                  * exit the slowpath and return immediately as its
644                  * RWSEM_READER_BIAS has already been set in the count.
645                  */
646                 if (!(atomic_long_read(&sem->count) &
647                      (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
648                         raw_spin_unlock_irq(&sem->wait_lock);
649                         rwsem_set_reader_owned(sem);
650                         lockevent_inc(rwsem_rlock_fast);
651                         return sem;
652                 }
653                 adjustment += RWSEM_FLAG_WAITERS;
654         }
655         list_add_tail(&waiter.list, &sem->wait_list);
656
657         /* we're now waiting on the lock, but no longer actively locking */
658         count = atomic_long_add_return(adjustment, &sem->count);
659
660         /*
661          * If there are no active locks, wake the front queued process(es).
662          *
663          * If there are no writers and we are first in the queue,
664           * wake our own waiter to join the existing active readers!
665          */
666         if (!(count & RWSEM_LOCK_MASK) ||
667            (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS)))
668                 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
669
670         raw_spin_unlock_irq(&sem->wait_lock);
671         wake_up_q(&wake_q);
672
673         /* wait to be given the lock */
674         while (true) {
675                 set_current_state(state);
676                 if (!waiter.task)
677                         break;
678                 if (signal_pending_state(state, current)) {
679                         raw_spin_lock_irq(&sem->wait_lock);
680                         if (waiter.task)
681                                 goto out_nolock;
682                         raw_spin_unlock_irq(&sem->wait_lock);
683                         break;
684                 }
685                 schedule();
686                 lockevent_inc(rwsem_sleep_reader);
687         }
688
689         __set_current_state(TASK_RUNNING);
690         lockevent_inc(rwsem_rlock);
691         return sem;
692 out_nolock:
693         list_del(&waiter.list);
694         if (list_empty(&sem->wait_list)) {
695                 atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
696                                    &sem->count);
697         }
698         raw_spin_unlock_irq(&sem->wait_lock);
699         __set_current_state(TASK_RUNNING);
700         lockevent_inc(rwsem_rlock_fail);
701         return ERR_PTR(-EINTR);
702 }
703
704 /*
705  * Wait until we successfully acquire the write lock
706  */
707 static struct rw_semaphore *
708 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
709 {
710         long count;
711         enum writer_wait_state wstate;
712         struct rwsem_waiter waiter;
713         struct rw_semaphore *ret = sem;
714         DEFINE_WAKE_Q(wake_q);
715
716         /* do optimistic spinning and steal lock if possible */
717         if (rwsem_optimistic_spin(sem))
718                 return sem;
719
720         /*
721          * Optimistic spinning failed, proceed to the slowpath
722          * and block until we can acquire the sem.
723          */
724         waiter.task = current;
725         waiter.type = RWSEM_WAITING_FOR_WRITE;
726         waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
727
728         raw_spin_lock_irq(&sem->wait_lock);
729
730         /* account for this before adding a new element to the list */
731         wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
732
733         list_add_tail(&waiter.list, &sem->wait_list);
734
735         /* we're now waiting on the lock */
736         if (wstate == WRITER_NOT_FIRST) {
737                 count = atomic_long_read(&sem->count);
738
739                 /*
740                  * If there were already threads queued before us and:
741                  *  1) there are no active locks, wake the front
742                  *     queued process(es) as the handoff bit might be set.
743                  *  2) there are no active writers but some readers, the lock
744                  *     must be read owned; so we try to wake any read lock
745                  *     waiters that were queued ahead of us.
746                  */
747                 if (count & RWSEM_WRITER_MASK)
748                         goto wait;
749
750                 rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
751                                         ? RWSEM_WAKE_READERS
752                                         : RWSEM_WAKE_ANY, &wake_q);
753
754                 /*
755                  * The wakeup is normally called _after_ the wait_lock
756                  * is released, but given that we are proactively waking
757                  * readers we can deal with the wake_q overhead as it is
758                  * similar to releasing and taking the wait_lock again
759                  * for attempting rwsem_try_write_lock().
760                  */
761                 wake_up_q(&wake_q);
762
763                 /* We need wake_q again below, reinitialize */
764                 wake_q_init(&wake_q);
765         } else {
766                 count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count);
767         }
768
769 wait:
770         /* wait until we successfully acquire the lock */
771         set_current_state(state);
772         while (true) {
773                 if (rwsem_try_write_lock(count, sem, wstate))
774                         break;
775
776                 raw_spin_unlock_irq(&sem->wait_lock);
777
778                 /* Block until there are no active lockers. */
779                 for (;;) {
780                         if (signal_pending_state(state, current))
781                                 goto out_nolock;
782
783                         schedule();
784                         lockevent_inc(rwsem_sleep_writer);
785                         set_current_state(state);
786                         /*
787                          * If a handoff has been requested (WRITER_HANDOFF),
788                          * unconditionally do a trylock.
789                          */
790                         if (wstate == WRITER_HANDOFF)
791                                 break;
792
793                         if ((wstate == WRITER_NOT_FIRST) &&
794                             (rwsem_first_waiter(sem) == &waiter))
795                                 wstate = WRITER_FIRST;
796
797                         count = atomic_long_read(&sem->count);
798                         if (!(count & RWSEM_LOCK_MASK))
799                                 break;
800
801                         /*
802                          * The setting of the handoff bit is deferred
803                          * until rwsem_try_write_lock() is called.
804                          */
805                         if ((wstate == WRITER_FIRST) && (rt_task(current) ||
806                             time_after(jiffies, waiter.timeout))) {
807                                 wstate = WRITER_HANDOFF;
808                                 lockevent_inc(rwsem_wlock_handoff);
809                                 break;
810                         }
811                 }
812
813                 raw_spin_lock_irq(&sem->wait_lock);
814                 count = atomic_long_read(&sem->count);
815         }
816         __set_current_state(TASK_RUNNING);
817         list_del(&waiter.list);
818         raw_spin_unlock_irq(&sem->wait_lock);
819         lockevent_inc(rwsem_wlock);
820
821         return ret;
822
823 out_nolock:
824         __set_current_state(TASK_RUNNING);
825         raw_spin_lock_irq(&sem->wait_lock);
826         list_del(&waiter.list);
827
828         if (unlikely(wstate == WRITER_HANDOFF))
829                 atomic_long_add(-RWSEM_FLAG_HANDOFF,  &sem->count);
830
831         if (list_empty(&sem->wait_list))
832                 atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
833         else
834                 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
835         raw_spin_unlock_irq(&sem->wait_lock);
836         wake_up_q(&wake_q);
837         lockevent_inc(rwsem_wlock_fail);
838
839         return ERR_PTR(-EINTR);
840 }
841
842 /*
843  * handle waking up a waiter on the semaphore
844  * - up_read/up_write has decremented the active part of count if we come here
845  */
846 static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
847 {
848         unsigned long flags;
849         DEFINE_WAKE_Q(wake_q);
850
851         raw_spin_lock_irqsave(&sem->wait_lock, flags);
852
853         if (!list_empty(&sem->wait_list))
854                 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
855
856         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
857         wake_up_q(&wake_q);
858
859         return sem;
860 }
861
862 /*
863  * downgrade a write lock into a read lock
864  * - caller has converted its write lock into a read lock and seen waiters queued
865  * - just wake up any readers at the front of the queue
866  */
867 static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
868 {
869         unsigned long flags;
870         DEFINE_WAKE_Q(wake_q);
871
872         raw_spin_lock_irqsave(&sem->wait_lock, flags);
873
874         if (!list_empty(&sem->wait_list))
875                 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
876
877         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
878         wake_up_q(&wake_q);
879
880         return sem;
881 }
882
883 /*
884  * lock for reading
885  */
886 inline void __down_read(struct rw_semaphore *sem)
887 {
888         if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
889                         &sem->count) & RWSEM_READ_FAILED_MASK)) {
890                 rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
891                 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
892                                         RWSEM_READER_OWNED), sem);
893         } else {
894                 rwsem_set_reader_owned(sem);
895         }
896 }
897
898 static inline int __down_read_killable(struct rw_semaphore *sem)
899 {
900         if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
901                         &sem->count) & RWSEM_READ_FAILED_MASK)) {
902                 if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
903                         return -EINTR;
904                 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
905                                         RWSEM_READER_OWNED), sem);
906         } else {
907                 rwsem_set_reader_owned(sem);
908         }
909         return 0;
910 }
911
912 static inline int __down_read_trylock(struct rw_semaphore *sem)
913 {
914         /*
915          * Optimize for the case when the rwsem is not locked at all.
916          */
917         long tmp = RWSEM_UNLOCKED_VALUE;
918
919         do {
920                 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
921                                         tmp + RWSEM_READER_BIAS)) {
922                         rwsem_set_reader_owned(sem);
923                         return 1;
924                 }
925         } while (!(tmp & RWSEM_READ_FAILED_MASK));
926         return 0;
927 }
928
929 /*
930  * lock for writing
931  */
932 static inline void __down_write(struct rw_semaphore *sem)
933 {
934         long tmp = RWSEM_UNLOCKED_VALUE;
935
936         if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
937                                                       RWSEM_WRITER_LOCKED)))
938                 rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE);
939         rwsem_set_owner(sem);
940 }
941
942 static inline int __down_write_killable(struct rw_semaphore *sem)
943 {
944         long tmp = RWSEM_UNLOCKED_VALUE;
945
946         if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
947                                                       RWSEM_WRITER_LOCKED))) {
948                 if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE)))
949                         return -EINTR;
950         }
951         rwsem_set_owner(sem);
952         return 0;
953 }
954
955 static inline int __down_write_trylock(struct rw_semaphore *sem)
956 {
957         long tmp = RWSEM_UNLOCKED_VALUE;
958
959         if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
960                                             RWSEM_WRITER_LOCKED)) {
961                 rwsem_set_owner(sem);
962                 return true;
963         }
964         return false;
965 }
966
967 /*
968  * unlock after reading
969  */
970 inline void __up_read(struct rw_semaphore *sem)
971 {
972         long tmp;
973
974         DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem);
975         rwsem_clear_reader_owned(sem);
976         tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
977         if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
978                       RWSEM_FLAG_WAITERS))
979                 rwsem_wake(sem, tmp);
980 }
981
982 /*
983  * unlock after writing
984  */
985 static inline void __up_write(struct rw_semaphore *sem)
986 {
987         long tmp;
988
989         DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
990         rwsem_clear_owner(sem);
991         tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
992         if (unlikely(tmp & RWSEM_FLAG_WAITERS))
993                 rwsem_wake(sem, tmp);
994 }
995
996 /*
997  * downgrade write lock to read lock
998  */
999 static inline void __downgrade_write(struct rw_semaphore *sem)
1000 {
1001         long tmp;
1002
1003         /*
1004          * When downgrading from exclusive to shared ownership,
1005          * anything inside the write-locked region cannot leak
1006          * into the read side. In contrast, anything in the
1007          * read-locked region is ok to be re-ordered into the
1008          * write side. As such, rely on RELEASE semantics.
1009          */
1010         DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
1011         tmp = atomic_long_fetch_add_release(
1012                 -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
1013         rwsem_set_reader_owned(sem);
1014         if (tmp & RWSEM_FLAG_WAITERS)
1015                 rwsem_downgrade_wake(sem);
1016 }
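
/*
 * A worked example of the arithmetic above: -RWSEM_WRITER_LOCKED +
 * RWSEM_READER_BIAS is -0x1 + 0x100 = +0xff, so a count of 0x1 (write
 * locked, no waiters) becomes 0x100 (one reader), and a count of 0x3
 * (write locked with waiters) becomes 0x102, in which case any readers at
 * the front of the wait queue are then woken via rwsem_downgrade_wake().
 */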
1017
1018 /*
1019  * lock for reading
1020  */
1021 void __sched down_read(struct rw_semaphore *sem)
1022 {
1023         might_sleep();
1024         rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1025
1026         LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1027 }
1028 EXPORT_SYMBOL(down_read);
1029
1030 int __sched down_read_killable(struct rw_semaphore *sem)
1031 {
1032         might_sleep();
1033         rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1034
1035         if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1036                 rwsem_release(&sem->dep_map, 1, _RET_IP_);
1037                 return -EINTR;
1038         }
1039
1040         return 0;
1041 }
1042 EXPORT_SYMBOL(down_read_killable);
1043
1044 /*
1045  * trylock for reading -- returns 1 if successful, 0 if contention
1046  */
1047 int down_read_trylock(struct rw_semaphore *sem)
1048 {
1049         int ret = __down_read_trylock(sem);
1050
1051         if (ret == 1)
1052                 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
1053         return ret;
1054 }
1055 EXPORT_SYMBOL(down_read_trylock);
1056
1057 /*
1058  * lock for writing
1059  */
1060 void __sched down_write(struct rw_semaphore *sem)
1061 {
1062         might_sleep();
1063         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1064         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1065 }
1066 EXPORT_SYMBOL(down_write);
1067
1068 /*
1069  * lock for writing
1070  */
1071 int __sched down_write_killable(struct rw_semaphore *sem)
1072 {
1073         might_sleep();
1074         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1075
1076         if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1077                                   __down_write_killable)) {
1078                 rwsem_release(&sem->dep_map, 1, _RET_IP_);
1079                 return -EINTR;
1080         }
1081
1082         return 0;
1083 }
1084 EXPORT_SYMBOL(down_write_killable);
1085
1086 /*
1087  * trylock for writing -- returns 1 if successful, 0 if contention
1088  */
1089 int down_write_trylock(struct rw_semaphore *sem)
1090 {
1091         int ret = __down_write_trylock(sem);
1092
1093         if (ret == 1)
1094                 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
1095
1096         return ret;
1097 }
1098 EXPORT_SYMBOL(down_write_trylock);
1099
1100 /*
1101  * release a read lock
1102  */
1103 void up_read(struct rw_semaphore *sem)
1104 {
1105         rwsem_release(&sem->dep_map, 1, _RET_IP_);
1106         __up_read(sem);
1107 }
1108 EXPORT_SYMBOL(up_read);
1109
1110 /*
1111  * release a write lock
1112  */
1113 void up_write(struct rw_semaphore *sem)
1114 {
1115         rwsem_release(&sem->dep_map, 1, _RET_IP_);
1116         __up_write(sem);
1117 }
1118 EXPORT_SYMBOL(up_write);
1119
1120 /*
1121  * downgrade write lock to read lock
1122  */
1123 void downgrade_write(struct rw_semaphore *sem)
1124 {
1125         lock_downgrade(&sem->dep_map, _RET_IP_);
1126         __downgrade_write(sem);
1127 }
1128 EXPORT_SYMBOL(downgrade_write);
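
/*
 * A minimal usage sketch of the API exported above, guarded out so it is
 * never built. All names below (demo_rwsem, demo_val, demo_read(),
 * demo_publish()) are hypothetical and serve only as illustration.
 */
#if 0
static DECLARE_RWSEM(demo_rwsem);
static int demo_val;

/* Readers may run concurrently with each other. */
static int demo_read(void)
{
	int v;

	down_read(&demo_rwsem);
	v = demo_val;
	up_read(&demo_rwsem);
	return v;
}

/* A writer updates exclusively, then downgrades to let readers in early. */
static void demo_publish(int newval)
{
	down_write(&demo_rwsem);
	demo_val = newval;
	downgrade_write(&demo_rwsem);
	/* we still hold a read lock here; other readers may now enter */
	up_read(&demo_rwsem);
}
#endif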
1129
1130 #ifdef CONFIG_DEBUG_LOCK_ALLOC
1131
1132 void down_read_nested(struct rw_semaphore *sem, int subclass)
1133 {
1134         might_sleep();
1135         rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1136         LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1137 }
1138 EXPORT_SYMBOL(down_read_nested);
1139
1140 void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
1141 {
1142         might_sleep();
1143         rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
1144         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1145 }
1146 EXPORT_SYMBOL(_down_write_nest_lock);
1147
1148 void down_read_non_owner(struct rw_semaphore *sem)
1149 {
1150         might_sleep();
1151         __down_read(sem);
1152         __rwsem_set_reader_owned(sem, NULL);
1153 }
1154 EXPORT_SYMBOL(down_read_non_owner);
1155
1156 void down_write_nested(struct rw_semaphore *sem, int subclass)
1157 {
1158         might_sleep();
1159         rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1160         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1161 }
1162 EXPORT_SYMBOL(down_write_nested);
1163
1164 int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
1165 {
1166         might_sleep();
1167         rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1168
1169         if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1170                                   __down_write_killable)) {
1171                 rwsem_release(&sem->dep_map, 1, _RET_IP_);
1172                 return -EINTR;
1173         }
1174
1175         return 0;
1176 }
1177 EXPORT_SYMBOL(down_write_killable_nested);
1178
1179 void up_read_non_owner(struct rw_semaphore *sem)
1180 {
1181         DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
1182                                 sem);
1183         __up_read(sem);
1184 }
1185 EXPORT_SYMBOL(up_read_non_owner);
1186
1187 #endif