1 // SPDX-License-Identifier: GPL-2.0
2 /* kernel/locking/rwsem.c: R/W semaphores, public implementation
3  *
4  * Written by David Howells (dhowells@redhat.com).
5  * Derived from asm-i386/semaphore.h
6  *
7  * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8  * and Michel Lespinasse <walken@google.com>
9  *
10  * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11  * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12  *
13  * Rwsem count bit fields re-definition and rwsem rearchitecture by
14  * Waiman Long <longman@redhat.com> and
15  * Peter Zijlstra <peterz@infradead.org>.
16  */
17
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/sched/rt.h>
22 #include <linux/sched/task.h>
23 #include <linux/sched/debug.h>
24 #include <linux/sched/wake_q.h>
25 #include <linux/sched/signal.h>
26 #include <linux/export.h>
27 #include <linux/rwsem.h>
28 #include <linux/atomic.h>
29
30 #include "rwsem.h"
31 #include "lock_events.h"
32
33 /*
34  * The least significant 2 bits of the owner value have the following
35  * meanings when set.
36  *  - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers
37  *  - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned,
38  *    i.e. the owner(s) cannot be readily determined. It may be reader
39  *    owned, or the owning writer may be indeterminate.
40  *
41  * When a writer acquires a rwsem, it puts its task_struct pointer
42  * into the owner field. It is cleared after an unlock.
43  *
44  * When a reader acquires a rwsem, it will also put its task_struct
45  * pointer into the owner field with both the RWSEM_READER_OWNED and
46  * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will
47  * largely be left untouched. So for a free or reader-owned rwsem,
48  * the owner value may contain information about the last reader that
49  * acquired the rwsem. The anonymous bit is set because that particular
50  * reader may or may not still own the lock.
51  *
52  * That information may be helpful in debugging cases where the system
53  * seems to hang on a reader-owned rwsem, especially if only one reader
54  * is involved. Ideally we would like to track all the readers that own
55  * a rwsem, but the overhead is simply too big.
56  */
57 #define RWSEM_READER_OWNED      (1UL << 0)
58 #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
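
/*
 * For illustration: how an owner value decomposes under the scheme above,
 * assuming a hypothetical task_struct at address 0xffff888102340c00:
 *
 *   writer holds the lock:  owner == 0xffff888102340c00  (low 2 bits clear)
 *   reader holds the lock:  owner == 0xffff888102340c03  (bits 0 and 1 set)
 *   after that reader's unlock: owner stays 0xffff888102340c03 (unless
 *   CONFIG_DEBUG_RWSEMS clears the task pointer part), so the stale pointer
 *   only hints at the last reader and the anonymous bit warns that it may
 *   no longer own the lock.
 */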
59
60 #ifdef CONFIG_DEBUG_RWSEMS
61 # define DEBUG_RWSEMS_WARN_ON(c, sem)   do {                    \
62         if (!debug_locks_silent &&                              \
63             WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
64                 #c, atomic_long_read(&(sem)->count),            \
65                 (long)((sem)->owner), (long)current,            \
66                 list_empty(&(sem)->wait_list) ? "" : "not "))   \
67                         debug_locks_off();                      \
68         } while (0)
69 #else
70 # define DEBUG_RWSEMS_WARN_ON(c, sem)
71 #endif
72
73 /*
74  * The definition of the atomic counter in the semaphore:
75  *
76  * Bit  0   - writer locked bit
77  * Bit  1   - waiters present bit
78  * Bit  2   - lock handoff bit
79  * Bits 3-7 - reserved
80  * Bits 8-X - 24-bit (32-bit) or 56-bit (64-bit) reader count
81  *
82  * atomic_long_fetch_add() is used to obtain the reader lock, whereas
83  * atomic_long_cmpxchg() will be used to obtain the writer lock.
84  *
85  * There are three places where the lock handoff bit may be set or cleared.
86  * 1) rwsem_mark_wake() for readers.
87  * 2) rwsem_try_write_lock() for writers.
88  * 3) Error path of rwsem_down_write_slowpath().
89  *
90  * For all the above cases, wait_lock will be held. A writer must also
91  * be the first one in the wait_list to be eligible for setting the handoff
92  * bit. So concurrent setting/clearing of handoff bit is not possible.
93  */
94 #define RWSEM_WRITER_LOCKED     (1UL << 0)
95 #define RWSEM_FLAG_WAITERS      (1UL << 1)
96 #define RWSEM_FLAG_HANDOFF      (1UL << 2)
97
98 #define RWSEM_READER_SHIFT      8
99 #define RWSEM_READER_BIAS       (1UL << RWSEM_READER_SHIFT)
100 #define RWSEM_READER_MASK       (~(RWSEM_READER_BIAS - 1))
101 #define RWSEM_WRITER_MASK       RWSEM_WRITER_LOCKED
102 #define RWSEM_LOCK_MASK         (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
103 #define RWSEM_READ_FAILED_MASK  (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
104                                  RWSEM_FLAG_HANDOFF)
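
/*
 * For illustration, with RWSEM_READER_SHIFT == 8 a few possible count
 * values decompose as follows:
 *
 *   0x000 - unlocked (RWSEM_UNLOCKED_VALUE)
 *   0x001 - one writer holds the lock
 *   0x100 - one reader holds the lock
 *   0x302 - three readers hold the lock and waiters are queued
 *   0x007 - a writer holds the lock, waiters are queued and handoff is set
 */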
105
106 /*
107  * All writes to owner are protected by WRITE_ONCE() to make sure that
108  * store tearing can't happen as optimistic spinners may read and use
109  * the owner value concurrently without the lock. Reads from owner, however,
110  * may not need READ_ONCE() as long as the pointer value is only used
111  * for comparison and isn't being dereferenced.
112  */
113 static inline void rwsem_set_owner(struct rw_semaphore *sem)
114 {
115         WRITE_ONCE(sem->owner, current);
116 }
117
118 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
119 {
120         WRITE_ONCE(sem->owner, NULL);
121 }
122
123 /*
124  * The task_struct pointer of the last owning reader will be left in
125  * the owner field.
126  *
127  * Note that the owner value just indicates the task has owned the rwsem
128  * previously; it may not be the real owner or one of the real owners
129  * anymore by the time that field is examined, so take it with a grain of salt.
130  */
131 static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
132                                             struct task_struct *owner)
133 {
134         unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
135                                                  | RWSEM_ANONYMOUSLY_OWNED;
136
137         WRITE_ONCE(sem->owner, (struct task_struct *)val);
138 }
139
140 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
141 {
142         __rwsem_set_reader_owned(sem, current);
143 }
144
145 /*
146  * Return true if a rwsem waiter can spin on the rwsem's owner
147  * and steal the lock, i.e. the lock is not anonymously owned.
148  * N.B. !owner is considered spinnable.
149  */
150 static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
151 {
152         return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
153 }
154
155 /*
156  * Return true if rwsem is owned by an anonymous writer or readers.
157  */
158 static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
159 {
160         return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
161 }
162
163 #ifdef CONFIG_DEBUG_RWSEMS
164 /*
165  * With CONFIG_DEBUG_RWSEMS configured, this makes sure that if the owner
166  * field of a reader-owned rwsem contains a task pointer, that task is the
167  * real owner or one of the real owners. The only exception is when the
168  * unlock is done by up_read_non_owner().
169  */
170 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
171 {
172         unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
173                                                    | RWSEM_ANONYMOUSLY_OWNED;
174         if (READ_ONCE(sem->owner) == (struct task_struct *)val)
175                 cmpxchg_relaxed((unsigned long *)&sem->owner, val,
176                                 RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
177 }
178 #else
179 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
180 {
181 }
182 #endif
183
184 /*
185  * Guide to the rw_semaphore's count field.
186  *
187  * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
188  * by a writer.
189  *
190  * The lock is owned by readers when
191  * (1) the RWSEM_WRITER_LOCKED bit isn't set in count,
192  * (2) some of the reader bits are set in count, and
193  * (3) the owner field has the RWSEM_READER_OWNED bit set.
194  *
195  * Having some reader bits set is not enough to guarantee a reader-owned
196  * lock, as the readers may be in the process of backing out from the count
197  * and a writer may have just released the lock. So another writer may steal
198  * the lock immediately after that.
199  */
200
201 /*
202  * Initialize an rwsem:
203  */
204 void __init_rwsem(struct rw_semaphore *sem, const char *name,
205                   struct lock_class_key *key)
206 {
207 #ifdef CONFIG_DEBUG_LOCK_ALLOC
208         /*
209          * Make sure we are not reinitializing a held semaphore:
210          */
211         debug_check_no_locks_freed((void *)sem, sizeof(*sem));
212         lockdep_init_map(&sem->dep_map, name, key, 0);
213 #endif
214         atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
215         raw_spin_lock_init(&sem->wait_lock);
216         INIT_LIST_HEAD(&sem->wait_list);
217         sem->owner = NULL;
218 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
219         osq_lock_init(&sem->osq);
220 #endif
221 }
222 EXPORT_SYMBOL(__init_rwsem);
223
224 enum rwsem_waiter_type {
225         RWSEM_WAITING_FOR_WRITE,
226         RWSEM_WAITING_FOR_READ
227 };
228
229 struct rwsem_waiter {
230         struct list_head list;
231         struct task_struct *task;
232         enum rwsem_waiter_type type;
233         unsigned long timeout;
234 };
235 #define rwsem_first_waiter(sem) \
236         list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
237
238 enum rwsem_wake_type {
239         RWSEM_WAKE_ANY,         /* Wake whatever's at head of wait list */
240         RWSEM_WAKE_READERS,     /* Wake readers only */
241         RWSEM_WAKE_READ_OWNED   /* Waker thread holds the read lock */
242 };
243
244 enum writer_wait_state {
245         WRITER_NOT_FIRST,       /* Writer is not first in wait list */
246         WRITER_FIRST,           /* Writer is first in wait list     */
247         WRITER_HANDOFF          /* Writer is first & handoff needed */
248 };
249
250 /*
251  * The typical HZ value is either 250 or 1000. So set the minimum waiting
252  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
253  * queue before initiating the handoff protocol.
254  */
255 #define RWSEM_WAIT_TIMEOUT      DIV_ROUND_UP(HZ, 250)
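
/*
 * For illustration, DIV_ROUND_UP(HZ, 250) works out to:
 *   HZ = 250  -> 1 jiffy   = 4ms
 *   HZ = 1000 -> 4 jiffies = 4ms
 *   HZ = 100  -> 1 jiffy   = 10ms (already longer than 4ms)
 * so a waiter spends at least ~4ms (or one jiffy, whichever is longer) in
 * the queue before it may request a handoff.
 */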
256
257 /*
258  * handle the lock release when processes blocked on it can now run
259  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
260  *   have been set.
261  * - there must be someone on the queue
262  * - the wait_lock must be held by the caller
263  * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
264  *   to actually wake up the blocked task(s) and drop the reference count,
265  *   preferably when the wait_lock is released
266  * - woken process blocks are discarded from the list after having task zeroed
267  * - writers are only marked woken if wake_type is RWSEM_WAKE_ANY
268  */
269 static void rwsem_mark_wake(struct rw_semaphore *sem,
270                             enum rwsem_wake_type wake_type,
271                             struct wake_q_head *wake_q)
272 {
273         struct rwsem_waiter *waiter, *tmp;
274         long oldcount, woken = 0, adjustment = 0;
275         struct list_head wlist;
276
277         lockdep_assert_held(&sem->wait_lock);
278
279         /*
280          * Take a peek at the queue head waiter such that we can determine
281          * the wakeup(s) to perform.
282          */
283         waiter = rwsem_first_waiter(sem);
284
285         if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
286                 if (wake_type == RWSEM_WAKE_ANY) {
287                         /*
288                          * Mark writer at the front of the queue for wakeup.
289                          * Until the task is actually awoken later by
290                          * the caller, other writers are able to steal it.
291                          * Readers, on the other hand, will block as they
292                          * will notice the queued writer.
293                          */
294                         wake_q_add(wake_q, waiter->task);
295                         lockevent_inc(rwsem_wake_writer);
296                 }
297
298                 return;
299         }
300
301         /*
302          * Writers might steal the lock before we grant it to the next reader.
303          * We prefer to do the first reader grant before counting readers
304          * so we can bail out early if a writer stole the lock.
305          */
306         if (wake_type != RWSEM_WAKE_READ_OWNED) {
307                 adjustment = RWSEM_READER_BIAS;
308                 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
309                 if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
310                         /*
311                          * When we've been waiting "too" long (for writers
312                          * to give up the lock), request a HANDOFF to
313                          * force the issue.
314                          */
315                         if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
316                             time_after(jiffies, waiter->timeout)) {
317                                 adjustment -= RWSEM_FLAG_HANDOFF;
318                                 lockevent_inc(rwsem_rlock_handoff);
319                         }
320
321                         atomic_long_add(-adjustment, &sem->count);
322                         return;
323                 }
324                 /*
325                  * Set it to reader-owned to give spinners an early
326                  * indication that readers now have the lock.
327                  */
328                 __rwsem_set_reader_owned(sem, waiter->task);
329         }
330
331         /*
332          * Grant an infinite number of read locks to the readers at the front
333          * of the queue. We know that woken will be at least 1 as we accounted
334          * for above. Note we increment the 'active part' of the count by the
335          * number of readers before waking any processes up.
336          *
337          * We have to do wakeup in 2 passes to prevent the possibility that
338          * the reader count may be decremented before it is incremented. This
339          * is because the to-be-woken waiter may not have slept yet. So it
340          * may see waiter->task cleared, finish its critical section and
341          * do an unlock before the reader count increment.
342          *
343          * 1) Collect the read-waiters in a separate list, count them and
344          *    fully increment the reader count in rwsem.
345          * 2) For each waiter in the new list, clear waiter->task and
346          *    put them into wake_q to be woken up later.
347          */
348         list_for_each_entry(waiter, &sem->wait_list, list) {
349                 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
350                         break;
351
352                 woken++;
353         }
354         list_cut_before(&wlist, &sem->wait_list, &waiter->list);
355
356         adjustment = woken * RWSEM_READER_BIAS - adjustment;
357         lockevent_cond_inc(rwsem_wake_reader, woken);
358         if (list_empty(&sem->wait_list)) {
359                 /* hit end of list above */
360                 adjustment -= RWSEM_FLAG_WAITERS;
361         }
362
363         /*
364          * When we've woken a reader, we no longer need to force writers
365          * to give up the lock and we can clear HANDOFF.
366          */
367         if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
368                 adjustment -= RWSEM_FLAG_HANDOFF;
369
370         if (adjustment)
371                 atomic_long_add(adjustment, &sem->count);
372
373         /* 2nd pass */
374         list_for_each_entry_safe(waiter, tmp, &wlist, list) {
375                 struct task_struct *tsk;
376
377                 tsk = waiter->task;
378                 get_task_struct(tsk);
379
380                 /*
381                  * Ensure calling get_task_struct() before setting the reader
382                  * waiter to nil such that rwsem_down_read_slowpath() cannot
383                  * race with do_exit() by always holding a reference count
384                  * to the task to wakeup.
385                  */
386                 smp_store_release(&waiter->task, NULL);
387                 /*
388                  * Ensure issuing the wakeup (either by us or someone else)
389                  * after setting the reader waiter to nil.
390                  */
391                 wake_q_add_safe(wake_q, tsk);
392         }
393 }
394
395 /*
396  * This function must be called with the sem->wait_lock held to prevent
397  * race conditions between checking the rwsem wait list and setting the
398  * sem->count accordingly.
399  *
400  * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
401  * bit is set or the lock is acquired with handoff bit cleared.
402  */
403 static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem,
404                                         enum writer_wait_state wstate)
405 {
406         long new;
407
408         lockdep_assert_held(&sem->wait_lock);
409
410         do {
411                 bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
412
413                 if (has_handoff && wstate == WRITER_NOT_FIRST)
414                         return false;
415
416                 new = count;
417
418                 if (count & RWSEM_LOCK_MASK) {
419                         if (has_handoff || (wstate != WRITER_HANDOFF))
420                                 return false;
421
422                         new |= RWSEM_FLAG_HANDOFF;
423                 } else {
424                         new |= RWSEM_WRITER_LOCKED;
425                         new &= ~RWSEM_FLAG_HANDOFF;
426
427                         if (list_is_singular(&sem->wait_list))
428                                 new &= ~RWSEM_FLAG_WAITERS;
429                 }
430         } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
431
432         /*
433          * We have either acquired the lock with handoff bit cleared or
434          * set the handoff bit.
435          */
436         if (new & RWSEM_FLAG_HANDOFF)
437                 return false;
438
439         rwsem_set_owner(sem);
440         return true;
441 }
442
443 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
444 /*
445  * Try to acquire the write lock before the writer has been put on the wait queue.
446  */
447 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
448 {
449         long count = atomic_long_read(&sem->count);
450
451         while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
452                 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
453                                         count | RWSEM_WRITER_LOCKED)) {
454                         rwsem_set_owner(sem);
455                         lockevent_inc(rwsem_opt_wlock);
456                         return true;
457                 }
458         }
459         return false;
460 }
461
462 static inline bool owner_on_cpu(struct task_struct *owner)
463 {
464         /*
465          * Due to the lock holder preemption issue, we skip spinning if the
466          * task is not on a CPU or its CPU is preempted.
467          */
468         return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
469 }
470
471 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
472 {
473         struct task_struct *owner;
474         bool ret = true;
475
476         BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
477
478         if (need_resched())
479                 return false;
480
481         rcu_read_lock();
482         owner = READ_ONCE(sem->owner);
483         if (owner) {
484                 ret = is_rwsem_owner_spinnable(owner) &&
485                       owner_on_cpu(owner);
486         }
487         rcu_read_unlock();
488         return ret;
489 }
490
491 /*
492  * The rwsem_spin_on_owner() function returns the following 4 values
493  * depending on the lock owner state.
494  *   OWNER_NULL  : owner is currently NULL
495  *   OWNER_WRITER: when owner changes and is a writer
496  *   OWNER_READER: when owner changes and the new owner may be a reader.
497  *   OWNER_NONSPINNABLE:
498  *                 when optimistic spinning has to stop because either the
499  *                 owner stops running, is unknown, or its timeslice has
500  *                 been used up.
501  */
502 enum owner_state {
503         OWNER_NULL              = 1 << 0,
504         OWNER_WRITER            = 1 << 1,
505         OWNER_READER            = 1 << 2,
506         OWNER_NONSPINNABLE      = 1 << 3,
507 };
508 #define OWNER_SPINNABLE         (OWNER_NULL | OWNER_WRITER)
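
/*
 * Because the owner states above are distinct bits, a caller can test the
 * return value of rwsem_spin_on_owner() with a simple mask, e.g.:
 *   OWNER_WRITER & OWNER_SPINNABLE -> nonzero, keep spinning
 *   OWNER_READER & OWNER_SPINNABLE -> 0, stop spinning
 */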
509
510 static inline enum owner_state rwsem_owner_state(unsigned long owner)
511 {
512         if (!owner)
513                 return OWNER_NULL;
514
515         if (owner & RWSEM_ANONYMOUSLY_OWNED)
516                 return OWNER_NONSPINNABLE;
517
518         if (owner & RWSEM_READER_OWNED)
519                 return OWNER_READER;
520
521         return OWNER_WRITER;
522 }
523
524 static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem)
525 {
526         struct task_struct *tmp, *owner = READ_ONCE(sem->owner);
527         enum owner_state state = rwsem_owner_state((unsigned long)owner);
528
529         if (state != OWNER_WRITER)
530                 return state;
531
532         rcu_read_lock();
533         for (;;) {
534                 if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
535                         state = OWNER_NONSPINNABLE;
536                         break;
537                 }
538
539                 tmp = READ_ONCE(sem->owner);
540                 if (tmp != owner) {
541                         state = rwsem_owner_state((unsigned long)tmp);
542                         break;
543                 }
544
545                 /*
546                  * Ensure we emit the owner->on_cpu dereference _after_
547                  * checking that sem->owner still matches owner. If that check
548                  * fails, owner might point to free()d memory; if it still
549                  * matches, the rcu_read_lock() ensures the memory stays valid.
550                  */
551                 barrier();
552
553                 if (need_resched() || !owner_on_cpu(owner)) {
554                         state = OWNER_NONSPINNABLE;
555                         break;
556                 }
557
558                 cpu_relax();
559         }
560         rcu_read_unlock();
561
562         return state;
563 }
564
565 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
566 {
567         bool taken = false;
568
569         preempt_disable();
570
571         /* sem->wait_lock should not be held when doing optimistic spinning */
572         if (!rwsem_can_spin_on_owner(sem))
573                 goto done;
574
575         if (!osq_lock(&sem->osq))
576                 goto done;
577
578         /*
579          * Optimistically spin on the owner field and attempt to acquire the
580          * lock whenever the owner changes. Spinning will be stopped when:
581          *  1) the owning writer isn't running; or
582          *  2) readers own the lock as we can't determine if they are
583          *     actively running or not.
584          */
585         while (rwsem_spin_on_owner(sem) & OWNER_SPINNABLE) {
586                 /*
587                  * Try to acquire the lock
588                  */
589                 if (rwsem_try_write_lock_unqueued(sem)) {
590                         taken = true;
591                         break;
592                 }
593
594                 /*
595                  * When there's no owner, we might have been preempted between
596                  * the owner acquiring the lock and setting the owner field. If
597                  * we're an RT task, that will live-lock because we won't let
598                  * the owner complete.
599                  */
600                 if (!sem->owner && (need_resched() || rt_task(current)))
601                         break;
602
603                 /*
604                  * The cpu_relax() call is a compiler barrier which forces
605                  * everything in this loop to be re-loaded. We don't need
606                  * memory barriers as we'll eventually observe the right
607                  * values at the cost of a few extra spins.
608                  */
609                 cpu_relax();
610         }
611         osq_unlock(&sem->osq);
612 done:
613         preempt_enable();
614         lockevent_cond_inc(rwsem_opt_fail, !taken);
615         return taken;
616 }
617 #else
618 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
619 {
620         return false;
621 }
622 #endif
623
624 /*
625  * Wait for the read lock to be granted
626  */
627 static struct rw_semaphore __sched *
628 rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
629 {
630         long count, adjustment = -RWSEM_READER_BIAS;
631         struct rwsem_waiter waiter;
632         DEFINE_WAKE_Q(wake_q);
633
634         waiter.task = current;
635         waiter.type = RWSEM_WAITING_FOR_READ;
636         waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
637
638         raw_spin_lock_irq(&sem->wait_lock);
639         if (list_empty(&sem->wait_list)) {
640                 /*
641                  * In case the wait queue is empty and the lock isn't owned
642                  * by a writer or has the handoff bit set, this reader can
643                  * exit the slowpath and return immediately as its
644                  * RWSEM_READER_BIAS has already been set in the count.
645                  */
646                 if (!(atomic_long_read(&sem->count) &
647                      (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
648                         raw_spin_unlock_irq(&sem->wait_lock);
649                         rwsem_set_reader_owned(sem);
650                         lockevent_inc(rwsem_rlock_fast);
651                         return sem;
652                 }
653                 adjustment += RWSEM_FLAG_WAITERS;
654         }
655         list_add_tail(&waiter.list, &sem->wait_list);
656
657         /* we're now waiting on the lock, but no longer actively locking */
658         count = atomic_long_add_return(adjustment, &sem->count);
659
660         /*
661          * If there are no active locks, wake the front queued process(es).
662          *
663          * If there are no writers and we are first in the queue,
664           * wake our own waiter to join the existing active readers!
665          */
666         if (!(count & RWSEM_LOCK_MASK) ||
667            (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS)))
668                 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
669
670         raw_spin_unlock_irq(&sem->wait_lock);
671         wake_up_q(&wake_q);
672
673         /* wait to be given the lock */
674         while (true) {
675                 set_current_state(state);
676                 if (!waiter.task)
677                         break;
678                 if (signal_pending_state(state, current)) {
679                         raw_spin_lock_irq(&sem->wait_lock);
680                         if (waiter.task)
681                                 goto out_nolock;
682                         raw_spin_unlock_irq(&sem->wait_lock);
683                         break;
684                 }
685                 schedule();
686                 lockevent_inc(rwsem_sleep_reader);
687         }
688
689         __set_current_state(TASK_RUNNING);
690         lockevent_inc(rwsem_rlock);
691         return sem;
692 out_nolock:
693         list_del(&waiter.list);
694         if (list_empty(&sem->wait_list)) {
695                 atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
696                                    &sem->count);
697         }
698         raw_spin_unlock_irq(&sem->wait_lock);
699         __set_current_state(TASK_RUNNING);
700         lockevent_inc(rwsem_rlock_fail);
701         return ERR_PTR(-EINTR);
702 }
703
704 /*
705  * Wait until we successfully acquire the write lock
706  */
707 static struct rw_semaphore *
708 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
709 {
710         long count;
711         enum writer_wait_state wstate;
712         struct rwsem_waiter waiter;
713         struct rw_semaphore *ret = sem;
714         DEFINE_WAKE_Q(wake_q);
715
716         /* do optimistic spinning and steal lock if possible */
717         if (rwsem_optimistic_spin(sem))
718                 return sem;
719
720         /*
721          * Optimistic spinning failed, proceed to the slowpath
722          * and block until we can acquire the sem.
723          */
724         waiter.task = current;
725         waiter.type = RWSEM_WAITING_FOR_WRITE;
726         waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
727
728         raw_spin_lock_irq(&sem->wait_lock);
729
730         /* account for this before adding a new element to the list */
731         wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
732
733         list_add_tail(&waiter.list, &sem->wait_list);
734
735         /* we're now waiting on the lock */
736         if (wstate == WRITER_NOT_FIRST) {
737                 count = atomic_long_read(&sem->count);
738
739                 /*
740                  * If there were already threads queued before us and:
741                  *  1) there are no active locks, wake the front
742                  *     queued process(es) as the handoff bit might be set.
743                  *  2) there are no active writers but some readers, the lock
744                  *     must be read owned; so we try to wake any read lock
745                  *     waiters that were queued ahead of us.
746                  */
747                 if (count & RWSEM_WRITER_MASK)
748                         goto wait;
749
750                 rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
751                                         ? RWSEM_WAKE_READERS
752                                         : RWSEM_WAKE_ANY, &wake_q);
753
754                 /*
755                  * The wakeup is normally called _after_ the wait_lock
756                  * is released, but given that we are proactively waking
757                  * readers we can deal with the wake_q overhead as it is
758                  * similar to releasing and taking the wait_lock again
759                  * for attempting rwsem_try_write_lock().
760                  */
761                 wake_up_q(&wake_q);
762
763                 /* We need wake_q again below, reinitialize */
764                 wake_q_init(&wake_q);
765         } else {
766                 count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count);
767         }
768
769 wait:
770         /* wait until we successfully acquire the lock */
771         set_current_state(state);
772         while (true) {
773                 if (rwsem_try_write_lock(count, sem, wstate))
774                         break;
775
776                 raw_spin_unlock_irq(&sem->wait_lock);
777
778                 /* Block until there are no active lockers. */
779                 for (;;) {
780                         if (signal_pending_state(state, current))
781                                 goto out_nolock;
782
783                         schedule();
784                         lockevent_inc(rwsem_sleep_writer);
785                         set_current_state(state);
786                         /*
787                          * If a handoff has been requested (WRITER_HANDOFF),
788                          * unconditionally do a trylock.
789                          */
790                         if (wstate == WRITER_HANDOFF)
791                                 break;
792
793                         if ((wstate == WRITER_NOT_FIRST) &&
794                             (rwsem_first_waiter(sem) == &waiter))
795                                 wstate = WRITER_FIRST;
796
797                         count = atomic_long_read(&sem->count);
798                         if (!(count & RWSEM_LOCK_MASK))
799                                 break;
800
801                         /*
802                          * The setting of the handoff bit is deferred
803                          * until rwsem_try_write_lock() is called.
804                          */
805                         if ((wstate == WRITER_FIRST) && (rt_task(current) ||
806                             time_after(jiffies, waiter.timeout))) {
807                                 wstate = WRITER_HANDOFF;
808                                 lockevent_inc(rwsem_wlock_handoff);
809                                 break;
810                         }
811                 }
812
813                 raw_spin_lock_irq(&sem->wait_lock);
814                 count = atomic_long_read(&sem->count);
815         }
816         __set_current_state(TASK_RUNNING);
817         list_del(&waiter.list);
818         raw_spin_unlock_irq(&sem->wait_lock);
819         lockevent_inc(rwsem_wlock);
820
821         return ret;
822
823 out_nolock:
824         __set_current_state(TASK_RUNNING);
825         raw_spin_lock_irq(&sem->wait_lock);
826         list_del(&waiter.list);
827
828         if (unlikely(wstate == WRITER_HANDOFF))
829                 atomic_long_add(-RWSEM_FLAG_HANDOFF,  &sem->count);
830
831         if (list_empty(&sem->wait_list))
832                 atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
833         else
834                 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
835         raw_spin_unlock_irq(&sem->wait_lock);
836         wake_up_q(&wake_q);
837         lockevent_inc(rwsem_wlock_fail);
838
839         return ERR_PTR(-EINTR);
840 }
841
842 /*
843  * handle waking up a waiter on the semaphore
844  * - up_read/up_write has decremented the active part of count if we come here
845  */
846 static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
847 {
848         unsigned long flags;
849         DEFINE_WAKE_Q(wake_q);
850
851         raw_spin_lock_irqsave(&sem->wait_lock, flags);
852
853         if (!list_empty(&sem->wait_list))
854                 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
855
856         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
857         wake_up_q(&wake_q);
858
859         return sem;
860 }
861
862 /*
863  * downgrade a write lock into a read lock
864  * - caller has converted its write lock into a read lock and seen waiters queued
865  * - just wake up any readers at the front of the queue
866  */
867 static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
868 {
869         unsigned long flags;
870         DEFINE_WAKE_Q(wake_q);
871
872         raw_spin_lock_irqsave(&sem->wait_lock, flags);
873
874         if (!list_empty(&sem->wait_list))
875                 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
876
877         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
878         wake_up_q(&wake_q);
879
880         return sem;
881 }
882
883 /*
884  * lock for reading
885  */
886 inline void __down_read(struct rw_semaphore *sem)
887 {
888         if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
889                         &sem->count) & RWSEM_READ_FAILED_MASK)) {
890                 rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
891                 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
892                                         RWSEM_READER_OWNED), sem);
893         } else {
894                 rwsem_set_reader_owned(sem);
895         }
896 }
897
898 static inline int __down_read_killable(struct rw_semaphore *sem)
899 {
900         if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
901                         &sem->count) & RWSEM_READ_FAILED_MASK)) {
902                 if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
903                         return -EINTR;
904                 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
905                                         RWSEM_READER_OWNED), sem);
906         } else {
907                 rwsem_set_reader_owned(sem);
908         }
909         return 0;
910 }
911
912 static inline int __down_read_trylock(struct rw_semaphore *sem)
913 {
914         /*
915          * Optimize for the case when the rwsem is not locked at all.
916          */
917         long tmp = RWSEM_UNLOCKED_VALUE;
918
919         do {
920                 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
921                                         tmp + RWSEM_READER_BIAS)) {
922                         rwsem_set_reader_owned(sem);
923                         return 1;
924                 }
925         } while (!(tmp & RWSEM_READ_FAILED_MASK));
926         return 0;
927 }
928
929 /*
930  * lock for writing
931  */
932 static inline void __down_write(struct rw_semaphore *sem)
933 {
934         long tmp = RWSEM_UNLOCKED_VALUE;
935
936         if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
937                                                       RWSEM_WRITER_LOCKED)))
938                 rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE);
939         rwsem_set_owner(sem);
940 }
941
942 static inline int __down_write_killable(struct rw_semaphore *sem)
943 {
944         long tmp = RWSEM_UNLOCKED_VALUE;
945
946         if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
947                                                       RWSEM_WRITER_LOCKED))) {
948                 if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE)))
949                         return -EINTR;
950         }
951         rwsem_set_owner(sem);
952         return 0;
953 }
954
955 static inline int __down_write_trylock(struct rw_semaphore *sem)
956 {
957         long tmp = RWSEM_UNLOCKED_VALUE;
958
959         if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
960                                             RWSEM_WRITER_LOCKED)) {
961                 rwsem_set_owner(sem);
962                 return true;
963         }
964         return false;
965 }
966
967 /*
968  * unlock after reading
969  */
970 inline void __up_read(struct rw_semaphore *sem)
971 {
972         long tmp;
973
974         DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem);
975         rwsem_clear_reader_owned(sem);
976         tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
977         if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
978                       RWSEM_FLAG_WAITERS))
979                 rwsem_wake(sem, tmp);
980 }
981
982 /*
983  * unlock after writing
984  */
985 static inline void __up_write(struct rw_semaphore *sem)
986 {
987         long tmp;
988
989         DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
990         rwsem_clear_owner(sem);
991         tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
992         if (unlikely(tmp & RWSEM_FLAG_WAITERS))
993                 rwsem_wake(sem, tmp);
994 }
995
996 /*
997  * downgrade write lock to read lock
998  */
999 static inline void __downgrade_write(struct rw_semaphore *sem)
1000 {
1001         long tmp;
1002
1003         /*
1004          * When downgrading from exclusive to shared ownership,
1005          * anything inside the write-locked region cannot leak
1006          * into the read side. In contrast, anything in the
1007          * read-locked region is ok to be re-ordered into the
1008          * write side. As such, rely on RELEASE semantics.
1009          */
1010         DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
1011         tmp = atomic_long_fetch_add_release(
1012                 -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
1013         rwsem_set_reader_owned(sem);
1014         if (tmp & RWSEM_FLAG_WAITERS)
1015                 rwsem_downgrade_wake(sem);
1016 }
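
/*
 * A worked example of the arithmetic above: -RWSEM_WRITER_LOCKED +
 * RWSEM_READER_BIAS is -0x1 + 0x100 = +0xff, so a count of 0x1 (write
 * locked, no waiters) becomes 0x100 (one reader), and a count of 0x3
 * (write locked with waiters) becomes 0x102, in which case any readers at
 * the front of the wait queue are then woken via rwsem_downgrade_wake().
 */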
1017
1018 /*
1019  * lock for reading
1020  */
1021 void __sched down_read(struct rw_semaphore *sem)
1022 {
1023         might_sleep();
1024         rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1025
1026         LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1027 }
1028 EXPORT_SYMBOL(down_read);
1029
1030 int __sched down_read_killable(struct rw_semaphore *sem)
1031 {
1032         might_sleep();
1033         rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1034
1035         if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1036                 rwsem_release(&sem->dep_map, 1, _RET_IP_);
1037                 return -EINTR;
1038         }
1039
1040         return 0;
1041 }
1042 EXPORT_SYMBOL(down_read_killable);
1043
1044 /*
1045  * trylock for reading -- returns 1 if successful, 0 if contention
1046  */
1047 int down_read_trylock(struct rw_semaphore *sem)
1048 {
1049         int ret = __down_read_trylock(sem);
1050
1051         if (ret == 1)
1052                 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
1053         return ret;
1054 }
1055 EXPORT_SYMBOL(down_read_trylock);
1056
1057 /*
1058  * lock for writing
1059  */
1060 void __sched down_write(struct rw_semaphore *sem)
1061 {
1062         might_sleep();
1063         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1064         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1065 }
1066 EXPORT_SYMBOL(down_write);
1067
1068 /*
1069  * lock for writing
1070  */
1071 int __sched down_write_killable(struct rw_semaphore *sem)
1072 {
1073         might_sleep();
1074         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1075
1076         if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1077                                   __down_write_killable)) {
1078                 rwsem_release(&sem->dep_map, 1, _RET_IP_);
1079                 return -EINTR;
1080         }
1081
1082         return 0;
1083 }
1084 EXPORT_SYMBOL(down_write_killable);
1085
1086 /*
1087  * trylock for writing -- returns 1 if successful, 0 if contention
1088  */
1089 int down_write_trylock(struct rw_semaphore *sem)
1090 {
1091         int ret = __down_write_trylock(sem);
1092
1093         if (ret == 1)
1094                 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
1095
1096         return ret;
1097 }
1098 EXPORT_SYMBOL(down_write_trylock);
1099
1100 /*
1101  * release a read lock
1102  */
1103 void up_read(struct rw_semaphore *sem)
1104 {
1105         rwsem_release(&sem->dep_map, 1, _RET_IP_);
1106         __up_read(sem);
1107 }
1108 EXPORT_SYMBOL(up_read);
1109
1110 /*
1111  * release a write lock
1112  */
1113 void up_write(struct rw_semaphore *sem)
1114 {
1115         rwsem_release(&sem->dep_map, 1, _RET_IP_);
1116         __up_write(sem);
1117 }
1118 EXPORT_SYMBOL(up_write);
1119
1120 /*
1121  * downgrade write lock to read lock
1122  */
1123 void downgrade_write(struct rw_semaphore *sem)
1124 {
1125         lock_downgrade(&sem->dep_map, _RET_IP_);
1126         __downgrade_write(sem);
1127 }
1128 EXPORT_SYMBOL(downgrade_write);
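
/*
 * A minimal usage sketch of the API exported above, guarded out so it is
 * never built. All names below (demo_rwsem, demo_val, demo_read(),
 * demo_publish()) are hypothetical and serve only as illustration.
 */
#if 0
static DECLARE_RWSEM(demo_rwsem);
static int demo_val;

/* Readers may run concurrently with each other. */
static int demo_read(void)
{
	int v;

	down_read(&demo_rwsem);
	v = demo_val;
	up_read(&demo_rwsem);
	return v;
}

/* A writer updates exclusively, then downgrades to let readers in early. */
static void demo_publish(int newval)
{
	down_write(&demo_rwsem);
	demo_val = newval;
	downgrade_write(&demo_rwsem);
	/* we still hold a read lock here; other readers may now enter */
	up_read(&demo_rwsem);
}
#endif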
1129
1130 #ifdef CONFIG_DEBUG_LOCK_ALLOC
1131
1132 void down_read_nested(struct rw_semaphore *sem, int subclass)
1133 {
1134         might_sleep();
1135         rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1136         LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1137 }
1138 EXPORT_SYMBOL(down_read_nested);
1139
1140 void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
1141 {
1142         might_sleep();
1143         rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
1144         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1145 }
1146 EXPORT_SYMBOL(_down_write_nest_lock);
1147
1148 void down_read_non_owner(struct rw_semaphore *sem)
1149 {
1150         might_sleep();
1151         __down_read(sem);
1152         __rwsem_set_reader_owned(sem, NULL);
1153 }
1154 EXPORT_SYMBOL(down_read_non_owner);
1155
1156 void down_write_nested(struct rw_semaphore *sem, int subclass)
1157 {
1158         might_sleep();
1159         rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1160         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1161 }
1162 EXPORT_SYMBOL(down_write_nested);
1163
1164 int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
1165 {
1166         might_sleep();
1167         rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1168
1169         if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1170                                   __down_write_killable)) {
1171                 rwsem_release(&sem->dep_map, 1, _RET_IP_);
1172                 return -EINTR;
1173         }
1174
1175         return 0;
1176 }
1177 EXPORT_SYMBOL(down_write_killable_nested);
1178
1179 void up_read_non_owner(struct rw_semaphore *sem)
1180 {
1181         DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
1182                                 sem);
1183         __up_read(sem);
1184 }
1185 EXPORT_SYMBOL(up_read_non_owner);
1186
1187 #endif