// SPDX-License-Identifier: GPL-2.0
/* kernel/rwsem.c: R/W semaphores, public implementation
 *
 * Written by David Howells (dhowells@redhat.com).
 * Derived from asm-i386/semaphore.h
 *
 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
 * and Michel Lespinasse <walken@google.com>
 *
 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
 *
 * Rwsem count bit fields re-definition and rwsem rearchitecture by
 * Waiman Long <longman@redhat.com> and
 * Peter Zijlstra <peterz@infradead.org>.
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>

#include "rwsem.h"
#include "lock_events.h"
/*
 * The least significant 2 bits of the owner value have the following
 * meanings:
 *  - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
 *  - Bit 1: RWSEM_NONSPINNABLE - Waiters cannot spin on the rwsem
 *    The rwsem is anonymously owned, i.e. the owner(s) cannot be
 *    readily determined. It can be reader owned or the owning writer
 *    is indeterminate.
 *
 * When a writer acquires a rwsem, it puts its task_struct pointer
 * into the owner field. It is cleared after an unlock.
 *
 * When a reader acquires a rwsem, it will also put its task_struct
 * pointer into the owner field with both the RWSEM_READER_OWNED and
 * RWSEM_NONSPINNABLE bits set. On unlock, the owner field will
 * largely be left untouched. So for a free or reader-owned rwsem,
 * the owner value may contain information about the last reader that
 * acquired the rwsem. The anonymous bit is set because that particular
 * reader may or may not still own the lock.
 *
 * That information may be helpful in debugging cases where the system
 * seems to hang on a reader owned rwsem especially if only one reader
 * is involved. Ideally we would like to track all the readers that own
 * a rwsem, but the overhead is simply too big.
 */
#define RWSEM_READER_OWNED	(1UL << 0)
#define RWSEM_NONSPINNABLE	(1UL << 1)
#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
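
/*
 * Illustrative sketch (not part of this file): splitting the packed
 * owner value back into its task pointer using the masks above. The
 * helper name example_owner_task() is hypothetical.
 *
 *	static inline struct task_struct *
 *	example_owner_task(struct rw_semaphore *sem)
 *	{
 *		unsigned long val = (unsigned long)READ_ONCE(sem->owner);
 *
 *		return (struct task_struct *)(val & ~RWSEM_OWNER_FLAGS_MASK);
 *	}
 */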
#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
	if (!debug_locks_silent &&				\
	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
		#c, atomic_long_read(&(sem)->count),		\
		(long)((sem)->owner), (long)current,		\
		list_empty(&(sem)->wait_list) ? "" : "not "))	\
			debug_locks_off();			\
	} while (0)
#else
# define DEBUG_RWSEMS_WARN_ON(c, sem)
#endif
/*
 * The definition of the atomic counter in the semaphore:
 *
 * Bit  0   - writer locked bit
 * Bit  1   - waiters present bit
 * Bit  2   - lock handoff bit
 * Bits 3-7 - reserved
 * Bits 8-X - 24-bit (32-bit) or 56-bit (64-bit) reader count
 *
 * atomic_long_fetch_add() is used to obtain reader lock, whereas
 * atomic_long_cmpxchg() will be used to obtain writer lock.
 *
 * There are three places where the lock handoff bit may be set or cleared.
 * 1) rwsem_mark_wake() for readers.
 * 2) rwsem_try_write_lock() for writers.
 * 3) Error path of rwsem_down_write_slowpath().
 *
 * For all the above cases, wait_lock will be held. A writer must also
 * be the first one in the wait_list to be eligible for setting the handoff
 * bit. So concurrent setting/clearing of handoff bit is not possible.
 */
#define RWSEM_WRITER_LOCKED	(1UL << 0)
#define RWSEM_FLAG_WAITERS	(1UL << 1)
#define RWSEM_FLAG_HANDOFF	(1UL << 2)

#define RWSEM_READER_SHIFT	8
#define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)
#define RWSEM_READER_MASK	(~(RWSEM_READER_BIAS - 1))
#define RWSEM_WRITER_MASK	RWSEM_WRITER_LOCKED
#define RWSEM_LOCK_MASK		(RWSEM_WRITER_MASK|RWSEM_READER_MASK)
#define RWSEM_READ_FAILED_MASK	(RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
				 RWSEM_FLAG_HANDOFF)
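
/*
 * Worked example (sketch only; example_reader_count() is hypothetical):
 * with RWSEM_READER_SHIFT == 8, a count of 0x200 means two active
 * readers and no flag bits set. The active reader count of a count
 * snapshot can be extracted with the masks above:
 *
 *	static inline long example_reader_count(struct rw_semaphore *sem)
 *	{
 *		return (atomic_long_read(&sem->count) & RWSEM_READER_MASK)
 *			>> RWSEM_READER_SHIFT;
 *	}
 */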
/*
 * All writes to owner are protected by WRITE_ONCE() to make sure that
 * store tearing can't happen as optimistic spinners may read and use
 * the owner value concurrently without lock. Read from owner, however,
 * may not need READ_ONCE() as long as the pointer value is only used
 * for comparison and isn't being dereferenced.
 */
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
	WRITE_ONCE(sem->owner, current);
}

static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
	WRITE_ONCE(sem->owner, NULL);
}
/*
 * The task_struct pointer of the last owning reader will be left in
 * the owner field.
 *
 * Note that the owner value just indicates the task has owned the rwsem
 * previously, it may not be the real owner or one of the real owners
 * anymore when that field is examined, so take it with a grain of salt.
 */
static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
					    struct task_struct *owner)
{
	unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
						 | RWSEM_NONSPINNABLE;

	WRITE_ONCE(sem->owner, (struct task_struct *)val);
}

static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
{
	__rwsem_set_reader_owned(sem, current);
}
/*
 * Return true if a rwsem waiter can spin on the rwsem's owner
 * and steal the lock.
 * N.B. !owner is considered spinnable.
 */
static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
{
	return !((unsigned long)owner & RWSEM_NONSPINNABLE);
}
#ifdef CONFIG_DEBUG_RWSEMS
/*
 * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
 * is a task pointer in owner of a reader-owned rwsem, it will be the
 * real owner or one of the real owners. The only exception is when the
 * unlock is done by up_read_non_owner().
 */
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
	unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
						   | RWSEM_NONSPINNABLE;
	if (READ_ONCE(sem->owner) == (struct task_struct *)val)
		cmpxchg_relaxed((unsigned long *)&sem->owner, val,
				RWSEM_READER_OWNED | RWSEM_NONSPINNABLE);
}
#else
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
}
#endif
/*
 * Guide to the rw_semaphore's count field.
 *
 * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
 * by a writer.
 *
 * The lock is owned by readers when
 * (1) the RWSEM_WRITER_LOCKED isn't set in count,
 * (2) some of the reader bits are set in count, and
 * (3) the owner field has RWSEM_READER_OWNED bit set.
 *
 * Having some reader bits set is not enough to guarantee a readers owned
 * lock as the readers may be in the process of backing out from the count
 * and a writer has just released the lock. So another writer may steal
 * the lock immediately after that.
 */
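
/*
 * Illustrative check derived from the rules above (sketch only; the
 * helper name is hypothetical and, per the caveat about backing-out
 * readers, the result is only a hint):
 *
 *	static inline bool example_is_reader_owned(struct rw_semaphore *sem)
 *	{
 *		long c = atomic_long_read(&sem->count);
 *
 *		return !(c & RWSEM_WRITER_LOCKED) &&
 *		       (c & RWSEM_READER_MASK) &&
 *		       ((unsigned long)READ_ONCE(sem->owner) &
 *			RWSEM_READER_OWNED);
 *	}
 */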
/*
 * Initialize an rwsem:
 */
void __init_rwsem(struct rw_semaphore *sem, const char *name,
		  struct lock_class_key *key)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/*
	 * Make sure we are not reinitializing a held semaphore:
	 */
	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
	lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
	raw_spin_lock_init(&sem->wait_lock);
	INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
	osq_lock_init(&sem->osq);
#endif
}
EXPORT_SYMBOL(__init_rwsem);
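
/*
 * Typical caller-side initialization (illustrative sketch; the names
 * example_static_sem, example_obj and example_obj_init are
 * hypothetical): either the static initializer or the init_rwsem()
 * wrapper, which supplies the lockdep class key and ends up in
 * __init_rwsem().
 *
 *	static DECLARE_RWSEM(example_static_sem);
 *
 *	struct example_obj {
 *		struct rw_semaphore sem;
 *	};
 *
 *	static void example_obj_init(struct example_obj *obj)
 *	{
 *		init_rwsem(&obj->sem);
 *	}
 */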
enum rwsem_waiter_type {
	RWSEM_WAITING_FOR_WRITE,
	RWSEM_WAITING_FOR_READ
};

struct rwsem_waiter {
	struct list_head list;
	struct task_struct *task;
	enum rwsem_waiter_type type;
	unsigned long timeout;
};
#define rwsem_first_waiter(sem) \
	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)

enum rwsem_wake_type {
	RWSEM_WAKE_ANY,		/* Wake whatever's at head of wait list */
	RWSEM_WAKE_READERS,	/* Wake readers only */
	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
};

enum writer_wait_state {
	WRITER_NOT_FIRST,	/* Writer is not first in wait list */
	WRITER_FIRST,		/* Writer is first in wait list */
	WRITER_HANDOFF		/* Writer is first & handoff needed */
};
/*
 * The typical HZ value is either 250 or 1000. So set the minimum waiting
 * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
 * queue before initiating the handoff protocol.
 */
#define RWSEM_WAIT_TIMEOUT	DIV_ROUND_UP(HZ, 250)
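
/*
 * For example, with HZ=250 the timeout evaluates to 1 jiffy (4ms) and
 * with HZ=1000 to 4 jiffies (also 4ms), while with HZ=100 it rounds up
 * to 1 jiffy, i.e. 10ms.
 */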
/*
 * Magic number to batch-wakeup waiting readers, even when writers are
 * also present in the queue. This both limits the amount of work the
 * waking thread must do and also prevents any potential counter overflow,
 * however unlikely.
 */
#define MAX_READERS_WAKEUP	0x100
/*
 * handle the lock release when processes blocked on it that can now run
 * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
 *   have been set.
 * - there must be someone on the queue
 * - the wait_lock must be held by the caller
 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
 *   to actually wakeup the blocked task(s) and drop the reference count,
 *   preferably when the wait_lock is released
 * - woken process blocks are discarded from the list after having task zeroed
 * - writers are only marked woken if downgrading is false
 */
static void rwsem_mark_wake(struct rw_semaphore *sem,
			    enum rwsem_wake_type wake_type,
			    struct wake_q_head *wake_q)
{
	struct rwsem_waiter *waiter, *tmp;
	long oldcount, woken = 0, adjustment = 0;
	struct list_head wlist;

	lockdep_assert_held(&sem->wait_lock);

	/*
	 * Take a peek at the queue head waiter such that we can determine
	 * the wakeup(s) to perform.
	 */
	waiter = rwsem_first_waiter(sem);

	if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
		if (wake_type == RWSEM_WAKE_ANY) {
			/*
			 * Mark writer at the front of the queue for wakeup.
			 * Until the task is actually awoken later by the
			 * caller, other writers are able to steal it.
			 * Readers, on the other hand, will block as they
			 * will notice the queued writer.
			 */
			wake_q_add(wake_q, waiter->task);
			lockevent_inc(rwsem_wake_writer);
		}

		return;
	}
	/*
	 * Writers might steal the lock before we grant it to the next reader.
	 * We prefer to do the first reader grant before counting readers
	 * so we can bail out early if a writer stole the lock.
	 */
	if (wake_type != RWSEM_WAKE_READ_OWNED) {
		adjustment = RWSEM_READER_BIAS;
		oldcount = atomic_long_fetch_add(adjustment, &sem->count);
		if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
			/*
			 * When we've been waiting "too" long (for writers
			 * to give up the lock), request a HANDOFF to
			 * force the issue.
			 */
			if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
			    time_after(jiffies, waiter->timeout)) {
				adjustment -= RWSEM_FLAG_HANDOFF;
				lockevent_inc(rwsem_rlock_handoff);
			}

			atomic_long_add(-adjustment, &sem->count);
			return;
		}
		/*
		 * Set it to reader-owned to give spinners an early
		 * indication that readers now have the lock.
		 */
		__rwsem_set_reader_owned(sem, waiter->task);
	}
	/*
	 * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
	 * queue. We know that woken will be at least 1 as we accounted
	 * for above. Note we increment the 'active part' of the count by the
	 * number of readers before waking any processes up.
	 *
	 * This is an adaptation of the phase-fair R/W locks where at the
	 * reader phase (first waiter is a reader), all readers are eligible
	 * to acquire the lock at the same time irrespective of their order
	 * in the queue. The writers acquire the lock according to their
	 * order in the queue.
	 *
	 * We have to do wakeup in 2 passes to prevent the possibility that
	 * the reader count may be decremented before it is incremented. It
	 * is because the to-be-woken waiter may not have slept yet. So it
	 * may see waiter->task got cleared, finish its critical section and
	 * do an unlock before the reader count increment.
	 *
	 * 1) Collect the read-waiters in a separate list, count them and
	 *    fully increment the reader count in rwsem.
	 * 2) For each waiter in the new list, clear waiter->task and
	 *    put them into wake_q to be woken up later.
	 */
	INIT_LIST_HEAD(&wlist);
	list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
			continue;

		woken++;
		list_move_tail(&waiter->list, &wlist);

		/*
		 * Limit # of readers that can be woken up per wakeup call.
		 */
		if (woken >= MAX_READERS_WAKEUP)
			break;
	}

	adjustment = woken * RWSEM_READER_BIAS - adjustment;
	lockevent_cond_inc(rwsem_wake_reader, woken);
	if (list_empty(&sem->wait_list)) {
		/* hit end of list above */
		adjustment -= RWSEM_FLAG_WAITERS;
	}

	/*
	 * When we've woken a reader, we no longer need to force writers
	 * to give up the lock and we can clear HANDOFF.
	 */
	if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
		adjustment -= RWSEM_FLAG_HANDOFF;

	if (adjustment)
		atomic_long_add(adjustment, &sem->count);
	/* 2nd pass */
	list_for_each_entry_safe(waiter, tmp, &wlist, list) {
		struct task_struct *tsk;

		tsk = waiter->task;
		get_task_struct(tsk);

		/*
		 * Ensure calling get_task_struct() before setting the reader
		 * waiter to nil such that rwsem_down_read_slowpath() cannot
		 * race with do_exit() by always holding a reference count
		 * to the task to wakeup.
		 */
		smp_store_release(&waiter->task, NULL);
		/*
		 * Ensure issuing the wakeup (either by us or someone else)
		 * after setting the reader waiter to nil.
		 */
		wake_q_add_safe(wake_q, tsk);
	}
}
/*
 * This function must be called with the sem->wait_lock held to prevent
 * race conditions between checking the rwsem wait list and setting the
 * sem->count accordingly.
 *
 * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
 * bit is set or the lock is acquired with handoff bit cleared.
 */
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
					enum writer_wait_state wstate)
{
	long count, new;

	lockdep_assert_held(&sem->wait_lock);

	count = atomic_long_read(&sem->count);
	do {
		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);

		if (has_handoff && wstate == WRITER_NOT_FIRST)
			return false;

		new = count;

		if (count & RWSEM_LOCK_MASK) {
			if (has_handoff || (wstate != WRITER_HANDOFF))
				return false;

			new |= RWSEM_FLAG_HANDOFF;
		} else {
			new |= RWSEM_WRITER_LOCKED;
			new &= ~RWSEM_FLAG_HANDOFF;

			if (list_is_singular(&sem->wait_list))
				new &= ~RWSEM_FLAG_WAITERS;
		}
	} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));

	/*
	 * We have either acquired the lock with handoff bit cleared or
	 * set the handoff bit.
	 */
	if (new & RWSEM_FLAG_HANDOFF)
		return false;

	rwsem_set_owner(sem);
	return true;
}
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
 * Try to acquire read lock before the reader is put on wait queue.
 * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff
 * is ongoing.
 */
static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem)
{
	long count = atomic_long_read(&sem->count);

	if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))
		return false;

	count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count);
	if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
		rwsem_set_reader_owned(sem);
		lockevent_inc(rwsem_opt_rlock);
		return true;
	}

	/* Back out the change */
	atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
	return false;
}

/*
 * Try to acquire write lock before the writer has been put on wait queue.
 */
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
	long count = atomic_long_read(&sem->count);

	while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
		if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
					count | RWSEM_WRITER_LOCKED)) {
			rwsem_set_owner(sem);
			lockevent_inc(rwsem_opt_wlock);
			return true;
		}
	}
	return false;
}
static inline bool owner_on_cpu(struct task_struct *owner)
{
	/*
	 * Due to lock holder preemption, we skip spinning if the task is
	 * not on a CPU or its CPU has been preempted.
	 */
	return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
}
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
	struct task_struct *owner;
	bool ret = true;

	BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN));

	if (need_resched()) {
		lockevent_inc(rwsem_opt_fail);
		return false;
	}

	preempt_disable();
	rcu_read_lock();
	owner = READ_ONCE(sem->owner);
	if (owner) {
		ret = is_rwsem_owner_spinnable(owner) &&
		      owner_on_cpu(owner);
	}
	rcu_read_unlock();
	preempt_enable();

	lockevent_cond_inc(rwsem_opt_fail, !ret);
	return ret;
}
/*
 * The rwsem_spin_on_owner() function returns the following 4 values
 * depending on the lock owner state.
 *   OWNER_NULL  : owner is currently NULL
 *   OWNER_WRITER: when owner changes and is a writer
 *   OWNER_READER: when owner changes and the new owner may be a reader.
 *   OWNER_NONSPINNABLE:
 *		   when optimistic spinning has to stop because either the
 *		   owner stops running, is unknown, or its timeslice has
 *		   been used up.
 */
enum owner_state {
	OWNER_NULL		= 1 << 0,
	OWNER_WRITER		= 1 << 1,
	OWNER_READER		= 1 << 2,
	OWNER_NONSPINNABLE	= 1 << 3,
};
#define OWNER_SPINNABLE		(OWNER_NULL | OWNER_WRITER)

static inline enum owner_state rwsem_owner_state(unsigned long owner)
{
	if (!owner)
		return OWNER_NULL;

	if (owner & RWSEM_NONSPINNABLE)
		return OWNER_NONSPINNABLE;

	if (owner & RWSEM_READER_OWNED)
		return OWNER_READER;

	return OWNER_WRITER;
}
static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem)
{
	struct task_struct *tmp, *owner = READ_ONCE(sem->owner);
	enum owner_state state = rwsem_owner_state((unsigned long)owner);

	if (state != OWNER_WRITER)
		return state;

	rcu_read_lock();
	for (;;) {
		if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
			state = OWNER_NONSPINNABLE;
			break;
		}

		tmp = READ_ONCE(sem->owner);
		if (tmp != owner) {
			state = rwsem_owner_state((unsigned long)tmp);
			break;
		}

		/*
		 * Ensure we emit the owner->on_cpu dereference _after_
		 * checking sem->owner still matches owner. If that fails,
		 * owner might point to free()d memory; if it still matches,
		 * the rcu_read_lock() ensures the memory stays valid.
		 */
		barrier();

		if (need_resched() || !owner_on_cpu(owner)) {
			state = OWNER_NONSPINNABLE;
			break;
		}

		cpu_relax();
	}
	rcu_read_unlock();

	return state;
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
{
	bool taken = false;
	int prev_owner_state = OWNER_NULL;

	preempt_disable();

	/* sem->wait_lock should not be held when doing optimistic spinning */
	if (!osq_lock(&sem->osq))
		goto done;

	/*
	 * Optimistically spin on the owner field and attempt to acquire the
	 * lock whenever the owner changes. Spinning will be stopped when:
	 *  1) the owning writer isn't running; or
	 *  2) readers own the lock as we can't determine if they are
	 *     actively running or not.
	 */
	for (;;) {
		enum owner_state owner_state = rwsem_spin_on_owner(sem);

		if (!(owner_state & OWNER_SPINNABLE))
			break;

		/*
		 * Try to acquire the lock
		 */
		taken = wlock ? rwsem_try_write_lock_unqueued(sem)
			      : rwsem_try_read_lock_unqueued(sem);

		if (taken)
			break;
		/*
		 * An RT task cannot do optimistic spinning if it cannot
		 * be sure the lock holder is running or live-lock may
		 * happen if the current task and the lock holder happen
		 * to run in the same CPU. However, aborting optimistic
		 * spinning while a NULL owner is detected may miss some
		 * opportunity where spinning can continue without causing
		 * problem.
		 *
		 * There are 2 possible cases where an RT task may be able
		 * to continue spinning.
		 *
		 * 1) The lock owner is in the process of releasing the
		 *    lock, sem->owner is cleared but the lock has not
		 *    been released yet.
		 * 2) The lock was free and owner cleared, but another
		 *    task just comes in and acquires the lock before
		 *    we try to get it. The new owner may be a spinnable
		 *    writer.
		 *
		 * To take advantage of the two scenarios listed above, the
		 * RT task is made to retry one more time to see if it can
		 * acquire the lock or continue spinning on the new owning
		 * writer. Of course, if the time lag is long enough or the
		 * new owner is not a writer or spinnable, the RT task will
		 * quit spinning.
		 *
		 * If the owner is a writer, the need_resched() check is
		 * done inside rwsem_spin_on_owner(). If the owner is not
		 * a writer, need_resched() check needs to be done here.
		 */
		if (owner_state != OWNER_WRITER) {
			if (need_resched())
				break;
			if (rt_task(current) &&
			   (prev_owner_state != OWNER_WRITER))
				break;
		}
		prev_owner_state = owner_state;

		/*
		 * The cpu_relax() call is a compiler barrier which forces
		 * everything in this loop to be re-loaded. We don't need
		 * memory barriers as we'll eventually observe the right
		 * values at the cost of a few extra spins.
		 */
		cpu_relax();
	}
	osq_unlock(&sem->osq);
done:
	preempt_enable();
	lockevent_cond_inc(rwsem_opt_fail, !taken);
	return taken;
}
#else
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
	return false;
}

static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
{
	return false;
}
#endif
/*
 * Wait for the read lock to be granted
 */
static struct rw_semaphore __sched *
rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
{
	long count, adjustment = -RWSEM_READER_BIAS;
	struct rwsem_waiter waiter;
	DEFINE_WAKE_Q(wake_q);

	if (!rwsem_can_spin_on_owner(sem))
		goto queue;

	/*
	 * Undo read bias from down_read() and do optimistic spinning.
	 */
	atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
	adjustment = 0;
	if (rwsem_optimistic_spin(sem, false)) {
		/*
		 * Wake up other readers in the wait list if the front
		 * waiter is a reader.
		 */
		if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) {
			raw_spin_lock_irq(&sem->wait_lock);
			if (!list_empty(&sem->wait_list))
				rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
						&wake_q);
			raw_spin_unlock_irq(&sem->wait_lock);
			wake_up_q(&wake_q);
		}
		return sem;
	}

queue:
	waiter.task = current;
	waiter.type = RWSEM_WAITING_FOR_READ;
	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;

	raw_spin_lock_irq(&sem->wait_lock);
	if (list_empty(&sem->wait_list)) {
		/*
		 * In case the wait queue is empty and the lock isn't owned
		 * by a writer or has the handoff bit set, this reader can
		 * exit the slowpath and return immediately as its
		 * RWSEM_READER_BIAS has already been set in the count.
		 */
		if (adjustment && !(atomic_long_read(&sem->count) &
		     (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
			raw_spin_unlock_irq(&sem->wait_lock);
			rwsem_set_reader_owned(sem);
			lockevent_inc(rwsem_rlock_fast);
			return sem;
		}
		adjustment += RWSEM_FLAG_WAITERS;
	}
	list_add_tail(&waiter.list, &sem->wait_list);

	/* we're now waiting on the lock, but no longer actively locking */
	if (adjustment)
		count = atomic_long_add_return(adjustment, &sem->count);
	else
		count = atomic_long_read(&sem->count);

	/*
	 * If there are no active locks, wake the front queued process(es).
	 *
	 * If there are no writers and we are first in the queue,
	 * wake our own waiter to join the existing active readers!
	 */
	if (!(count & RWSEM_LOCK_MASK) ||
	   (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS)))
		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);

	raw_spin_unlock_irq(&sem->wait_lock);
	wake_up_q(&wake_q);
	/* wait to be given the lock */
	for (;;) {
		set_current_state(state);
		if (!waiter.task)
			break;
		if (signal_pending_state(state, current)) {
			raw_spin_lock_irq(&sem->wait_lock);
			if (waiter.task)
				goto out_nolock;
			raw_spin_unlock_irq(&sem->wait_lock);
			break;
		}
		schedule();
		lockevent_inc(rwsem_sleep_reader);
	}

	__set_current_state(TASK_RUNNING);
	lockevent_inc(rwsem_rlock);
	return sem;

out_nolock:
	list_del(&waiter.list);
	if (list_empty(&sem->wait_list)) {
		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
				   &sem->count);
	}
	raw_spin_unlock_irq(&sem->wait_lock);
	__set_current_state(TASK_RUNNING);
	lockevent_inc(rwsem_rlock_fail);
	return ERR_PTR(-EINTR);
}
/*
 * Wait until we successfully acquire the write lock
 */
static struct rw_semaphore *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
	long count;
	enum writer_wait_state wstate;
	struct rwsem_waiter waiter;
	struct rw_semaphore *ret = sem;
	DEFINE_WAKE_Q(wake_q);

	/* do optimistic spinning and steal lock if possible */
	if (rwsem_can_spin_on_owner(sem) &&
	    rwsem_optimistic_spin(sem, true))
		return sem;

	/*
	 * Optimistic spinning failed, proceed to the slowpath
	 * and block until we can acquire the sem.
	 */
	waiter.task = current;
	waiter.type = RWSEM_WAITING_FOR_WRITE;
	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;

	raw_spin_lock_irq(&sem->wait_lock);

	/* account for this before adding a new element to the list */
	wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
	list_add_tail(&waiter.list, &sem->wait_list);

	/* we're now waiting on the lock */
	if (wstate == WRITER_NOT_FIRST) {
		count = atomic_long_read(&sem->count);

		/*
		 * If there were already threads queued before us and:
		 *  1) there are no active locks, wake the front
		 *     queued process(es) as the handoff bit might be set.
		 *  2) there are no active writers and some readers, the lock
		 *     must be read owned; so we try to wake any read lock
		 *     waiters that were queued ahead of us.
		 */
		if (count & RWSEM_WRITER_MASK)
			goto wait;

		rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
					? RWSEM_WAKE_READERS
					: RWSEM_WAKE_ANY, &wake_q);

		if (!wake_q_empty(&wake_q)) {
			/*
			 * We want to minimize wait_lock hold time especially
			 * when a large number of readers are to be woken up.
			 */
			raw_spin_unlock_irq(&sem->wait_lock);
			wake_up_q(&wake_q);
			wake_q_init(&wake_q);	/* Used again, reinit */
			raw_spin_lock_irq(&sem->wait_lock);
		}
	} else {
		atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
	}

wait:
	/* wait until we successfully acquire the lock */
	set_current_state(state);
	for (;;) {
		if (rwsem_try_write_lock(sem, wstate))
			break;

		raw_spin_unlock_irq(&sem->wait_lock);

		/* Block until there are no active lockers. */
		for (;;) {
			if (signal_pending_state(state, current))
				goto out_nolock;

			schedule();
			lockevent_inc(rwsem_sleep_writer);
			set_current_state(state);
			/*
			 * If HANDOFF bit is set, unconditionally do
			 * a trylock.
			 */
			if (wstate == WRITER_HANDOFF)
				break;

			if ((wstate == WRITER_NOT_FIRST) &&
			    (rwsem_first_waiter(sem) == &waiter))
				wstate = WRITER_FIRST;

			count = atomic_long_read(&sem->count);
			if (!(count & RWSEM_LOCK_MASK))
				break;

			/*
			 * The setting of the handoff bit is deferred
			 * until rwsem_try_write_lock() is called.
			 */
			if ((wstate == WRITER_FIRST) && (rt_task(current) ||
			    time_after(jiffies, waiter.timeout))) {
				wstate = WRITER_HANDOFF;
				lockevent_inc(rwsem_wlock_handoff);
				break;
			}
		}

		raw_spin_lock_irq(&sem->wait_lock);
	}
	__set_current_state(TASK_RUNNING);
	list_del(&waiter.list);
	raw_spin_unlock_irq(&sem->wait_lock);
	lockevent_inc(rwsem_wlock);

	return ret;
out_nolock:
	__set_current_state(TASK_RUNNING);
	raw_spin_lock_irq(&sem->wait_lock);
	list_del(&waiter.list);

	if (unlikely(wstate == WRITER_HANDOFF))
		atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);

	if (list_empty(&sem->wait_list))
		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
	else
		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
	raw_spin_unlock_irq(&sem->wait_lock);
	wake_up_q(&wake_q);
	lockevent_inc(rwsem_wlock_fail);

	return ERR_PTR(-EINTR);
}
/*
 * handle waking up a waiter on the semaphore
 * - up_read/up_write has decremented the active part of count if we come here
 */
static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
{
	unsigned long flags;
	DEFINE_WAKE_Q(wake_q);

	raw_spin_lock_irqsave(&sem->wait_lock, flags);

	if (!list_empty(&sem->wait_list))
		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);

	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
	wake_up_q(&wake_q);

	return sem;
}
/*
 * downgrade a write lock into a read lock
 * - caller incremented waiting part of count and discovered it still negative
 * - just wake up any readers at the front of the queue
 */
static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
{
	unsigned long flags;
	DEFINE_WAKE_Q(wake_q);

	raw_spin_lock_irqsave(&sem->wait_lock, flags);

	if (!list_empty(&sem->wait_list))
		rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);

	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
	wake_up_q(&wake_q);

	return sem;
}
/*
 * lock for reading
 */
inline void __down_read(struct rw_semaphore *sem)
{
	if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
			&sem->count) & RWSEM_READ_FAILED_MASK)) {
		rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
		DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
					RWSEM_READER_OWNED), sem);
	} else {
		rwsem_set_reader_owned(sem);
	}
}

static inline int __down_read_killable(struct rw_semaphore *sem)
{
	if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
			&sem->count) & RWSEM_READ_FAILED_MASK)) {
		if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
			return -EINTR;
		DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
					RWSEM_READER_OWNED), sem);
	} else {
		rwsem_set_reader_owned(sem);
	}
	return 0;
}
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
	/*
	 * Optimize for the case when the rwsem is not locked at all.
	 */
	long tmp = RWSEM_UNLOCKED_VALUE;

	do {
		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
					tmp + RWSEM_READER_BIAS)) {
			rwsem_set_reader_owned(sem);
			return 1;
		}
	} while (!(tmp & RWSEM_READ_FAILED_MASK));
	return 0;
}
/*
 * lock for writing
 */
static inline void __down_write(struct rw_semaphore *sem)
{
	long tmp = RWSEM_UNLOCKED_VALUE;

	if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
						      RWSEM_WRITER_LOCKED)))
		rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE);
	rwsem_set_owner(sem);
}

static inline int __down_write_killable(struct rw_semaphore *sem)
{
	long tmp = RWSEM_UNLOCKED_VALUE;

	if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
						      RWSEM_WRITER_LOCKED))) {
		if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE)))
			return -EINTR;
	}
	rwsem_set_owner(sem);
	return 0;
}
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
	long tmp = RWSEM_UNLOCKED_VALUE;

	if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
					    RWSEM_WRITER_LOCKED)) {
		rwsem_set_owner(sem);
		return true;
	}
	return false;
}
/*
 * unlock after reading
 */
inline void __up_read(struct rw_semaphore *sem)
{
	long tmp;

	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem);
	rwsem_clear_reader_owned(sem);
	tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
	if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
		      RWSEM_FLAG_WAITERS))
		rwsem_wake(sem, tmp);
}
/*
 * unlock after writing
 */
static inline void __up_write(struct rw_semaphore *sem)
{
	long tmp;

	/*
	 * sem->owner may differ from current if the ownership is transferred
	 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
	 */
	DEBUG_RWSEMS_WARN_ON((sem->owner != current) &&
			    !((long)sem->owner & RWSEM_NONSPINNABLE), sem);
	rwsem_clear_owner(sem);
	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
		rwsem_wake(sem, tmp);
}
/*
 * downgrade write lock to read lock
 */
static inline void __downgrade_write(struct rw_semaphore *sem)
{
	long tmp;

	/*
	 * When downgrading from exclusive to shared ownership,
	 * anything inside the write-locked region cannot leak
	 * into the read side. In contrast, anything in the
	 * read-locked region is ok to be re-ordered into the
	 * write side. As such, rely on RELEASE semantics.
	 */
	DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
	tmp = atomic_long_fetch_add_release(
		-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
	rwsem_set_reader_owned(sem);
	if (tmp & RWSEM_FLAG_WAITERS)
		rwsem_downgrade_wake(sem);
}
/*
 * lock for reading
 */
void __sched down_read(struct rw_semaphore *sem)
{
	might_sleep();
	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
}
EXPORT_SYMBOL(down_read);
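
/*
 * Illustrative reader-side usage (sketch; example_sem and the protected
 * data are hypothetical):
 *
 *	down_read(&example_sem);
 *	... read the shared data, no modification allowed ...
 *	up_read(&example_sem);
 */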
int __sched down_read_killable(struct rw_semaphore *sem)
{
	might_sleep();
	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
		rwsem_release(&sem->dep_map, 1, _RET_IP_);
		return -EINTR;
	}

	return 0;
}
EXPORT_SYMBOL(down_read_killable);
/*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */
int down_read_trylock(struct rw_semaphore *sem)
{
	int ret = __down_read_trylock(sem);

	if (ret == 1)
		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
	return ret;
}
EXPORT_SYMBOL(down_read_trylock);
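
/*
 * Illustrative trylock usage (sketch; example_sem is hypothetical):
 *
 *	if (down_read_trylock(&example_sem)) {
 *		... read-side critical section ...
 *		up_read(&example_sem);
 *	} else {
 *		... take a fallback path without blocking ...
 *	}
 */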
/*
 * lock for writing
 */
void __sched down_write(struct rw_semaphore *sem)
{
	might_sleep();
	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
}
EXPORT_SYMBOL(down_write);
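
/*
 * Illustrative writer-side usage (sketch; example_sem is hypothetical):
 *
 *	down_write(&example_sem);
 *	... modify the shared data exclusively ...
 *	up_write(&example_sem);
 */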
/*
 * lock for writing
 */
int __sched down_write_killable(struct rw_semaphore *sem)
{
	might_sleep();
	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
				  __down_write_killable)) {
		rwsem_release(&sem->dep_map, 1, _RET_IP_);
		return -EINTR;
	}

	return 0;
}
EXPORT_SYMBOL(down_write_killable);
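
/*
 * Illustrative usage (sketch; example_sem is hypothetical): the killable
 * variant can fail and must have its return value checked.
 *
 *	if (down_write_killable(&example_sem))
 *		return -EINTR;
 *	... write-side critical section ...
 *	up_write(&example_sem);
 */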
/*
 * trylock for writing -- returns 1 if successful, 0 if contention
 */
int down_write_trylock(struct rw_semaphore *sem)
{
	int ret = __down_write_trylock(sem);

	if (ret == 1)
		rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);

	return ret;
}
EXPORT_SYMBOL(down_write_trylock);
/*
 * release a read lock
 */
void up_read(struct rw_semaphore *sem)
{
	rwsem_release(&sem->dep_map, 1, _RET_IP_);
	__up_read(sem);
}
EXPORT_SYMBOL(up_read);
/*
 * release a write lock
 */
void up_write(struct rw_semaphore *sem)
{
	rwsem_release(&sem->dep_map, 1, _RET_IP_);
	__up_write(sem);
}
EXPORT_SYMBOL(up_write);
/*
 * downgrade write lock to read lock
 */
void downgrade_write(struct rw_semaphore *sem)
{
	lock_downgrade(&sem->dep_map, _RET_IP_);
	__downgrade_write(sem);
}
EXPORT_SYMBOL(downgrade_write);
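
/*
 * Illustrative usage (sketch; example_sem, example_data and
 * compute_new_value() are hypothetical): publish under the write lock,
 * then downgrade so other readers may enter while this task keeps
 * reading with no window where the lock is dropped.
 *
 *	down_write(&example_sem);
 *	example_data = compute_new_value();
 *	downgrade_write(&example_sem);
 *	... use example_data; it cannot change underneath us ...
 *	up_read(&example_sem);
 */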
#ifdef CONFIG_DEBUG_LOCK_ALLOC

void down_read_nested(struct rw_semaphore *sem, int subclass)
{
	might_sleep();
	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
}
EXPORT_SYMBOL(down_read_nested);
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
{
	might_sleep();
	rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
}
EXPORT_SYMBOL(_down_write_nest_lock);
void down_read_non_owner(struct rw_semaphore *sem)
{
	might_sleep();
	__down_read(sem);
	__rwsem_set_reader_owned(sem, NULL);
}
EXPORT_SYMBOL(down_read_non_owner);
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
	might_sleep();
	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
}
EXPORT_SYMBOL(down_write_nested);
int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
{
	might_sleep();
	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);

	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
				  __down_write_killable)) {
		rwsem_release(&sem->dep_map, 1, _RET_IP_);
		return -EINTR;
	}

	return 0;
}
EXPORT_SYMBOL(down_write_killable_nested);
void up_read_non_owner(struct rw_semaphore *sem)
{
	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
				sem);
	__up_read(sem);
}
EXPORT_SYMBOL(up_read_non_owner);

#endif