kernel/locking/rwsem.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /* kernel/rwsem.c: R/W semaphores, public implementation
   3  *
   4  * Written by David Howells (dhowells@redhat.com).
   5  * Derived from asm-i386/semaphore.h
   6  *
   7  * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
   8  * and Michel Lespinasse <walken@google.com>
   9  *
  10  * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
  11  * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
  12  *
  13  * Rwsem count bit fields re-definition and rwsem rearchitecture
  14  * by Waiman Long <longman@redhat.com>.
  15  */
  16
  17 #include <linux/types.h>
  18 #include <linux/kernel.h>
  19 #include <linux/sched.h>
  20 #include <linux/sched/rt.h>
  21 #include <linux/sched/task.h>
  22 #include <linux/sched/debug.h>
  23 #include <linux/sched/wake_q.h>
  24 #include <linux/sched/signal.h>
  25 #include <linux/export.h>
  26 #include <linux/rwsem.h>
  27 #include <linux/atomic.h>
  28
  29 #include "rwsem.h"
  30 #include "lock_events.h"
  31
  32 /*
  33  * The least significant 2 bits of the owner value have the following
  34  * meanings when set.
  35  *  - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers
  36  *  - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned,
  37  *    i.e. the owner(s) cannot be readily determined. It can be reader
  38  *    owned or the owning writer is indeterminate.
  39  *
  40  * When a writer acquires a rwsem, it puts its task_struct pointer
  41  * into the owner field. It is cleared after an unlock.
  42  *
  43  * When a reader acquires a rwsem, it also puts its task_struct
  44  * pointer into the owner field with both the RWSEM_READER_OWNED and
  45  * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will
  46  * largely be left untouched. So for a free or reader-owned rwsem,
  47  * the owner value may contain information about the last reader that
  48  * acquired the rwsem. The anonymous bit is set because that particular
  49  * reader may or may not still own the lock.
  50  *
  51  * That information may be helpful in debugging cases where the system
  52  * seems to hang on a reader owned rwsem especially if only one reader
  53  * is involved. Ideally we would like to track all the readers that own
  54  * a rwsem, but the overhead is simply too big.
      *
      * Both flags are OR-ed into the pointer value stored in sem->owner, so
      * the field must be masked/tested before it is treated as a task pointer.
  55  */
  56 #define RWSEM_READER_OWNED      (1UL << 0)
  57 #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
  58
  59 #ifdef CONFIG_DEBUG_RWSEMS
     /*
      * Debug assertion on rwsem state. When condition @c is true and
      * debug_locks_silent is not set, WARN_ONCE() reports the stringified
      * condition together with the count, the raw owner value, the current
      * task pointer and whether the wait list is empty, and debug_locks_off()
      * is then called.
      */
  60 # define DEBUG_RWSEMS_WARN_ON(c, sem)   do {                    \
  61         if (!debug_locks_silent &&                              \
  62             WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
  63                 #c, atomic_long_read(&(sem)->count),            \
  64                 (long)((sem)->owner), (long)current,            \
  65                 list_empty(&(sem)->wait_list) ? "" : "not "))   \
  66                         debug_locks_off();                      \
  67         } while (0)
  68 #else
     /* Without CONFIG_DEBUG_RWSEMS the check expands to nothing. */
  69 # define DEBUG_RWSEMS_WARN_ON(c, sem)
  70 #endif
  71
  72 /*
  73  * The definition of the atomic counter in the semaphore:
  74  *
  75  * Bit  0   - writer locked bit
  76  * Bit  1   - waiters present bit
  77  * Bits 2-7 - reserved
  78  * Bits 8-X - 24-bit (32-bit) or 56-bit reader count
  79  *
  80  * atomic_long_fetch_add() is used to obtain reader lock, whereas
  81  * atomic_long_cmpxchg() will be used to obtain writer lock.
      *
      * Derived masks:
      *  - RWSEM_READER_BIAS is the increment for one reader (1 << shift).
      *  - RWSEM_READER_MASK selects every bit from RWSEM_READER_SHIFT upwards.
      *  - RWSEM_LOCK_MASK is the writer bit plus all reader bits; a count with
      *    none of them set has neither a writer nor a reader bias in it.
      *  - RWSEM_READ_FAILED_MASK is the writer bit plus the waiters bit; the
      *    reader fast paths test the old count against it and leave the fast
      *    path when either bit was set.
  82  */
  83 #define RWSEM_WRITER_LOCKED     (1UL << 0)
  84 #define RWSEM_FLAG_WAITERS      (1UL << 1)
  85 #define RWSEM_READER_SHIFT      8
  86 #define RWSEM_READER_BIAS       (1UL << RWSEM_READER_SHIFT)
  87 #define RWSEM_READER_MASK       (~(RWSEM_READER_BIAS - 1))
  88 #define RWSEM_WRITER_MASK       RWSEM_WRITER_LOCKED
  89 #define RWSEM_LOCK_MASK         (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
  90 #define RWSEM_READ_FAILED_MASK  (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS)
  91
  92 /*
  93  * All writes to owner are protected by WRITE_ONCE() to make sure that
  94  * store tearing can't happen as optimistic spinners may read and use
  95  * the owner value concurrently without lock. Read from owner, however,
  96  * may not need READ_ONCE() as long as the pointer value is only used
  97  * for comparison and isn't being dereferenced.
  98  */
  99 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 100 {
 101         WRITE_ONCE(sem->owner, current);
 102 }
 103
     /*
      * Reset the owner field of @sem to NULL. The store uses WRITE_ONCE() so
      * that a concurrent lockless reader of sem->owner never sees a torn value.
      */
 104 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
 105 {
 106         WRITE_ONCE(sem->owner, NULL);
 107 }
 108
 109 /*
 110  * The task_struct pointer of the last owning reader will be left in
 111  * the owner field.
 112  *
 113  * Note that the owner value just indicates the task has owned the rwsem
 114  * previously, it may not be the real owner or one of the real owners
 115  * anymore when that field is examined, so take it with a grain of salt.
 116  */
 117 static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
 118                                             struct task_struct *owner)
 119 {
 120         unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
 121                                                  | RWSEM_ANONYMOUSLY_OWNED;
 122
 123         WRITE_ONCE(sem->owner, (struct task_struct *)val);
 124 }
 125
     /*
      * Mark @sem as reader-owned on behalf of the calling task, i.e.
      * __rwsem_set_reader_owned() with owner == current.
      */
 126 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 127 {
 128         __rwsem_set_reader_owned(sem, current);
 129 }
 130
 131 /*
 132  * Return true if the a rwsem waiter can spin on the rwsem's owner
 133  * and steal the lock, i.e. the lock is not anonymously owned.
 134  * N.B. !owner is considered spinnable.
 135  */
 136 static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
 137 {
 138         return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
 139 }
 140
 141 /*
 142  * Return true if rwsem is owned by an anonymous writer or readers.
 143  */
 144 static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
 145 {
 146         return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
 147 }
 148
 #ifdef CONFIG_DEBUG_RWSEMS
 /*
  * Debug-only helper for the reader unlock path. If the owner field still
  * holds the current task's reader-tagged pointer, replace it with the
  * bare reader flags. With CONFIG_DEBUG_RWSEMS this keeps the invariant
  * that a task pointer found in the owner field of a reader-owned rwsem
  * belongs to the real owner or one of the real owners; the only exception
  * is an unlock done through up_read_non_owner().
  */
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {
         unsigned long flags = RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED;
         unsigned long mine = (unsigned long)current | flags;

         /* somebody else's pointer (or none) is recorded: leave it alone */
         if (READ_ONCE(sem->owner) != (struct task_struct *)mine)
                 return;

         cmpxchg_relaxed((unsigned long *)&sem->owner, mine, flags);
 }
 #else
 /* Without CONFIG_DEBUG_RWSEMS the owner field is left as it is. */
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {
 }
 #endif
 169
 170 /*
 171  * Guide to the rw_semaphore's count field.
 172  *
 173  * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
 174  * by a writer.
 175  *
 176  * The lock is owned by readers when
 177  * (1) the RWSEM_WRITER_LOCKED isn't set in count,
 178  * (2) some of the reader bits are set in count, and
 179  * (3) the owner field has RWSEM_READER_OWNED bit set.
 180  *
 181  * Having some reader bits set is not enough to guarantee a readers owned
 182  * lock as the readers may be in the process of backing out from the count
 183  * and a writer has just released the lock. So another writer may steal
 184  * the lock immediately after that.
 185  */
 186
 187 /*
 188  * Initialize an rwsem:
      *
      * @sem:  semaphore to set up
      * @name: name handed to lockdep_init_map()
      * @key:  lock class key handed to lockdep_init_map()
      *
      * @name and @key are only used when CONFIG_DEBUG_LOCK_ALLOC is enabled.
      * The count is set to RWSEM_UNLOCKED_VALUE, the wait_lock and an empty
      * wait_list are initialized and the owner is cleared. The OSQ is only
      * initialized when CONFIG_RWSEM_SPIN_ON_OWNER is enabled.
 189  */
 190 void __init_rwsem(struct rw_semaphore *sem, const char *name,
 191                   struct lock_class_key *key)
 192 {
 193 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 194         /*
 195          * Make sure we are not reinitializing a held semaphore:
 196          */
 197         debug_check_no_locks_freed((void *)sem, sizeof(*sem));
 198         lockdep_init_map(&sem->dep_map, name, key, 0);
 199 #endif
 200         atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
 201         raw_spin_lock_init(&sem->wait_lock);
 202         INIT_LIST_HEAD(&sem->wait_list);
 203         sem->owner = NULL;
 204 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 205         osq_lock_init(&sem->osq);
 206 #endif
 207 }
 208
 209 EXPORT_SYMBOL(__init_rwsem);
 210
     /* Kind of lock a queued waiter is waiting for. */
 211 enum rwsem_waiter_type {
 212         RWSEM_WAITING_FOR_WRITE,        /* waiter wants the write lock */
 213         RWSEM_WAITING_FOR_READ          /* waiter wants a read lock */
 214 };
 215
     /*
      * One entry on sem->wait_list. The slow paths in this file declare it
      * as a local variable of the waiting task and queue it with
      * list_add_tail().
      */
 216 struct rwsem_waiter {
 217         struct list_head list;          /* link in sem->wait_list */
 218         struct task_struct *task;       /* waiting task; the waker stores NULL here when it grants a read lock */
 219         enum rwsem_waiter_type type;    /* what the task is waiting for */
 220 };
 221
     /* Selects which waiters __rwsem_mark_wake() is allowed to wake. */
 222 enum rwsem_wake_type {
 223         RWSEM_WAKE_ANY,         /* Wake whatever's at head of wait list */
 224         RWSEM_WAKE_READERS,     /* Wake readers only */
 225         RWSEM_WAKE_READ_OWNED   /* Waker thread holds the read lock */
 226 };
 227
 228 /*
 229  * handle the lock release when processes blocked on it that can now run
 230  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
 231  *   have been set.
 232  * - there must be someone on the queue
 233  * - the wait_lock must be held by the caller
 234  * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
 235  *   to actually wakeup the blocked task(s) and drop the reference count,
 236  *   preferably when the wait_lock is released
 237  * - woken process blocks are discarded from the list after having task zeroed
 238  * - a writer at the head of the queue is only marked for wakeup when
      *   wake_type is RWSEM_WAKE_ANY
      *
      * @sem:       semaphore whose wait list is processed
      * @wake_type: which waiters may be woken (see enum rwsem_wake_type)
      * @wake_q:    wake queue that collects the tasks to wake
 239  */
 240 static void __rwsem_mark_wake(struct rw_semaphore *sem,
 241                               enum rwsem_wake_type wake_type,
 242                               struct wake_q_head *wake_q)
 243 {
 244         struct rwsem_waiter *waiter, *tmp;
 245         long oldcount, woken = 0, adjustment = 0;
 246         struct list_head wlist;
 247
 248         /*
 249          * Take a peek at the queue head waiter such that we can determine
 250          * the wakeup(s) to perform.
 251          */
 252         waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
 253
 254         if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
 255                 if (wake_type == RWSEM_WAKE_ANY) {
 256                         /*
 257                          * Mark writer at the front of the queue for wakeup.
 258                          * Until the task is actually awoken later by
 259                          * the caller, other writers are able to steal it.
 260                          * Readers, on the other hand, will block as they
 261                          * will notice the queued writer.
 262                          */
 263                         wake_q_add(wake_q, waiter->task);
 264                         lockevent_inc(rwsem_wake_writer);
 265                 }
 266
                  /* The writer stays queued; it takes the lock itself once it runs. */
 267                 return;
 268         }
 269
 270         /*
 271          * Writers might steal the lock before we grant it to the next reader.
 272          * We prefer to do the first reader grant before counting readers
 273          * so we can bail out early if a writer stole the lock.
 274          */
 275         if (wake_type != RWSEM_WAKE_READ_OWNED) {
 276                 adjustment = RWSEM_READER_BIAS;
 277                 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
 278                 if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
                          /* A writer holds the lock: undo the bias and wake nobody. */
 279                         atomic_long_sub(adjustment, &sem->count);
 280                         return;
 281                 }
 282                 /*
 283                  * Set it to reader-owned to give spinners an early
 284                  * indication that readers now have the lock.
 285                  */
 286                 __rwsem_set_reader_owned(sem, waiter->task);
 287         }
 288
 289         /*
 290          * Grant an infinite number of read locks to the readers at the front
 291          * of the queue. We know that woken will be at least 1 as we accounted
 292          * for above. Note we increment the 'active part' of the count by the
 293          * number of readers before waking any processes up.
 294          *
 295          * We have to do wakeup in 2 passes to prevent the possibility that
 296          * the reader count may be decremented before it is incremented. It
 297          * is because the to-be-woken waiter may not have slept yet. So it
 298          * may see waiter->task got cleared, finish its critical section and
 299          * do an unlock before the reader count increment.
 300          *
 301          * 1) Collect the read-waiters in a separate list, count them and
 302          *    fully increment the reader count in rwsem.
 303          * 2) For each waiters in the new list, clear waiter->task and
 304          *    put them into wake_q to be woken up later.
 305          */
 306         list_for_each_entry(waiter, &sem->wait_list, list) {
 307                 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
 308                         break;
 309
 310                 woken++;
 311         }
              /* Move the read-waiters counted above from wait_list onto wlist. */
 312         list_cut_before(&wlist, &sem->wait_list, &waiter->list);
 313
              /*
               * 'adjustment' still holds the single RWSEM_READER_BIAS added up
               * front (0 for RWSEM_WAKE_READ_OWNED), so only the remainder is
               * applied to the count below.
               */
 314         adjustment = woken * RWSEM_READER_BIAS - adjustment;
 315         lockevent_cond_inc(rwsem_wake_reader, woken);
 316         if (list_empty(&sem->wait_list)) {
 317                 /* hit end of list above */
 318                 adjustment -= RWSEM_FLAG_WAITERS;
 319         }
 320
 321         if (adjustment)
 322                 atomic_long_add(adjustment, &sem->count);
 323
 324         /* 2nd pass */
 325         list_for_each_entry_safe(waiter, tmp, &wlist, list) {
 326                 struct task_struct *tsk;
 327
 328                 tsk = waiter->task;
 329                 get_task_struct(tsk);
 330
 331                 /*
 332                  * Ensure calling get_task_struct() before setting the reader
 333                  * waiter to nil such that rwsem_down_read_failed() cannot
 334                  * race with do_exit() by always holding a reference count
 335                  * to the task to wakeup.
 336                  */
 337                 smp_store_release(&waiter->task, NULL);
 338                 /*
 339                  * Ensure issuing the wakeup (either by us or someone else)
 340                  * after setting the reader waiter to nil.
 341                  */
 342                 wake_q_add_safe(wake_q, tsk);
 343         }
 344 }
 345
 346 /*
 347  * This function must be called with the sem->wait_lock held to prevent
 348  * race conditions between checking the rwsem wait list and setting the
 349  * sem->count accordingly.
 350  */
 351 static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 352 {
 353         long new;
 354
 355         if (count & RWSEM_LOCK_MASK)
 356                 return false;
 357
 358         new = count + RWSEM_WRITER_LOCKED -
 359              (list_is_singular(&sem->wait_list) ? RWSEM_FLAG_WAITERS : 0);
 360
 361         if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) {
 362                 rwsem_set_owner(sem);
 363                 return true;
 364         }
 365
 366         return false;
 367 }
 368
 369 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 370 /*
 371  * Try to acquire write lock before the writer has been put on wait queue.
 372  */
 373 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 374 {
 375         long count = atomic_long_read(&sem->count);
 376
 377         while (!(count & RWSEM_LOCK_MASK)) {
 378                 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
 379                                         count + RWSEM_WRITER_LOCKED)) {
 380                         rwsem_set_owner(sem);
 381                         lockevent_inc(rwsem_opt_wlock);
 382                         return true;
 383                 }
 384         }
 385         return false;
 386 }
 387
 388 static inline bool owner_on_cpu(struct task_struct *owner)
 389 {
 390         /*
 391          * As lock holder preemption issue, we both skip spinning if
 392          * task is not on cpu or its cpu is preempted
 393          */
 394         return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 395 }
 396
 397 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 398 {
 399         struct task_struct *owner;
 400         bool ret = true;
 401
 402         BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
 403
 404         if (need_resched())
 405                 return false;
 406
 407         rcu_read_lock();
 408         owner = READ_ONCE(sem->owner);
 409         if (owner) {
 410                 ret = is_rwsem_owner_spinnable(owner) &&
 411                       owner_on_cpu(owner);
 412         }
 413         rcu_read_unlock();
 414         return ret;
 415 }
 416
 417 /*
 418  * Return true only if we can still spin on the owner field of the rwsem.
      *
      * Spins for as long as sem->owner keeps the value sampled on entry.
      * Returns false right away when that owner is not spinnable, and also
      * when a reschedule is needed or the owner stops running on a cpu.
      * Once the owner field changes, or if it was NULL to begin with, the
      * result is whether the owner value read at that point is spinnable.
 419  */
 420 static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 421 {
 422         struct task_struct *owner = READ_ONCE(sem->owner);
 423
 424         if (!is_rwsem_owner_spinnable(owner))
 425                 return false;
 426
 427         rcu_read_lock();
 428         while (owner && (READ_ONCE(sem->owner) == owner)) {
 429                 /*
 430                  * Ensure we emit the owner->on_cpu, dereference _after_
 431                  * checking sem->owner still matches owner, if that fails,
 432                  * owner might point to free()d memory, if it still matches,
 433                  * the rcu_read_lock() ensures the memory stays valid.
 434                  */
 435                 barrier();
 436
 437                 /*
 438                  * abort spinning when need_resched or owner is not running or
 439                  * owner's cpu is preempted.
 440                  */
 441                 if (need_resched() || !owner_on_cpu(owner)) {
 442                         rcu_read_unlock();
 443                         return false;
 444                 }
 445
 446                 cpu_relax();
 447         }
 448         rcu_read_unlock();
 449
 450         /*
 451          * If there is a new owner or the owner is not set, we continue
 452          * spinning.
 453          */
 454         return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
 455 }
 456
     /*
      * Try to take the write lock by spinning instead of sleeping.
      *
      * Runs with preemption disabled and, while spinning, holds sem->osq.
      * Returns true if the write lock was acquired through
      * rwsem_try_write_lock_unqueued(), false if the caller has to fall back
      * to the slowpath. The rwsem_opt_fail lock event is counted on failure.
      */
 457 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 458 {
 459         bool taken = false;
 460
 461         preempt_disable();
 462
 463         /* sem->wait_lock should not be held when doing optimistic spinning */
 464         if (!rwsem_can_spin_on_owner(sem))
 465                 goto done;
 466
              /* osq_lock() failed: give up without spinning on the owner */
 467         if (!osq_lock(&sem->osq))
 468                 goto done;
 469
 470         /*
 471          * Optimistically spin on the owner field and attempt to acquire the
 472          * lock whenever the owner changes. Spinning will be stopped when:
 473          *  1) the owning writer isn't running; or
 474          *  2) readers own the lock as we can't determine if they are
 475          *     actively running or not.
 476          */
 477         while (rwsem_spin_on_owner(sem)) {
 478                 /*
 479                  * Try to acquire the lock
 480                  */
 481                 if (rwsem_try_write_lock_unqueued(sem)) {
 482                         taken = true;
 483                         break;
 484                 }
 485
 486                 /*
 487                  * When there's no owner, we might have preempted between the
 488                  * owner acquiring the lock and setting the owner field. If
 489                  * we're an RT task that will live-lock because we won't let
 490                  * the owner complete.
 491                  */
 492                 if (!sem->owner && (need_resched() || rt_task(current)))
 493                         break;
 494
 495                 /*
 496                  * The cpu_relax() call is a compiler barrier which forces
 497                  * everything in this loop to be re-loaded. We don't need
 498                  * memory barriers as we'll eventually observe the right
 499                  * values at the cost of a few extra spins.
 500                  */
 501                 cpu_relax();
 502         }
 503         osq_unlock(&sem->osq);
 504 done:
 505         preempt_enable();
 506         lockevent_cond_inc(rwsem_opt_fail, !taken);
 507         return taken;
 508 }
 509 #else
 510 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 511 {
              /*
               * Built without CONFIG_RWSEM_SPIN_ON_OWNER: never spin, so the
               * caller always proceeds as if spinning had failed.
               */
 512         return false;
 513 }
 514 #endif
 515
 516 /*
 517  * Wait for the read lock to be granted
      *
      * @sem:   semaphore to read-lock
      * @state: task state to wait in; also passed to signal_pending_state()
      *         to decide whether a pending signal aborts the wait
      *
      * Returns @sem once the read lock is held, or ERR_PTR(-EINTR) when the
      * wait was aborted by a signal before the lock was granted.
 518  */
 519 static inline struct rw_semaphore __sched *
 520 __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
 521 {
              /* Start from -RWSEM_READER_BIAS: the bias already in count is backed out below. */
 522         long count, adjustment = -RWSEM_READER_BIAS;
 523         struct rwsem_waiter waiter;
 524         DEFINE_WAKE_Q(wake_q);
 525
 526         waiter.task = current;
 527         waiter.type = RWSEM_WAITING_FOR_READ;
 528
 529         raw_spin_lock_irq(&sem->wait_lock);
 530         if (list_empty(&sem->wait_list)) {
 531                 /*
 532                  * In case the wait queue is empty and the lock isn't owned
 533                  * by a writer, this reader can exit the slowpath and return
 534                  * immediately as its RWSEM_READER_BIAS has already been
 535                  * set in the count.
 536                  */
 537                 if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
 538                         raw_spin_unlock_irq(&sem->wait_lock);
 539                         rwsem_set_reader_owned(sem);
 540                         lockevent_inc(rwsem_rlock_fast);
 541                         return sem;
 542                 }
                      /* We will be the first waiter, so the waiters bit gets set too. */
 543                 adjustment += RWSEM_FLAG_WAITERS;
 544         }
 545         list_add_tail(&waiter.list, &sem->wait_list);
 546
 547         /* we're now waiting on the lock, but no longer actively locking */
 548         count = atomic_long_add_return(adjustment, &sem->count);
 549
 550         /*
 551          * If there are no active locks, wake the front queued process(es).
 552          *
 553          * If there are no writers and we are first in the queue,
 554          * wake our own waiter to join the existing active readers !
 555          */
 556         if (!(count & RWSEM_LOCK_MASK) ||
 557            (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS)))
 558                 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 559
 560         raw_spin_unlock_irq(&sem->wait_lock);
 561         wake_up_q(&wake_q);
 562
 563         /* wait to be given the lock */
 564         while (true) {
 565                 set_current_state(state);
                      /* __rwsem_mark_wake() stores NULL in waiter.task when it grants the lock */
 566                 if (!waiter.task)
 567                         break;
 568                 if (signal_pending_state(state, current)) {
                              /*
                               * Re-check under wait_lock: if waiter.task is NULL by
                               * now the lock was granted after all, so keep it
                               * instead of bailing out.
                               */
 569                         raw_spin_lock_irq(&sem->wait_lock);
 570                         if (waiter.task)
 571                                 goto out_nolock;
 572                         raw_spin_unlock_irq(&sem->wait_lock);
 573                         break;
 574                 }
 575                 schedule();
 576                 lockevent_inc(rwsem_sleep_reader);
 577         }
 578
 579         __set_current_state(TASK_RUNNING);
 580         lockevent_inc(rwsem_rlock);
 581         return sem;
 582 out_nolock:
              /* Reached with wait_lock held: dequeue, and clear the waiters bit if the list is now empty. */
 583         list_del(&waiter.list);
 584         if (list_empty(&sem->wait_list))
 585                 atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
 586         raw_spin_unlock_irq(&sem->wait_lock);
 587         __set_current_state(TASK_RUNNING);
 588         lockevent_inc(rwsem_rlock_fail);
 589         return ERR_PTR(-EINTR);
 590 }
 591
     /*
      * Read-lock slowpath that waits in TASK_UNINTERRUPTIBLE; forwards to
      * __rwsem_down_read_failed_common() and returns its result.
      */
 592 __visible struct rw_semaphore * __sched
 593 rwsem_down_read_failed(struct rw_semaphore *sem)
 594 {
 595         return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
 596 }
 597 EXPORT_SYMBOL(rwsem_down_read_failed);
 598
     /*
      * Read-lock slowpath that waits in TASK_KILLABLE; forwards to
      * __rwsem_down_read_failed_common(), whose ERR_PTR(-EINTR) result is
      * returned when the wait is aborted by a signal.
      */
 599 __visible struct rw_semaphore * __sched
 600 rwsem_down_read_failed_killable(struct rw_semaphore *sem)
 601 {
 602         return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
 603 }
 604 EXPORT_SYMBOL(rwsem_down_read_failed_killable);
 605
 606 /*
 607  * Wait until we successfully acquire the write lock
      *
      * @sem:   semaphore to write-lock
      * @state: task state to wait in; also passed to signal_pending_state()
      *         to decide whether a pending signal aborts the wait
      *
      * Returns @sem once the write lock is held (either through optimistic
      * spinning or after queueing), or ERR_PTR(-EINTR) when the wait was
      * aborted by a signal.
 608  */
 609 static inline struct rw_semaphore *
 610 __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 611 {
 612         long count;
 613         bool waiting = true; /* any queued threads before us */
 614         struct rwsem_waiter waiter;
 615         struct rw_semaphore *ret = sem;
 616         DEFINE_WAKE_Q(wake_q);
 617
 618         /* do optimistic spinning and steal lock if possible */
 619         if (rwsem_optimistic_spin(sem))
 620                 return sem;
 621
 622         /*
 623          * Optimistic spinning failed, proceed to the slowpath
 624          * and block until we can acquire the sem.
 625          */
 626         waiter.task = current;
 627         waiter.type = RWSEM_WAITING_FOR_WRITE;
 628
 629         raw_spin_lock_irq(&sem->wait_lock);
 630
 631         /* account for this before adding a new element to the list */
 632         if (list_empty(&sem->wait_list))
 633                 waiting = false;
 634
 635         list_add_tail(&waiter.list, &sem->wait_list);
 636
 637         /* we're now waiting on the lock */
 638         if (waiting) {
 639                 count = atomic_long_read(&sem->count);
 640
 641                 /*
 642                  * If there were already threads queued before us and there are
 643                  * no active writers and some readers, the lock must be read
 644                  * owned; so we try to wake any read locks that were queued ahead
 645                  * of us.
 646                  */
 647                 if (!(count & RWSEM_WRITER_MASK) &&
 648                      (count & RWSEM_READER_MASK)) {
 649                         __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
 650                         /*
 651                          * The wakeup is normally called _after_ the wait_lock
 652                          * is released, but given that we are proactively waking
 653                          * readers we can deal with the wake_q overhead as it is
 654                          * similar to releasing and taking the wait_lock again
 655                          * for attempting rwsem_try_write_lock().
 656                          */
 657                         wake_up_q(&wake_q);
 658
 659                         /*
 660                          * Reinitialize wake_q after use.
 661                          */
 662                         wake_q_init(&wake_q);
 663                 }
 664
 665         } else {
                      /* We are the first waiter: publish that via the waiters bit. */
 666                 count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count);
 667         }
 668
 669         /* wait until we successfully acquire the lock */
 670         set_current_state(state);
 671         while (true) {
                      /* attempted with wait_lock held, as rwsem_try_write_lock() requires */
 672                 if (rwsem_try_write_lock(count, sem))
 673                         break;
 674                 raw_spin_unlock_irq(&sem->wait_lock);
 675
 676                 /* Block until there are no active lockers. */
 677                 do {
 678                         if (signal_pending_state(state, current))
 679                                 goto out_nolock;
 680
 681                         schedule();
 682                         lockevent_inc(rwsem_sleep_writer);
 683                         set_current_state(state);
 684                         count = atomic_long_read(&sem->count);
 685                 } while (count & RWSEM_LOCK_MASK);
 686
 687                 raw_spin_lock_irq(&sem->wait_lock);
 688         }
 689         __set_current_state(TASK_RUNNING);
 690         list_del(&waiter.list);
 691         raw_spin_unlock_irq(&sem->wait_lock);
 692         lockevent_inc(rwsem_wlock);
 693
 694         return ret;
 695
 696 out_nolock:
              /*
               * Reached without wait_lock held. Dequeue ourselves; clear the
               * waiters bit if nobody is left, otherwise let
               * __rwsem_mark_wake() pick whoever is now at the head.
               */
 697         __set_current_state(TASK_RUNNING);
 698         raw_spin_lock_irq(&sem->wait_lock);
 699         list_del(&waiter.list);
 700         if (list_empty(&sem->wait_list))
 701                 atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
 702         else
 703                 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 704         raw_spin_unlock_irq(&sem->wait_lock);
 705         wake_up_q(&wake_q);
 706         lockevent_inc(rwsem_wlock_fail);
 707
 708         return ERR_PTR(-EINTR);
 709 }
 710
     /*
      * Write-lock slowpath that waits in TASK_UNINTERRUPTIBLE; forwards to
      * __rwsem_down_write_failed_common() and returns its result.
      */
 711 __visible struct rw_semaphore * __sched
 712 rwsem_down_write_failed(struct rw_semaphore *sem)
 713 {
 714         return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
 715 }
 716 EXPORT_SYMBOL(rwsem_down_write_failed);
 717
     /*
      * Write-lock slowpath that waits in TASK_KILLABLE; forwards to
      * __rwsem_down_write_failed_common(), whose ERR_PTR(-EINTR) result is
      * returned when the wait is aborted by a signal.
      */
 718 __visible struct rw_semaphore * __sched
 719 rwsem_down_write_failed_killable(struct rw_semaphore *sem)
 720 {
 721         return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
 722 }
 723 EXPORT_SYMBOL(rwsem_down_write_failed_killable);
 724
 725 /*
 726  * handle waking up a waiter on the semaphore
 727  * - up_read/up_write has decremented the active part of count if we come here
 728  */
 729 __visible
 730 struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 731 {
 732         unsigned long flags;
 733         DEFINE_WAKE_Q(wake_q);
 734
 735         raw_spin_lock_irqsave(&sem->wait_lock, flags);
 736
 737         if (!list_empty(&sem->wait_list))
 738                 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 739
 740         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 741         wake_up_q(&wake_q);
 742
 743         return sem;
 744 }
 745 EXPORT_SYMBOL(rwsem_wake);
 746
 747 /*
 748  * downgrade a write lock into a read lock
 749  * - caller incremented waiting part of count and discovered it still negative
 750  * - just wake up any readers at the front of the queue
 751  */
 752 __visible
 753 struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 754 {
 755         unsigned long flags;
 756         DEFINE_WAKE_Q(wake_q);
 757
 758         raw_spin_lock_irqsave(&sem->wait_lock, flags);
 759
 760         if (!list_empty(&sem->wait_list))
 761                 __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
 762
 763         raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 764         wake_up_q(&wake_q);
 765
 766         return sem;
 767 }
 768 EXPORT_SYMBOL(rwsem_downgrade_wake);
 769
 770 /*
 771  * lock for reading
 772  */
 773 inline void __down_read(struct rw_semaphore *sem)
 774 {
 775         if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
 776                         &sem->count) & RWSEM_READ_FAILED_MASK)) {
 777                 rwsem_down_read_failed(sem);
 778                 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
 779                                         RWSEM_READER_OWNED), sem);
 780         } else {
 781                 rwsem_set_reader_owned(sem);
 782         }
 783 }
 784
 785 static inline int __down_read_killable(struct rw_semaphore *sem)
 786 {
 787         if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
 788                         &sem->count) & RWSEM_READ_FAILED_MASK)) {
 789                 if (IS_ERR(rwsem_down_read_failed_killable(sem)))
 790                         return -EINTR;
 791                 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
 792                                         RWSEM_READER_OWNED), sem);
 793         } else {
 794                 rwsem_set_reader_owned(sem);
 795         }
 796         return 0;
 797 }
 798
 799 static inline int __down_read_trylock(struct rw_semaphore *sem)
 800 {
 801         /*
 802          * Optimize for the case when the rwsem is not locked at all.
 803          */
 804         long tmp = RWSEM_UNLOCKED_VALUE;
 805
 806         lockevent_inc(rwsem_rtrylock);
 807         do {
 808                 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
 809                                         tmp + RWSEM_READER_BIAS)) {
 810                         rwsem_set_reader_owned(sem);
 811                         return 1;
 812                 }
 813         } while (!(tmp & RWSEM_READ_FAILED_MASK));
 814         return 0;
 815 }
 816
 817 /*
 818  * lock for writing
 819  */
 820 static inline void __down_write(struct rw_semaphore *sem)
 821 {
 822         if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0,
 823                                                  RWSEM_WRITER_LOCKED)))
 824                 rwsem_down_write_failed(sem);
 825         rwsem_set_owner(sem);
 826 }
 827
 828 static inline int __down_write_killable(struct rw_semaphore *sem)
 829 {
 830         if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0,
 831                                                  RWSEM_WRITER_LOCKED)))
 832                 if (IS_ERR(rwsem_down_write_failed_killable(sem)))
 833                         return -EINTR;
 834         rwsem_set_owner(sem);
 835         return 0;
 836 }
 837
 838 static inline int __down_write_trylock(struct rw_semaphore *sem)
 839 {
 840         long tmp;
 841
 842         lockevent_inc(rwsem_wtrylock);
 843         tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
 844                                           RWSEM_WRITER_LOCKED);
 845         if (tmp == RWSEM_UNLOCKED_VALUE) {
 846                 rwsem_set_owner(sem);
 847                 return true;
 848         }
 849         return false;
 850 }
 851
 852 /*
 853  * unlock after reading
 854  */
 855 inline void __up_read(struct rw_semaphore *sem)
 856 {
 857         long tmp;
 858
 859         DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
 860                                 sem);
 861         rwsem_clear_reader_owned(sem);
 862         tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
 863         if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS))
 864                         == RWSEM_FLAG_WAITERS))
 865                 rwsem_wake(sem);
 866 }
 867
 868 /*
 869  * unlock after writing
 870  */
 871 static inline void __up_write(struct rw_semaphore *sem)
 872 {
 873         DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
 874         rwsem_clear_owner(sem);
 875         if (unlikely(atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED,
 876                         &sem->count) & RWSEM_FLAG_WAITERS))
 877                 rwsem_wake(sem);
 878 }
 879
 880 /*
 881  * downgrade write lock to read lock
 882  */
 883 static inline void __downgrade_write(struct rw_semaphore *sem)
 884 {
 885         long tmp;
 886
 887         /*
 888          * When downgrading from exclusive to shared ownership,
 889          * anything inside the write-locked region cannot leak
 890          * into the read side. In contrast, anything in the
 891          * read-locked region is ok to be re-ordered into the
 892          * write side. As such, rely on RELEASE semantics.
 893          */
 894         DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
 895         tmp = atomic_long_fetch_add_release(
 896                 -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
 897         rwsem_set_reader_owned(sem);
 898         if (tmp & RWSEM_FLAG_WAITERS)
 899                 rwsem_downgrade_wake(sem);
 900 }
 901
 902 /*
 903  * lock for reading
 904  */
 905 void __sched down_read(struct rw_semaphore *sem)
 906 {
 907         might_sleep();
 908         rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 909
 910         LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
 911 }
 912
 913 EXPORT_SYMBOL(down_read);
 914
 915 int __sched down_read_killable(struct rw_semaphore *sem)
 916 {
 917         might_sleep();
 918         rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
 919
 920         if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
 921                 rwsem_release(&sem->dep_map, 1, _RET_IP_);
 922                 return -EINTR;
 923         }
 924
 925         return 0;
 926 }
 927
 928 EXPORT_SYMBOL(down_read_killable);
 929
 930 /*
 931  * trylock for reading -- returns 1 if successful, 0 if contention
 932  */
 933 int down_read_trylock(struct rw_semaphore *sem)
 934 {
 935         int ret = __down_read_trylock(sem);
 936
 937         if (ret == 1)
 938                 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
 939         return ret;
 940 }
 941
 942 EXPORT_SYMBOL(down_read_trylock);
 943
 944 /*
 945  * lock for writing
 946  */
 947 void __sched down_write(struct rw_semaphore *sem)
 948 {
 949         might_sleep();
 950         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
 951
 952         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
 953 }
 954
 955 EXPORT_SYMBOL(down_write);
 956
 957 /*
 958  * lock for writing
 959  */
 960 int __sched down_write_killable(struct rw_semaphore *sem)
 961 {
 962         might_sleep();
 963         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
 964
 965         if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
 966                 rwsem_release(&sem->dep_map, 1, _RET_IP_);
 967                 return -EINTR;
 968         }
 969
 970         return 0;
 971 }
 972
 973 EXPORT_SYMBOL(down_write_killable);
 974
 975 /*
 976  * trylock for writing -- returns 1 if successful, 0 if contention
 977  */
 978 int down_write_trylock(struct rw_semaphore *sem)
 979 {
 980         int ret = __down_write_trylock(sem);
 981
 982         if (ret == 1)
 983                 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
 984
 985         return ret;
 986 }
 987
 988 EXPORT_SYMBOL(down_write_trylock);
 989
 990 /*
 991  * release a read lock
 992  */
 993 void up_read(struct rw_semaphore *sem)
 994 {
 995         rwsem_release(&sem->dep_map, 1, _RET_IP_);
 996
 997         __up_read(sem);
 998 }
 999
1000 EXPORT_SYMBOL(up_read);
1001
1002 /*
1003  * release a write lock
1004  */
1005 void up_write(struct rw_semaphore *sem)
1006 {
1007         rwsem_release(&sem->dep_map, 1, _RET_IP_);
1008
1009         __up_write(sem);
1010 }
1011
1012 EXPORT_SYMBOL(up_write);
1013
1014 /*
1015  * downgrade write lock to read lock
1016  */
1017 void downgrade_write(struct rw_semaphore *sem)
1018 {
1019         lock_downgrade(&sem->dep_map, _RET_IP_);
1020
1021         __downgrade_write(sem);
1022 }
1023
1024 EXPORT_SYMBOL(downgrade_write);
1025
1026 #ifdef CONFIG_DEBUG_LOCK_ALLOC
1027
1028 void down_read_nested(struct rw_semaphore *sem, int subclass)
1029 {
1030         might_sleep();
1031         rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1032
1033         LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1034 }
1035
1036 EXPORT_SYMBOL(down_read_nested);
1037
1038 void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
1039 {
1040         might_sleep();
1041         rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
1042
1043         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1044 }
1045
1046 EXPORT_SYMBOL(_down_write_nest_lock);
1047
1048 void down_read_non_owner(struct rw_semaphore *sem)
1049 {
1050         might_sleep();
1051
1052         __down_read(sem);
1053         __rwsem_set_reader_owned(sem, NULL);
1054 }
1055
1056 EXPORT_SYMBOL(down_read_non_owner);
1057
1058 void down_write_nested(struct rw_semaphore *sem, int subclass)
1059 {
1060         might_sleep();
1061         rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1062
1063         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1064 }
1065
1066 EXPORT_SYMBOL(down_write_nested);
1067
1068 int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
1069 {
1070         might_sleep();
1071         rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1072
1073         if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
1074                 rwsem_release(&sem->dep_map, 1, _RET_IP_);
1075                 return -EINTR;
1076         }
1077
1078         return 0;
1079 }
1080
1081 EXPORT_SYMBOL(down_write_killable_nested);
1082
1083 void up_read_non_owner(struct rw_semaphore *sem)
1084 {
1085         DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
1086                                 sem);
1087         __up_read(sem);
1088 }
1089
1090 EXPORT_SYMBOL(up_read_non_owner);
1091
1092 #endif