kernel/locking/qspinlock_paravirt.h

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 #ifndef _GEN_PV_LOCK_SLOWPATH
   3 #error "do not include this file"
   4 #endif
   5
   6 #include <linux/hash.h>
   7 #include <linux/bootmem.h>
   8 #include <linux/debug_locks.h>
   9
  10 /*
  11  * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
  12  * of spinning them.
  13  *
  14  * This relies on the architecture to provide two paravirt hypercalls:
  15  *
  16  *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
  17  *   pv_kick(cpu)             -- wakes a suspended vcpu
  18  *
  19  * Using these we implement __pv_queued_spin_lock_slowpath() and
  20  * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
  21  * native_queued_spin_unlock().
  22  */
  23
  24 #define _Q_SLOW_VAL     (3U << _Q_LOCKED_OFFSET)
  25
  26 /*
  27  * Queue Node Adaptive Spinning
  28  *
  29  * A queue node vCPU will stop spinning if the vCPU in the previous node is
  30  * not running. The one lock stealing attempt allowed at slowpath entry
  31  * mitigates the slight slowdown for non-overcommitted guest with this
  32  * aggressive wait-early mechanism.
  33  *
  34  * The status of the previous node will be checked at fixed interval
  35  * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
  36  * pound on the cacheline of the previous node too heavily.
  37  */
  38 #define PV_PREV_CHECK_MASK      0xff
  39
  40 /*
  41  * Queue node uses: vcpu_running & vcpu_halted.
  42  * Queue head uses: vcpu_running & vcpu_hashed.
  43  */
  44 enum vcpu_state {
  45         vcpu_running = 0,
  46         vcpu_halted,            /* Used only in pv_wait_node */
  47         vcpu_hashed,            /* = pv_hash'ed + vcpu_halted */
  48 };
  49
  50 struct pv_node {
  51         struct mcs_spinlock     mcs;
  52         struct mcs_spinlock     __res[3];
  53
  54         int                     cpu;
  55         u8                      state;
  56 };
  57
  58 /*
  59  * Include queued spinlock statistics code
  60  */
  61 #include "qspinlock_stat.h"
  62
  63 /*
  64  * By replacing the regular queued_spin_trylock() with the function below,
  65  * it will be called once when a lock waiter enter the PV slowpath before
  66  * being queued. By allowing one lock stealing attempt here when the pending
  67  * bit is off, it helps to reduce the performance impact of lock waiter
  68  * preemption without the drawback of lock starvation.
  69  */
  70 #define queued_spin_trylock(l)  pv_queued_spin_steal_lock(l)
  71 static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
  72 {
  73         struct __qspinlock *l = (void *)lock;
  74
  75         if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
  76             (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
  77                 qstat_inc(qstat_pv_lock_stealing, true);
  78                 return true;
  79         }
  80
  81         return false;
  82 }
  83
  84 /*
  85  * The pending bit is used by the queue head vCPU to indicate that it
  86  * is actively spinning on the lock and no lock stealing is allowed.
  87  */
  88 #if _Q_PENDING_BITS == 8
  89 static __always_inline void set_pending(struct qspinlock *lock)
  90 {
  91         struct __qspinlock *l = (void *)lock;
  92
  93         WRITE_ONCE(l->pending, 1);
  94 }
  95
  96 static __always_inline void clear_pending(struct qspinlock *lock)
  97 {
  98         struct __qspinlock *l = (void *)lock;
  99
 100         WRITE_ONCE(l->pending, 0);
 101 }
 102
 103 /*
 104  * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
 105  * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
 106  * lock just to be sure that it will get it.
 107  */
 108 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 109 {
 110         struct __qspinlock *l = (void *)lock;
 111
 112         return !READ_ONCE(l->locked) &&
 113                (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
 114                                 _Q_LOCKED_VAL) == _Q_PENDING_VAL);
 115 }
 116 #else /* _Q_PENDING_BITS == 8 */
 117 static __always_inline void set_pending(struct qspinlock *lock)
 118 {
 119         atomic_or(_Q_PENDING_VAL, &lock->val);
 120 }
 121
 122 static __always_inline void clear_pending(struct qspinlock *lock)
 123 {
 124         atomic_andnot(_Q_PENDING_VAL, &lock->val);
 125 }
 126
 127 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 128 {
 129         int val = atomic_read(&lock->val);
 130
 131         for (;;) {
 132                 int old, new;
 133
 134                 if (val  & _Q_LOCKED_MASK)
 135                         break;
 136
 137                 /*
 138                  * Try to clear pending bit & set locked bit
 139                  */
 140                 old = val;
 141                 new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
 142                 val = atomic_cmpxchg_acquire(&lock->val, old, new);
 143
 144                 if (val == old)
 145                         return 1;
 146         }
 147         return 0;
 148 }
 149 #endif /* _Q_PENDING_BITS == 8 */
 150
 151 /*
 152  * Lock and MCS node addresses hash table for fast lookup
 153  *
 154  * Hashing is done on a per-cacheline basis to minimize the need to access
 155  * more than one cacheline.
 156  *
 157  * Dynamically allocate a hash table big enough to hold at least 4X the
 158  * number of possible cpus in the system. Allocation is done on page
 159  * granularity. So the minimum number of hash buckets should be at least
 160  * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
 161  *
 162  * Since we should not be holding locks from NMI context (very rare indeed) the
 163  * max load factor is 0.75, which is around the point where open addressing
 164  * breaks down.
 165  *
 166  */
 167 struct pv_hash_entry {
 168         struct qspinlock *lock;
 169         struct pv_node   *node;
 170 };
 171
 172 #define PV_HE_PER_LINE  (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
 173 #define PV_HE_MIN       (PAGE_SIZE / sizeof(struct pv_hash_entry))
 174
 175 static struct pv_hash_entry *pv_lock_hash;
 176 static unsigned int pv_lock_hash_bits __read_mostly;
 177
 178 /*
 179  * Allocate memory for the PV qspinlock hash buckets
 180  *
 181  * This function should be called from the paravirt spinlock initialization
 182  * routine.
 183  */
 184 void __init __pv_init_lock_hash(void)
 185 {
 186         int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
 187
 188         if (pv_hash_size < PV_HE_MIN)
 189                 pv_hash_size = PV_HE_MIN;
 190
 191         /*
 192          * Allocate space from bootmem which should be page-size aligned
 193          * and hence cacheline aligned.
 194          */
 195         pv_lock_hash = alloc_large_system_hash("PV qspinlock",
 196                                                sizeof(struct pv_hash_entry),
 197                                                pv_hash_size, 0,
 198                                                HASH_EARLY | HASH_ZERO,
 199                                                &pv_lock_hash_bits, NULL,
 200                                                pv_hash_size, pv_hash_size);
 201 }
 202
 203 #define for_each_hash_entry(he, offset, hash)                                           \
 204         for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;       \
 205              offset < (1 << pv_lock_hash_bits);                                         \
 206              offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
 207
 208 static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 209 {
 210         unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
 211         struct pv_hash_entry *he;
 212         int hopcnt = 0;
 213
 214         for_each_hash_entry(he, offset, hash) {
 215                 hopcnt++;
 216                 if (!cmpxchg(&he->lock, NULL, lock)) {
 217                         WRITE_ONCE(he->node, node);
 218                         qstat_hop(hopcnt);
 219                         return &he->lock;
 220                 }
 221         }
 222         /*
 223          * Hard assume there is a free entry for us.
 224          *
 225          * This is guaranteed by ensuring every blocked lock only ever consumes
 226          * a single entry, and since we only have 4 nesting levels per CPU
 227          * and allocated 4*nr_possible_cpus(), this must be so.
 228          *
 229          * The single entry is guaranteed by having the lock owner unhash
 230          * before it releases.
 231          */
 232         BUG();
 233 }
 234
 235 static struct pv_node *pv_unhash(struct qspinlock *lock)
 236 {
 237         unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
 238         struct pv_hash_entry *he;
 239         struct pv_node *node;
 240
 241         for_each_hash_entry(he, offset, hash) {
 242                 if (READ_ONCE(he->lock) == lock) {
 243                         node = READ_ONCE(he->node);
 244                         WRITE_ONCE(he->lock, NULL);
 245                         return node;
 246                 }
 247         }
 248         /*
 249          * Hard assume we'll find an entry.
 250          *
 251          * This guarantees a limited lookup time and is itself guaranteed by
 252          * having the lock owner do the unhash -- IFF the unlock sees the
 253          * SLOW flag, there MUST be a hash entry.
 254          */
 255         BUG();
 256 }
 257
 258 /*
 259  * Return true if when it is time to check the previous node which is not
 260  * in a running state.
 261  */
 262 static inline bool
 263 pv_wait_early(struct pv_node *prev, int loop)
 264 {
 265         if ((loop & PV_PREV_CHECK_MASK) != 0)
 266                 return false;
 267
 268         return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
 269 }
 270
 271 /*
 272  * Initialize the PV part of the mcs_spinlock node.
 273  */
 274 static void pv_init_node(struct mcs_spinlock *node)
 275 {
 276         struct pv_node *pn = (struct pv_node *)node;
 277
 278         BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
 279
 280         pn->cpu = smp_processor_id();
 281         pn->state = vcpu_running;
 282 }
 283
 284 /*
 285  * Wait for node->locked to become true, halt the vcpu after a short spin.
 286  * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
 287  * behalf.
 288  */
 289 static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 290 {
 291         struct pv_node *pn = (struct pv_node *)node;
 292         struct pv_node *pp = (struct pv_node *)prev;
 293         int loop;
 294         bool wait_early;
 295
 296         for (;;) {
 297                 for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
 298                         if (READ_ONCE(node->locked))
 299                                 return;
 300                         if (pv_wait_early(pp, loop)) {
 301                                 wait_early = true;
 302                                 break;
 303                         }
 304                         cpu_relax();
 305                 }
 306
 307                 /*
 308                  * Order pn->state vs pn->locked thusly:
 309                  *
 310                  * [S] pn->state = vcpu_halted    [S] next->locked = 1
 311                  *     MB                             MB
 312                  * [L] pn->locked               [RmW] pn->state = vcpu_hashed
 313                  *
 314                  * Matches the cmpxchg() from pv_kick_node().
 315                  */
 316                 smp_store_mb(pn->state, vcpu_halted);
 317
 318                 if (!READ_ONCE(node->locked)) {
 319                         qstat_inc(qstat_pv_wait_node, true);
 320                         qstat_inc(qstat_pv_wait_early, wait_early);
 321                         pv_wait(&pn->state, vcpu_halted);
 322                 }
 323
 324                 /*
 325                  * If pv_kick_node() changed us to vcpu_hashed, retain that
 326                  * value so that pv_wait_head_or_lock() knows to not also try
 327                  * to hash this lock.
 328                  */
 329                 cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 330
 331                 /*
 332                  * If the locked flag is still not set after wakeup, it is a
 333                  * spurious wakeup and the vCPU should wait again. However,
 334                  * there is a pretty high overhead for CPU halting and kicking.
 335                  * So it is better to spin for a while in the hope that the
 336                  * MCS lock will be released soon.
 337                  */
 338                 qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
 339         }
 340
 341         /*
 342          * By now our node->locked should be 1 and our caller will not actually
 343          * spin-wait for it. We do however rely on our caller to do a
 344          * load-acquire for us.
 345          */
 346 }
 347
 348 /*
 349  * Called after setting next->locked = 1 when we're the lock owner.
 350  *
 351  * Instead of waking the waiters stuck in pv_wait_node() advance their state
 352  * such that they're waiting in pv_wait_head_or_lock(), this avoids a
 353  * wake/sleep cycle.
 354  */
 355 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 356 {
 357         struct pv_node *pn = (struct pv_node *)node;
 358         struct __qspinlock *l = (void *)lock;
 359
 360         /*
 361          * If the vCPU is indeed halted, advance its state to match that of
 362          * pv_wait_node(). If OTOH this fails, the vCPU was running and will
 363          * observe its next->locked value and advance itself.
 364          *
 365          * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
 366          *
 367          * The write to next->locked in arch_mcs_spin_unlock_contended()
 368          * must be ordered before the read of pn->state in the cmpxchg()
 369          * below for the code to work correctly. To guarantee full ordering
 370          * irrespective of the success or failure of the cmpxchg(),
 371          * a relaxed version with explicit barrier is used. The control
 372          * dependency will order the reading of pn->state before any
 373          * subsequent writes.
 374          */
 375         smp_mb__before_atomic();
 376         if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
 377             != vcpu_halted)
 378                 return;
 379
 380         /*
 381          * Put the lock into the hash table and set the _Q_SLOW_VAL.
 382          *
 383          * As this is the same vCPU that will check the _Q_SLOW_VAL value and
 384          * the hash table later on at unlock time, no atomic instruction is
 385          * needed.
 386          */
 387         WRITE_ONCE(l->locked, _Q_SLOW_VAL);
 388         (void)pv_hash(lock, pn);
 389 }
 390
 391 /*
 392  * Wait for l->locked to become clear and acquire the lock;
 393  * halt the vcpu after a short spin.
 394  * __pv_queued_spin_unlock() will wake us.
 395  *
 396  * The current value of the lock will be returned for additional processing.
 397  */
 398 static u32
 399 pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 400 {
 401         struct pv_node *pn = (struct pv_node *)node;
 402         struct __qspinlock *l = (void *)lock;
 403         struct qspinlock **lp = NULL;
 404         int waitcnt = 0;
 405         int loop;
 406
 407         /*
 408          * If pv_kick_node() already advanced our state, we don't need to
 409          * insert ourselves into the hash table anymore.
 410          */
 411         if (READ_ONCE(pn->state) == vcpu_hashed)
 412                 lp = (struct qspinlock **)1;
 413
 414         /*
 415          * Tracking # of slowpath locking operations
 416          */
 417         qstat_inc(qstat_pv_lock_slowpath, true);
 418
 419         for (;; waitcnt++) {
 420                 /*
 421                  * Set correct vCPU state to be used by queue node wait-early
 422                  * mechanism.
 423                  */
 424                 WRITE_ONCE(pn->state, vcpu_running);
 425
 426                 /*
 427                  * Set the pending bit in the active lock spinning loop to
 428                  * disable lock stealing before attempting to acquire the lock.
 429                  */
 430                 set_pending(lock);
 431                 for (loop = SPIN_THRESHOLD; loop; loop--) {
 432                         if (trylock_clear_pending(lock))
 433                                 goto gotlock;
 434                         cpu_relax();
 435                 }
 436                 clear_pending(lock);
 437
 438
 439                 if (!lp) { /* ONCE */
 440                         lp = pv_hash(lock, pn);
 441
 442                         /*
 443                          * We must hash before setting _Q_SLOW_VAL, such that
 444                          * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
 445                          * we'll be sure to be able to observe our hash entry.
 446                          *
 447                          *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
 448                          *       MB                           RMB
 449                          * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
 450                          *
 451                          * Matches the smp_rmb() in __pv_queued_spin_unlock().
 452                          */
 453                         if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
 454                                 /*
 455                                  * The lock was free and now we own the lock.
 456                                  * Change the lock value back to _Q_LOCKED_VAL
 457                                  * and unhash the table.
 458                                  */
 459                                 WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
 460                                 WRITE_ONCE(*lp, NULL);
 461                                 goto gotlock;
 462                         }
 463                 }
 464                 WRITE_ONCE(pn->state, vcpu_hashed);
 465                 qstat_inc(qstat_pv_wait_head, true);
 466                 qstat_inc(qstat_pv_wait_again, waitcnt);
 467                 pv_wait(&l->locked, _Q_SLOW_VAL);
 468
 469                 /*
 470                  * Because of lock stealing, the queue head vCPU may not be
 471                  * able to acquire the lock before it has to wait again.
 472                  */
 473         }
 474
 475         /*
 476          * The cmpxchg() or xchg() call before coming here provides the
 477          * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
 478          * here is to indicate to the compiler that the value will always
 479          * be nozero to enable better code optimization.
 480          */
 481 gotlock:
 482         return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
 483 }
 484
 485 /*
 486  * PV versions of the unlock fastpath and slowpath functions to be used
 487  * instead of queued_spin_unlock().
 488  */
 489 __visible void
 490 __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 491 {
 492         struct __qspinlock *l = (void *)lock;
 493         struct pv_node *node;
 494
 495         if (unlikely(locked != _Q_SLOW_VAL)) {
 496                 WARN(!debug_locks_silent,
 497                      "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
 498                      (unsigned long)lock, atomic_read(&lock->val));
 499                 return;
 500         }
 501
 502         /*
 503          * A failed cmpxchg doesn't provide any memory-ordering guarantees,
 504          * so we need a barrier to order the read of the node data in
 505          * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
 506          *
 507          * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
 508          */
 509         smp_rmb();
 510
 511         /*
 512          * Since the above failed to release, this must be the SLOW path.
 513          * Therefore start by looking up the blocked node and unhashing it.
 514          */
 515         node = pv_unhash(lock);
 516
 517         /*
 518          * Now that we have a reference to the (likely) blocked pv_node,
 519          * release the lock.
 520          */
 521         smp_store_release(&l->locked, 0);
 522
 523         /*
 524          * At this point the memory pointed at by lock can be freed/reused,
 525          * however we can still use the pv_node to kick the CPU.
 526          * The other vCPU may not really be halted, but kicking an active
 527          * vCPU is harmless other than the additional latency in completing
 528          * the unlock.
 529          */
 530         qstat_inc(qstat_pv_kick_unlock, true);
 531         pv_kick(node->cpu);
 532 }
 533
 534 /*
 535  * Include the architecture specific callee-save thunk of the
 536  * __pv_queued_spin_unlock(). This thunk is put together with
 537  * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
 538  * function close to each other sharing consecutive instruction cachelines.
 539  * Alternatively, architecture specific version of __pv_queued_spin_unlock()
 540  * can be defined.
 541  */
 542 #include <asm/qspinlock_paravirt.h>
 543
 544 #ifndef __pv_queued_spin_unlock
 545 __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 546 {
 547         struct __qspinlock *l = (void *)lock;
 548         u8 locked;
 549
 550         /*
 551          * We must not unlock if SLOW, because in that case we must first
 552          * unhash. Otherwise it would be possible to have multiple @lock
 553          * entries, which would be BAD.
 554          */
 555         locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
 556         if (likely(locked == _Q_LOCKED_VAL))
 557                 return;
 558
 559         __pv_queued_spin_unlock_slowpath(lock, locked);
 560 }
 561 #endif /* __pv_queued_spin_unlock */