locking/pvqspinlock: Queue node adaptive spinning

author Waiman Long <Waiman.Long@hpe.com>

Tue, 10 Nov 2015 00:09:27 +0000 (19:09 -0500)

committer Ingo Molnar <mingo@kernel.org>

Fri, 4 Dec 2015 10:39:51 +0000 (11:39 +0100)
author Waiman Long <Waiman.Long@hpe.com>
Tue, 10 Nov 2015 00:09:27 +0000 (19:09 -0500)
committer Ingo Molnar <mingo@kernel.org>
Fri, 4 Dec 2015 10:39:51 +0000 (11:39 +0100)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c

index 2ea42999d2d80185caa9056ad7ca2358cc52d2d2..393d1874b9e0c81d44ef9d0447ded22a03bccd81 100644 (file)
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
   */
  
  static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+                                          struct mcs_spinlock *prev) { }
  static __always_inline void __pv_kick_node(struct qspinlock *lock,
                                            struct mcs_spinlock *node) { }
  static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
@@ -407,7 +408,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
                 prev = decode_tail(old);
                 WRITE_ONCE(prev->next, node);
  
-               pv_wait_node(node);
+               pv_wait_node(node, prev);
                 arch_mcs_spin_lock_contended(&node->locked);
  
                 /*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h

index ace60a451b4f049a936ddace2dfc77fa53ba40b2..87bb235c3448054d63923d0f9549cbc7718a49ca 100644 (file)
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -22,6 +22,20 @@
  
  #define _Q_SLOW_VAL    (3U << _Q_LOCKED_OFFSET)
  
+/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guests with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at a fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK     0xff
+
  /*
   * Queue node uses: vcpu_running & vcpu_halted.
   * Queue head uses: vcpu_running & vcpu_hashed.
@@ -234,6 +248,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
         BUG();
  }
  
+/*
+ * Return true only when loop has no PV_PREV_CHECK_MASK bits set (time to
+ * check the previous node) and that node's state is not vcpu_running.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+
+       if ((loop & PV_PREV_CHECK_MASK) != 0)
+               return false;
+
+       return READ_ONCE(prev->state) != vcpu_running;
+}
+
  /*
   * Initialize the PV part of the mcs_spinlock node.
   */
@@ -252,17 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
   * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
   * behalf.
   */
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
  {
         struct pv_node *pn = (struct pv_node *)node;
+       struct pv_node *pp = (struct pv_node *)prev;
         int waitcnt = 0;
         int loop;
+       bool wait_early;
  
         /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
         for (;; waitcnt++) {
-               for (loop = SPIN_THRESHOLD; loop; loop--) {
+               for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
                         if (READ_ONCE(node->locked))
                                 return;
+                       if (pv_wait_early(pp, loop)) {
+                               wait_early = true;
+                               break;
+                       }
                         cpu_relax();
                 }
  
@@ -280,6 +314,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 if (!READ_ONCE(node->locked)) {
                         qstat_inc(qstat_pv_wait_node, true);
                         qstat_inc(qstat_pv_wait_again, waitcnt);
+                       qstat_inc(qstat_pv_wait_early, wait_early);
                         pv_wait(&pn->state, vcpu_halted);
                 }
  
@@ -364,6 +399,12 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
                 lp = (struct qspinlock **)1;
  
         for (;; waitcnt++) {
+               /*
+                * Set correct vCPU state to be used by queue node wait-early
+                * mechanism.
+                */
+               WRITE_ONCE(pn->state, vcpu_running);
+
                 /*
                  * Set the pending bit in the active lock spinning loop to
                  * disable lock stealing before attempting to acquire the lock.
@@ -402,6 +443,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
                                 goto gotlock;
                         }
                 }
+               WRITE_ONCE(pn->state, vcpu_halted);
                 qstat_inc(qstat_pv_wait_head, true);
                 qstat_inc(qstat_pv_wait_again, waitcnt);
                 pv_wait(&l->locked, _Q_SLOW_VAL);
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h

index 94d4533fe984b5c81a9e6be4d5679f430760aaeb..640dcecdd1df7a5ac5c8167fbd9e70e71b2e8f21 100644 (file)
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -25,6 +25,7 @@
   *   pv_lock_stealing  - # of lock stealing operations
   *   pv_spurious_wakeup        - # of spurious wakeups
   *   pv_wait_again     - # of vCPU wait's that happened after a vCPU kick
+ *   pv_wait_early     - # of early vCPU wait's
   *   pv_wait_head      - # of vCPU wait's at the queue head
   *   pv_wait_node      - # of vCPU wait's at a non-head queue node
   *
@@ -47,6 +48,7 @@ enum qlock_stats {
         qstat_pv_lock_stealing,
         qstat_pv_spurious_wakeup,
         qstat_pv_wait_again,
+       qstat_pv_wait_early,
         qstat_pv_wait_head,
         qstat_pv_wait_node,
         qstat_num,      /* Total number of statistical counters */
@@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = {
         [qstat_pv_latency_wake]    = "pv_latency_wake",
         [qstat_pv_lock_stealing]   = "pv_lock_stealing",
         [qstat_pv_wait_again]      = "pv_wait_again",
+       [qstat_pv_wait_early]      = "pv_wait_early",
         [qstat_pv_wait_head]       = "pv_wait_head",
         [qstat_pv_wait_node]       = "pv_wait_node",
         [qstat_reset_cnts]         = "reset_counters",
author	Waiman Long <Waiman.Long@hpe.com>
	Tue, 10 Nov 2015 00:09:27 +0000 (19:09 -0500)
committer	Ingo Molnar <mingo@kernel.org>
	Fri, 4 Dec 2015 10:39:51 +0000 (11:39 +0100)
kernel/locking/qspinlock.c		patch \| blob \| history
kernel/locking/qspinlock_paravirt.h		patch \| blob \| history
kernel/locking/qspinlock_stat.h		patch \| blob \| history