drm/i915: Disable semaphore busywaits on saturated systems
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 0a3d94517d0a344543d41d7ab7721be65fe45f3f..ce342f7f7ddbf388ec0d2bc5d7d8eb2d0976fa8a 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
 #include <linux/sched/clock.h>
 #include <linux/sched/signal.h>
 
-#include "i915_drv.h"
 #include "i915_active.h"
+#include "i915_drv.h"
 #include "i915_globals.h"
 #include "i915_reset.h"
+#include "intel_pm.h"
 
 struct execute_cb {
        struct list_head link;
@@ -66,7 +67,7 @@ static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
        if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
                return "signaled";
 
-       return to_request(fence)->timeline->name;
+       return to_request(fence)->gem_context->name ?: "[i915]";
 }
 
 static bool i915_fence_signaled(struct dma_fence *fence)
@@ -100,6 +101,7 @@ static void i915_fence_release(struct dma_fence *fence)
         * caught trying to reuse dead objects.
         */
        i915_sw_fence_fini(&rq->submit);
+       i915_sw_fence_fini(&rq->semaphore);
 
        kmem_cache_free(global.slab_requests, rq);
 }
@@ -167,7 +169,6 @@ static void advance_ring(struct i915_request *request)
                 * is just about to be. Either works, if we miss the last two
                 * noops - they are safe to be replayed on a reset.
                 */
-               GEM_TRACE("marking %s as inactive\n", ring->timeline->name);
                tail = READ_ONCE(request->tail);
                list_del(&ring->active_link);
        } else {
@@ -424,6 +425,26 @@ void __i915_request_submit(struct i915_request *request)
        if (i915_gem_context_is_banned(request->gem_context))
                i915_request_skip(request, -EIO);
 
+       /*
+        * Are we using semaphores when the gpu is already saturated?
+        *
+        * Using semaphores incurs a cost in having the GPU poll a
+        * memory location, busywaiting for it to change. The continual
+        * memory reads can have a noticeable impact on the rest of the
+        * system with the extra bus traffic, stalling the cpu as it too
+        * tries to access memory across the bus (perf stat -e bus-cycles).
+        *
+        * If we installed a semaphore on this request and we only submit
+        * the request after the signaler completed, that indicates the
+        * system is overloaded and using semaphores at this time only
+        * increases the amount of work we are doing. If so, we disable
+        * further use of semaphores until we are idle again, whence we
+        * optimistically try again.
+        */
+       if (request->sched.semaphores &&
+           i915_sw_fence_signaled(&request->semaphore))
+               request->hw_context->saturated |= request->sched.semaphores;
+
        /* We may be recursing from the signal callback of another i915 fence */
        spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
 
@@ -552,6 +573,36 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
        return NOTIFY_DONE;
 }
 
+static int __i915_sw_fence_call
+semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+       struct i915_request *request =
+               container_of(fence, typeof(*request), semaphore);
+
+       switch (state) {
+       case FENCE_COMPLETE:
+               /*
+                * We only check a small portion of our dependencies
+                * and so cannot guarantee that there remains no
+                * semaphore chain across all. Instead of opting
+                * for the full NOSEMAPHORE boost, we go for the
+                * smaller (but still preempting) boost of
+                * NEWCLIENT. This will be enough to boost over
+                * a busywaiting request (as that cannot be
+                * NEWCLIENT) without accidentally boosting
+                * a busywait over real work elsewhere.
+                */
+               i915_schedule_bump_priority(request, I915_PRIORITY_NEWCLIENT);
+               break;
+
+       case FENCE_FREE:
+               i915_request_put(request);
+               break;
+       }
+
+       return NOTIFY_DONE;
+}
+
 static void ring_retire_requests(struct intel_ring *ring)
 {
        struct i915_request *rq, *rn;
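semaphore_notify() fires once the signaler's fence completes; at that point the waiter receives the modest NEWCLIENT boost rather than the full NOSEMAPHORE boost: enough to preempt a request that is merely busywaiting (which can never itself be NEWCLIENT), without jumping over genuinely higher-priority work. A toy comparison of that ordering follows; the bit layout and values are invented for illustration and do not match i915's internal I915_PRIORITY_* encoding.

#include <stdio.h>

/* Invented encoding: user priority in the high bits, internal boosts low. */
#define BOOST_NEWCLIENT (1 << 0)

static int effective_priority(int user_prio, int boosts)
{
        return (user_prio << 4) | boosts;
}

int main(void)
{
        int busywaiter = effective_priority(0, 0);               /* never NEWCLIENT */
        int signaled   = effective_priority(0, BOOST_NEWCLIENT); /* bumped by semaphore_notify() */
        int real_work  = effective_priority(1, 0);               /* higher user priority */

        printf("signaled waiter preempts busywaiter: %d\n", signaled > busywaiter); /* 1 */
        printf("signaled waiter preempts real work:  %d\n", signaled > real_work);  /* 0 */
        return 0;
}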
@@ -584,11 +635,6 @@ i915_request_alloc_slow(struct intel_context *ce)
        return kmem_cache_alloc(global.slab_requests, GFP_KERNEL);
 }
 
-static int add_timeline_barrier(struct i915_request *rq)
-{
-       return i915_request_await_active_request(rq, &rq->timeline->barrier);
-}
-
 /**
  * i915_request_alloc - allocate a request structure
  *
@@ -707,6 +753,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 
        /* We bump the ref for the fence chain */
        i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
+       i915_sw_fence_init(&i915_request_get(rq)->semaphore, semaphore_notify);
 
        i915_sched_node_init(&rq->sched);
 
@@ -738,10 +785,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
         */
        rq->head = rq->ring->emit;
 
-       ret = add_timeline_barrier(rq);
-       if (ret)
-               goto err_unwind;
-
        ret = engine->request_alloc(rq);
        if (ret)
                goto err_unwind;
@@ -752,7 +795,10 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
        rq->infix = rq->ring->emit; /* end of header; start of user payload */
 
        /* Check that we didn't interrupt ourselves with a new request */
+       lockdep_assert_held(&rq->timeline->mutex);
        GEM_BUG_ON(rq->timeline->seqno != rq->fence.seqno);
+       rq->cookie = lockdep_pin_lock(&rq->timeline->mutex);
+
        return rq;
 
 err_unwind:
@@ -772,6 +818,39 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
        return ERR_PTR(ret);
 }
 
+static int
+i915_request_await_start(struct i915_request *rq, struct i915_request *signal)
+{
+       if (list_is_first(&signal->ring_link, &signal->ring->request_list))
+               return 0;
+
+       signal = list_prev_entry(signal, ring_link);
+       if (i915_timeline_sync_is_later(rq->timeline, &signal->fence))
+               return 0;
+
+       return i915_sw_fence_await_dma_fence(&rq->submit,
+                                            &signal->fence, 0,
+                                            I915_FENCE_GFP);
+}
+
+static intel_engine_mask_t
+already_busywaiting(struct i915_request *rq)
+{
+       /*
+        * Polling a semaphore causes bus traffic, delaying other users of
+        * both the GPU and CPU. We want to limit the impact on others,
+        * while taking advantage of early submission to reduce GPU
+        * latency. Therefore we restrict ourselves to not using more
+        * than one semaphore from each source, and not using a semaphore
+        * if we have detected the engine is saturated (i.e. would not be
+        * submitted early and cause bus traffic reading an already passed
+        * semaphore).
+        *
+        * See the are-we-too-late? check in __i915_request_submit().
+        */
+       return rq->sched.semaphores | rq->hw_context->saturated;
+}
+
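already_busywaiting() is consulted before a new busywait is emitted: a semaphore to a given engine is only worthwhile if this request does not already poll that engine and earlier polls of it (for this context) were not wasted. A compact sketch of that decision, using illustrative names rather than the driver's API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t engine_mask_t;

/*
 * At most one busywait per source engine, and none once that engine has
 * been flagged as saturated for this context; otherwise fall back to an
 * ordinary software fence wait (as emit_semaphore_wait() does below).
 */
static bool may_emit_semaphore(engine_mask_t already_used,
                               engine_mask_t saturated,
                               engine_mask_t engine_bit)
{
        return !((already_used | saturated) & engine_bit);
}

int main(void)
{
        engine_mask_t vcs0 = 1u << 1;

        printf("%d\n", may_emit_semaphore(0, 0, vcs0));    /* 1: emit the busywait */
        printf("%d\n", may_emit_semaphore(vcs0, 0, vcs0)); /* 0: already polling vcs0 */
        printf("%d\n", may_emit_semaphore(0, vcs0, vcs0)); /* 0: vcs0 marked saturated */
        return 0;
}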
 static int
 emit_semaphore_wait(struct i915_request *to,
                    struct i915_request *from,
@@ -784,6 +863,22 @@ emit_semaphore_wait(struct i915_request *to,
        GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
        GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
 
+       /* Just emit the first semaphore we see as request space is limited. */
+       if (already_busywaiting(to) & from->engine->mask)
+               return i915_sw_fence_await_dma_fence(&to->submit,
+                                                    &from->fence, 0,
+                                                    I915_FENCE_GFP);
+
+       err = i915_request_await_start(to, from);
+       if (err < 0)
+               return err;
+
+       err = i915_sw_fence_await_dma_fence(&to->semaphore,
+                                           &from->fence, 0,
+                                           I915_FENCE_GFP);
+       if (err < 0)
+               return err;
+
        /* We need to pin the signaler's HWSP until we are finished reading. */
        err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
        if (err)
@@ -815,7 +910,8 @@ emit_semaphore_wait(struct i915_request *to,
        *cs++ = 0;
 
        intel_ring_advance(to, cs);
-       to->sched.flags |= I915_SCHED_HAS_SEMAPHORE;
+       to->sched.semaphores |= from->engine->mask;
+       to->sched.flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN;
        return 0;
 }
 
@@ -993,6 +1089,60 @@ void i915_request_skip(struct i915_request *rq, int error)
        memset(vaddr + head, 0, rq->postfix - head);
 }
 
+static struct i915_request *
+__i915_request_add_to_timeline(struct i915_request *rq)
+{
+       struct i915_timeline *timeline = rq->timeline;
+       struct i915_request *prev;
+
+       /*
+        * Dependency tracking and request ordering along the timeline
+        * is special cased so that we can eliminate redundant ordering
+        * operations while building the request (we know that the timeline
+        * itself is ordered, and here we guarantee it).
+        *
+        * As we know we will need to emit tracking along the timeline,
+        * we embed the hooks into our request struct -- at the cost of
+        * having to have specialised no-allocation interfaces (which will
+        * be beneficial elsewhere).
+        *
+        * A second benefit to open-coding i915_request_await_request is
+        * that we can apply a slight variant of the rules specialised
+        * for timelines that jump between engines (such as virtual engines).
+        * If we consider the case of a virtual engine, we must emit a dma-fence
+        * to prevent scheduling of the second request until the first is
+        * complete (to maximise our greedy late load balancing), and this
+        * precludes optimising to use semaphore serialisation of a single
+        * timeline across engines.
+        */
+       prev = i915_active_request_raw(&timeline->last_request,
+                                      &rq->i915->drm.struct_mutex);
+       if (prev && !i915_request_completed(prev)) {
+               if (is_power_of_2(prev->engine->mask | rq->engine->mask))
+                       i915_sw_fence_await_sw_fence(&rq->submit,
+                                                    &prev->submit,
+                                                    &rq->submitq);
+               else
+                       __i915_sw_fence_await_dma_fence(&rq->submit,
+                                                       &prev->fence,
+                                                       &rq->dmaq);
+               if (rq->engine->schedule)
+                       __i915_sched_node_add_dependency(&rq->sched,
+                                                        &prev->sched,
+                                                        &rq->dep,
+                                                        0);
+       }
+
+       spin_lock_irq(&timeline->lock);
+       list_add_tail(&rq->link, &timeline->requests);
+       spin_unlock_irq(&timeline->lock);
+
+       GEM_BUG_ON(timeline->seqno != rq->fence.seqno);
+       __i915_active_request_set(&timeline->last_request, rq);
+
+       return prev;
+}
+
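__i915_request_add_to_timeline() picks between two ways of ordering against the previous request on the same timeline: if the union of both engine masks is a power of two, the two requests can only ever run on one and the same engine, so chaining the cheap submit fences suffices; otherwise (a timeline that can jump engines, such as a virtual engine) the new request must wait for the previous one to complete via its dma-fence. A sketch of just that predicate:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t engine_mask_t;

/* Equivalent of is_power_of_2(prev->engine->mask | rq->engine->mask). */
static bool same_single_engine(engine_mask_t prev_mask, engine_mask_t next_mask)
{
        engine_mask_t combined = prev_mask | next_mask;

        return combined != 0 && (combined & (combined - 1)) == 0;
}

int main(void)
{
        printf("%d\n", same_single_engine(1u << 0, 1u << 0)); /* 1: submit-fence chaining is enough */
        printf("%d\n", same_single_engine(1u << 0, 1u << 1)); /* 0: order on the dma-fence instead */
        return 0;
}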
 /*
  * NB: This function is not allowed to fail. Doing so would mean the
  * request is not being tracked for completion but the work itself is
@@ -1010,6 +1160,8 @@ void i915_request_add(struct i915_request *request)
                  engine->name, request->fence.context, request->fence.seqno);
 
        lockdep_assert_held(&request->timeline->mutex);
+       lockdep_unpin_lock(&request->timeline->mutex, request->cookie);
+
        trace_i915_request_add(request);
 
        /*
@@ -1037,37 +1189,11 @@ void i915_request_add(struct i915_request *request)
        GEM_BUG_ON(IS_ERR(cs));
        request->postfix = intel_ring_offset(request, cs);
 
-       /*
-        * Seal the request and mark it as pending execution. Note that
-        * we may inspect this state, without holding any locks, during
-        * hangcheck. Hence we apply the barrier to ensure that we do not
-        * see a more recent value in the hws than we are tracking.
-        */
-
-       prev = i915_active_request_raw(&timeline->last_request,
-                                      &request->i915->drm.struct_mutex);
-       if (prev && !i915_request_completed(prev)) {
-               i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
-                                            &request->submitq);
-               if (engine->schedule)
-                       __i915_sched_node_add_dependency(&request->sched,
-                                                        &prev->sched,
-                                                        &request->dep,
-                                                        0);
-       }
-
-       spin_lock_irq(&timeline->lock);
-       list_add_tail(&request->link, &timeline->requests);
-       spin_unlock_irq(&timeline->lock);
-
-       GEM_BUG_ON(timeline->seqno != request->fence.seqno);
-       __i915_active_request_set(&timeline->last_request, request);
+       prev = __i915_request_add_to_timeline(request);
 
        list_add_tail(&request->ring_link, &ring->request_list);
-       if (list_is_first(&request->ring_link, &ring->request_list)) {
-               GEM_TRACE("marking %s as active\n", ring->timeline->name);
+       if (list_is_first(&request->ring_link, &ring->request_list))
                list_add(&ring->active_link, &request->i915->gt.active_rings);
-       }
        request->i915->gt.active_engines |= request->engine->mask;
        request->emitted_jiffies = jiffies;
 
@@ -1083,6 +1209,7 @@ void i915_request_add(struct i915_request *request)
         * run at the earliest possible convenience.
         */
        local_bh_disable();
+       i915_sw_fence_commit(&request->semaphore);
        rcu_read_lock(); /* RCU serialisation for set-wedged protection */
        if (engine->schedule) {
                struct i915_sched_attr attr = request->gem_context->sched;
@@ -1099,7 +1226,7 @@ void i915_request_add(struct i915_request *request)
                 * far in the distance past over useful work, we keep a history
                 * of any semaphore use along our dependency chain.
                 */
-               if (!(request->sched.flags & I915_SCHED_HAS_SEMAPHORE))
+               if (!(request->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN))
                        attr.priority |= I915_PRIORITY_NOSEMAPHORE;
 
                /*
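The NOSEMAPHORE boost above is only applied when nothing in the request's dependency history used a semaphore; the I915_SCHED_HAS_SEMAPHORE_CHAIN flag is carried along the dependency chain for that purpose. A small model of the propagation follows; the struct is illustrative, not the driver's scheduling node.

#include <stdbool.h>
#include <stdio.h>

struct sched_model {
        bool has_semaphore_chain;  /* some ancestor emitted a busywait */
};

static void add_dependency(struct sched_model *waiter,
                           const struct sched_model *signaler)
{
        /* Inherit the history: one busywaiting ancestor taints the chain. */
        waiter->has_semaphore_chain |= signaler->has_semaphore_chain;
}

int main(void)
{
        struct sched_model a = { .has_semaphore_chain = true };
        struct sched_model b = { .has_semaphore_chain = false };

        add_dependency(&b, &a);
        /* b no longer qualifies for the NOSEMAPHORE priority boost. */
        printf("%d\n", b.has_semaphore_chain); /* 1 */
        return 0;
}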
@@ -1289,7 +1416,9 @@ long i915_request_wait(struct i915_request *rq,
        if (flags & I915_WAIT_PRIORITY) {
                if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
                        gen6_rps_boost(rq);
+               local_bh_disable(); /* suspend tasklets for reprioritisation */
                i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
+               local_bh_enable(); /* kick tasklets en masse */
        }
 
        wait.tsk = current;
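The local_bh_disable()/local_bh_enable() bracket above keeps the submission tasklets from running while the priority bump propagates through the dependency tree, so they are kicked once at the end rather than after every intermediate update. A loose userspace analogy of that defer-then-flush pattern, with invented names and no relation to the kernel API:

#include <stdbool.h>
#include <stdio.h>

static bool defer_kicks;   /* models "bottom halves disabled" */
static bool kick_pending;  /* models a tasklet scheduled meanwhile */

static void kick_scheduler(void)
{
        if (defer_kicks) {
                kick_pending = true;  /* remember it, run it later */
                return;
        }
        printf("scheduler kicked\n");
}

static void bump_priority(const char *what)
{
        printf("bumping %s\n", what);
        kick_scheduler();
}

int main(void)
{
        defer_kicks = true;            /* ~ local_bh_disable() */
        bump_priority("rq");           /* reprioritise the whole tree */
        bump_priority("rq dependency");
        defer_kicks = false;           /* ~ local_bh_enable() */
        if (kick_pending)
                kick_scheduler();      /* one kick for the whole batch */
        return 0;
}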