drivers/gpu/drm/i915/gt/intel_lrc.c

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Ben Widawsky <ben@bwidawsk.net>
  25  *    Michel Thierry <michel.thierry@intel.com>
  26  *    Thomas Daniel <thomas.daniel@intel.com>
  27  *    Oscar Mateo <oscar.mateo@intel.com>
  28  *
  29  */
  30
  31 /**
  32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
  33  *
  34  * Motivation:
  35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
  36  * These expanded contexts enable a number of new abilities, especially
  37  * "Execlists" (also implemented in this file).
  38  *
  39  * One of the main differences with the legacy HW contexts is that logical
  40  * ring contexts incorporate many more things to the context's state, like
  41  * PDPs or ringbuffer control registers:
  42  *
  43  * The reason why PDPs are included in the context is straightforward: as
  44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
  45  * contained there means you don't need to do a ppgtt->switch_mm yourself,
  46  * instead, the GPU will do it for you on the context switch.
  47  *
  48  * But, what about the ringbuffer control registers (head, tail, etc..)?
  49  * shouldn't we just need a set of those per engine command streamer? This is
  50  * where the name "Logical Rings" starts to make sense: by virtualizing the
  51  * rings, the engine cs shifts to a new "ring buffer" with every context
  52  * switch. When you want to submit a workload to the GPU you: A) choose your
  53  * context, B) find its appropriate virtualized ring, C) write commands to it
  54  * and then, finally, D) tell the GPU to switch to that context.
  55  *
  56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
  57  * to a context is via a context execution list, ergo "Execlists".
  58  *
  59  * LRC implementation:
  60  * Regarding the creation of contexts, we have:
  61  *
  62  * - One global default context.
  63  * - One local default context for each opened fd.
  64  * - One local extra context for each context create ioctl call.
  65  *
  66  * Now that ringbuffers belong per-context (and not per-engine, like before)
  67  * and that contexts are uniquely tied to a given engine (and not reusable,
  68  * like before) we need:
  69  *
  70  * - One ringbuffer per-engine inside each context.
  71  * - One backing object per-engine inside each context.
  72  *
  73  * The global default context starts its life with these new objects fully
  74  * allocated and populated. The local default context for each opened fd is
  75  * more complex, because we don't know at creation time which engine is going
  76  * to use them. To handle this, we have implemented a deferred creation of LR
  77  * contexts:
  78  *
  79  * The local context starts its life as a hollow or blank holder, that only
  80  * gets populated for a given engine once we receive an execbuffer. If later
  81  * on we receive another execbuffer ioctl for the same context but a different
  82  * engine, we allocate/populate a new ringbuffer and context backing object and
  83  * so on.
  84  *
  85  * Finally, regarding local contexts created using the ioctl call: as they are
  86  * only allowed with the render ring, we can allocate & populate them right
  87  * away (no need to defer anything, at least for now).
  88  *
  89  * Execlists implementation:
  90  * Execlists are the new method by which, on gen8+ hardware, workloads are
  91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
  92  * This method works as follows:
  93  *
  94  * When a request is committed, its commands (the BB start and any leading or
  95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
  96  * for the appropriate context. The tail pointer in the hardware context is not
  97  * updated at this time, but instead, kept by the driver in the ringbuffer
  98  * structure. A structure representing this request is added to a request queue
  99  * for the appropriate engine: this structure contains a copy of the context's
 100  * tail after the request was written to the ring buffer and a pointer to the
 101  * context itself.
 102  *
 103  * If the engine's request queue was empty before the request was added, the
 104  * queue is processed immediately. Otherwise the queue will be processed during
 105  * a context switch interrupt. In any case, elements on the queue will get sent
 106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 107  * globally unique 20-bits submission ID.
 108  *
 109  * When execution of a request completes, the GPU updates the context status
 110  * buffer with a context complete event and generates a context switch interrupt.
 111  * During the interrupt handling, the driver examines the events in the buffer:
 112  * for each context complete event, if the announced ID matches that on the head
 113  * of the request queue, then that request is retired and removed from the queue.
 114  *
 115  * After processing, if any requests were retired and the queue is not empty
 116  * then a new execution list can be submitted. The two requests at the front of
 117  * the queue are next to be submitted but since a context may not occur twice in
 118  * an execution list, if subsequent requests have the same ID as the first then
 119  * the two requests must be combined. This is done simply by discarding requests
 120  * at the head of the queue until either only one request is left (in which case
 121  * we use a NULL second context) or the first two requests have unique IDs.
 122  *
 123  * By always executing the first two requests in the queue the driver ensures
 124  * that the GPU is kept as busy as possible. In the case where a single context
 125  * completes but a second context is still executing, the request for this second
 126  * context will be at the head of the queue when we remove the first one. This
 127  * request will then be resubmitted along with a new request for a different context,
 128  * which will cause the hardware to continue executing the second request and queue
 129  * the new request (the GPU detects the condition of a context getting preempted
 130  * with the same context and optimizes the context switch flow by not doing
 131  * preemption, but just sampling the new tail pointer).
 132  *
 133  */
 134 #include <linux/interrupt.h>
 135
 136 #include "gem/i915_gem_context.h"
 137
 138 #include "i915_drv.h"
 139 #include "i915_perf.h"
 140 #include "i915_trace.h"
 141 #include "i915_vgpu.h"
 142 #include "intel_engine_pm.h"
 143 #include "intel_gt.h"
 144 #include "intel_gt_pm.h"
 145 #include "intel_lrc_reg.h"
 146 #include "intel_mocs.h"
 147 #include "intel_reset.h"
 148 #include "intel_workarounds.h"
 149
 /* RING_EXECLIST_* flag bits (shift amounts given in decimal). */
 #define RING_EXECLIST_QFULL             (1 << 2)
 #define RING_EXECLIST1_VALID            (1 << 3)
 #define RING_EXECLIST0_VALID            (1 << 4)
 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 14)
 #define RING_EXECLIST1_ACTIVE           (1 << 17)
 #define RING_EXECLIST0_ACTIVE           (1 << 18)

 /* GEN8_CTX_STATUS_* event flag bits. */
 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)

 /* Either of these flags is treated as "completed" by this mask. */
 #define GEN8_CTX_STATUS_COMPLETED_MASK \
          (GEN8_CTX_STATUS_PREEMPTED | GEN8_CTX_STATUS_COMPLETE)

 /* One-shot flag kept in the cached context descriptor (bit 2). */
 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

 /* Typical size, in bytes, of the average request (2 pipecontrols and a MI_BB) */
 #define EXECLISTS_REQUEST_SIZE 64

 /* Size of the workaround tail that unwind_wa_tail() strips, in dwords/bytes. */
 #define WA_TAIL_DWORDS 2
 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 173
 174 struct virtual_engine {
 175         struct intel_engine_cs base;
 176         struct intel_context context;
 177
 178         /*
 179          * We allow only a single request through the virtual engine at a time
 180          * (each request in the timeline waits for the completion fence of
 181          * the previous before being submitted). By restricting ourselves to
 182          * only submitting a single request, each request is placed on to a
 183          * physical to maximise load spreading (by virtue of the late greedy
 184          * scheduling -- each real engine takes the next available request
 185          * upon idling).
 186          */
 187         struct i915_request *request;
 188
 189         /*
 190          * We keep a rbtree of available virtual engines inside each physical
 191          * engine, sorted by priority. Here we preallocate the nodes we need
 192          * for the virtual engine, indexed by physical_engine->id.
 193          */
 194         struct ve_node {
 195                 struct rb_node rb;
 196                 int prio;
 197         } nodes[I915_NUM_ENGINES];
 198
 199         /*
 200          * Keep track of bonded pairs -- restrictions upon on our selection
 201          * of physical engines any particular request may be submitted to.
 202          * If we receive a submit-fence from a master engine, we will only
 203          * use one of sibling_mask physical engines.
 204          */
 205         struct ve_bond {
 206                 const struct intel_engine_cs *master;
 207                 intel_engine_mask_t sibling_mask;
 208         } *bonds;
 209         unsigned int num_bonds;
 210
 211         /* And finally, which physical engines this virtual engine maps onto. */
 212         unsigned int num_siblings;
 213         struct intel_engine_cs *siblings[0];
 214 };
 215
 216 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
 217 {
 218         GEM_BUG_ON(!intel_engine_is_virtual(engine));
 219         return container_of(engine, struct virtual_engine, base);
 220 }
 221
 222 static int __execlists_context_alloc(struct intel_context *ce,
 223                                      struct intel_engine_cs *engine);
 224
 225 static void execlists_init_reg_state(u32 *reg_state,
 226                                      struct intel_context *ce,
 227                                      struct intel_engine_cs *engine,
 228                                      struct intel_ring *ring);
 229
 230 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
 231 {
 232         return (i915_ggtt_offset(engine->status_page.vma) +
 233                 I915_GEM_HWS_PREEMPT_ADDR);
 234 }
 235
 236 static inline void
 237 ring_set_paused(const struct intel_engine_cs *engine, int state)
 238 {
 239         /*
 240          * We inspect HWS_PREEMPT with a semaphore inside
 241          * engine->emit_fini_breadcrumb. If the dword is true,
 242          * the ring is paused as the semaphore will busywait
 243          * until the dword is false.
 244          */
 245         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
 246         if (state)
 247                 wmb();
 248 }
 249
 250 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 251 {
 252         return rb_entry(rb, struct i915_priolist, node);
 253 }
 254
 255 static inline int rq_prio(const struct i915_request *rq)
 256 {
 257         return rq->sched.attr.priority;
 258 }
 259
 260 static int effective_prio(const struct i915_request *rq)
 261 {
 262         int prio = rq_prio(rq);
 263
 264         /*
 265          * If this request is special and must not be interrupted at any
 266          * cost, so be it. Note we are only checking the most recent request
 267          * in the context and so may be masking an earlier vip request. It
 268          * is hoped that under the conditions where nopreempt is used, this
 269          * will not matter (i.e. all requests to that context will be
 270          * nopreempt for as long as desired).
 271          */
 272         if (i915_request_has_nopreempt(rq))
 273                 prio = I915_PRIORITY_UNPREEMPTABLE;
 274
 275         /*
 276          * On unwinding the active request, we give it a priority bump
 277          * if it has completed waiting on any semaphore. If we know that
 278          * the request has already started, we can prevent an unwanted
 279          * preempt-to-idle cycle by taking that into account now.
 280          */
 281         if (__i915_request_has_started(rq))
 282                 prio |= I915_PRIORITY_NOSEMAPHORE;
 283
 284         /* Restrict mere WAIT boosts from triggering preemption */
 285         BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
 286         return prio | __NO_PREEMPTION;
 287 }
 288
 289 static int queue_prio(const struct intel_engine_execlists *execlists)
 290 {
 291         struct i915_priolist *p;
 292         struct rb_node *rb;
 293
 294         rb = rb_first_cached(&execlists->queue);
 295         if (!rb)
 296                 return INT_MIN;
 297
 298         /*
 299          * As the priolist[] are inverted, with the highest priority in [0],
 300          * we have to flip the index value to become priority.
 301          */
 302         p = to_priolist(rb);
 303         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
 304 }
 305
 306 static inline bool need_preempt(const struct intel_engine_cs *engine,
 307                                 const struct i915_request *rq,
 308                                 struct rb_node *rb)
 309 {
 310         int last_prio;
 311
 312         if (!intel_engine_has_semaphores(engine))
 313                 return false;
 314
 315         /*
 316          * Check if the current priority hint merits a preemption attempt.
 317          *
 318          * We record the highest value priority we saw during rescheduling
 319          * prior to this dequeue, therefore we know that if it is strictly
 320          * less than the current tail of ESLP[0], we do not need to force
 321          * a preempt-to-idle cycle.
 322          *
 323          * However, the priority hint is a mere hint that we may need to
 324          * preempt. If that hint is stale or we may be trying to preempt
 325          * ourselves, ignore the request.
 326          */
 327         last_prio = effective_prio(rq);
 328         if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
 329                                          last_prio))
 330                 return false;
 331
 332         /*
 333          * Check against the first request in ELSP[1], it will, thanks to the
 334          * power of PI, be the highest priority of that context.
 335          */
 336         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
 337             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
 338                 return true;
 339
 340         if (rb) {
 341                 struct virtual_engine *ve =
 342                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
 343                 bool preempt = false;
 344
 345                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
 346                         struct i915_request *next;
 347
 348                         rcu_read_lock();
 349                         next = READ_ONCE(ve->request);
 350                         if (next)
 351                                 preempt = rq_prio(next) > last_prio;
 352                         rcu_read_unlock();
 353                 }
 354
 355                 if (preempt)
 356                         return preempt;
 357         }
 358
 359         /*
 360          * If the inflight context did not trigger the preemption, then maybe
 361          * it was the set of queued requests? Pick the highest priority in
 362          * the queue (the first active priolist) and see if it deserves to be
 363          * running instead of ELSP[0].
 364          *
 365          * The highest priority request in the queue can not be either
 366          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
 367          * context, it's priority would not exceed ELSP[0] aka last_prio.
 368          */
 369         return queue_prio(&engine->execlists) > last_prio;
 370 }
 371
 372 __maybe_unused static inline bool
 373 assert_priority_queue(const struct i915_request *prev,
 374                       const struct i915_request *next)
 375 {
 376         /*
 377          * Without preemption, the prev may refer to the still active element
 378          * which we refuse to let go.
 379          *
 380          * Even with preemption, there are times when we think it is better not
 381          * to preempt and leave an ostensibly lower priority request in flight.
 382          */
 383         if (i915_request_is_active(prev))
 384                 return true;
 385
 386         return rq_prio(prev) >= rq_prio(next);
 387 }
 388
 389 /*
 390  * The context descriptor encodes various attributes of a context,
 391  * including its GTT address and some flags. Because it's fairly
 392  * expensive to calculate, we'll just do it once and cache the result,
 393  * which remains valid until the context is unpinned.
 394  *
 395  * This is what a descriptor looks like, from LSB to MSB::
 396  *
 397  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 398  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 399  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 400  *      bits 53-54:    mbz, reserved for use by hardware
 401  *      bits 55-63:    group ID, currently unused and set to 0
 402  *
 403  * Starting from Gen11, the upper dword of the descriptor has a new format:
 404  *
 405  *      bits 32-36:    reserved
 406  *      bits 37-47:    SW context ID
 407  *      bits 48-53:    engine instance
 408  *      bit 54:        mbz, reserved for use by hardware
 409  *      bits 55-60:    SW counter
 410  *      bits 61-63:    engine class
 411  *
 412  * engine info, SW context ID and SW counter need to form a unique number
 413  * (Context ID) per lrc.
 414  */
 415 static u64
 416 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
 417 {
 418         struct i915_gem_context *ctx = ce->gem_context;
 419         u64 desc;
 420
 421         BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
 422         BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));
 423
 424         desc = INTEL_LEGACY_32B_CONTEXT;
 425         if (i915_vm_is_4lvl(ce->vm))
 426                 desc = INTEL_LEGACY_64B_CONTEXT;
 427         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
 428
 429         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
 430         if (IS_GEN(engine->i915, 8))
 431                 desc |= GEN8_CTX_L3LLC_COHERENT;
 432
 433         desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
 434                                                                 /* bits 12-31 */
 435         /*
 436          * The following 32bits are copied into the OA reports (dword 2).
 437          * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
 438          * anything below.
 439          */
 440         if (INTEL_GEN(engine->i915) >= 11) {
 441                 GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
 442                 desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
 443                                                                 /* bits 37-47 */
 444
 445                 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
 446                                                                 /* bits 48-53 */
 447
 448                 /* TODO: decide what to do with SW counter (bits 55-60) */
 449
 450                 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
 451                                                                 /* bits 61-63 */
 452         } else {
 453                 GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
 454                 desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;   /* bits 32-52 */
 455         }
 456
 457         return desc;
 458 }
 459
 460 static void unwind_wa_tail(struct i915_request *rq)
 461 {
 462         rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
 463         assert_ring_tail_valid(rq->ring, rq->tail);
 464 }
 465
 466 static struct i915_request *
 467 __unwind_incomplete_requests(struct intel_engine_cs *engine)
 468 {
 469         struct i915_request *rq, *rn, *active = NULL;
 470         struct list_head *uninitialized_var(pl);
 471         int prio = I915_PRIORITY_INVALID;
 472
 473         lockdep_assert_held(&engine->active.lock);
 474
 475         list_for_each_entry_safe_reverse(rq, rn,
 476                                          &engine->active.requests,
 477                                          sched.link) {
 478                 struct intel_engine_cs *owner;
 479
 480                 if (i915_request_completed(rq))
 481                         continue; /* XXX */
 482
 483                 __i915_request_unsubmit(rq);
 484                 unwind_wa_tail(rq);
 485
 486                 /*
 487                  * Push the request back into the queue for later resubmission.
 488                  * If this request is not native to this physical engine (i.e.
 489                  * it came from a virtual source), push it back onto the virtual
 490                  * engine so that it can be moved across onto another physical
 491                  * engine as load dictates.
 492                  */
 493                 owner = rq->hw_context->engine;
 494                 if (likely(owner == engine)) {
 495                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
 496                         if (rq_prio(rq) != prio) {
 497                                 prio = rq_prio(rq);
 498                                 pl = i915_sched_lookup_priolist(engine, prio);
 499                         }
 500                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 501
 502                         list_move(&rq->sched.link, pl);
 503                         active = rq;
 504                 } else {
 505                         /*
 506                          * Decouple the virtual breadcrumb before moving it
 507                          * back to the virtual engine -- we don't want the
 508                          * request to complete in the background and try
 509                          * and cancel the breadcrumb on the virtual engine
 510                          * (instead of the old engine where it is linked)!
 511                          */
 512                         if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
 513                                      &rq->fence.flags)) {
 514                                 spin_lock(&rq->lock);
 515                                 i915_request_cancel_breadcrumb(rq);
 516                                 spin_unlock(&rq->lock);
 517                         }
 518                         rq->engine = owner;
 519                         owner->submit_request(rq);
 520                         active = NULL;
 521                 }
 522         }
 523
 524         return active;
 525 }
 526
 527 struct i915_request *
 528 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
 529 {
 530         struct intel_engine_cs *engine =
 531                 container_of(execlists, typeof(*engine), execlists);
 532
 533         return __unwind_incomplete_requests(engine);
 534 }
 535
 536 static inline void
 537 execlists_context_status_change(struct i915_request *rq, unsigned long status)
 538 {
 539         /*
 540          * Only used when GVT-g is enabled now. When GVT-g is disabled,
 541          * The compiler should eliminate this function as dead-code.
 542          */
 543         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
 544                 return;
 545
 546         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
 547                                    status, rq);
 548 }
 549
 550 static inline struct intel_engine_cs *
 551 __execlists_schedule_in(struct i915_request *rq)
 552 {
 553         struct intel_engine_cs * const engine = rq->engine;
 554         struct intel_context * const ce = rq->hw_context;
 555
 556         intel_context_get(ce);
 557
 558         intel_gt_pm_get(engine->gt);
 559         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
 560         intel_engine_context_in(engine);
 561
 562         return engine;
 563 }
 564
 565 static inline struct i915_request *
 566 execlists_schedule_in(struct i915_request *rq, int idx)
 567 {
 568         struct intel_context * const ce = rq->hw_context;
 569         struct intel_engine_cs *old;
 570
 571         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
 572         trace_i915_request_in(rq, idx);
 573
 574         old = READ_ONCE(ce->inflight);
 575         do {
 576                 if (!old) {
 577                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
 578                         break;
 579                 }
 580         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
 581
 582         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
 583         return i915_request_get(rq);
 584 }
 585
 586 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
 587 {
 588         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
 589         struct i915_request *next = READ_ONCE(ve->request);
 590
 591         if (next && next->execution_mask & ~rq->execution_mask)
 592                 tasklet_schedule(&ve->base.execlists.tasklet);
 593 }
 594
 595 static inline void
 596 __execlists_schedule_out(struct i915_request *rq,
 597                          struct intel_engine_cs * const engine)
 598 {
 599         struct intel_context * const ce = rq->hw_context;
 600
 601         intel_engine_context_out(engine);
 602         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
 603         intel_gt_pm_put(engine->gt);
 604
 605         /*
 606          * If this is part of a virtual engine, its next request may
 607          * have been blocked waiting for access to the active context.
 608          * We have to kick all the siblings again in case we need to
 609          * switch (e.g. the next request is not runnable on this
 610          * engine). Hopefully, we will already have submitted the next
 611          * request before the tasklet runs and do not need to rebuild
 612          * each virtual tree and kick everyone again.
 613          */
 614         if (ce->engine != engine)
 615                 kick_siblings(rq, ce);
 616
 617         intel_context_put(ce);
 618 }
 619
 620 static inline void
 621 execlists_schedule_out(struct i915_request *rq)
 622 {
 623         struct intel_context * const ce = rq->hw_context;
 624         struct intel_engine_cs *cur, *old;
 625
 626         trace_i915_request_out(rq);
 627         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
 628
 629         old = READ_ONCE(ce->inflight);
 630         do
 631                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
 632         while (!try_cmpxchg(&ce->inflight, &old, cur));
 633         if (!cur)
 634                 __execlists_schedule_out(rq, old);
 635
 636         i915_request_put(rq);
 637 }
 638
 639 static u64 execlists_update_context(const struct i915_request *rq)
 640 {
 641         struct intel_context *ce = rq->hw_context;
 642         u64 desc;
 643
 644         ce->lrc_reg_state[CTX_RING_TAIL + 1] =
 645                 intel_ring_set_tail(rq->ring, rq->tail);
 646
 647         /*
 648          * Make sure the context image is complete before we submit it to HW.
 649          *
 650          * Ostensibly, writes (including the WCB) should be flushed prior to
 651          * an uncached write such as our mmio register access, the empirical
 652          * evidence (esp. on Braswell) suggests that the WC write into memory
 653          * may not be visible to the HW prior to the completion of the UC
 654          * register write and that we may begin execution from the context
 655          * before its image is complete leading to invalid PD chasing.
 656          *
 657          * Furthermore, Braswell, at least, wants a full mb to be sure that
 658          * the writes are coherent in memory (visible to the GPU) prior to
 659          * execution, and not just visible to other CPUs (as is the result of
 660          * wmb).
 661          */
 662         mb();
 663
 664         desc = ce->lrc_desc;
 665         ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
 666
 667         return desc;
 668 }
 669
 670 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
 671 {
 672         if (execlists->ctrl_reg) {
 673                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
 674                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
 675         } else {
 676                 writel(upper_32_bits(desc), execlists->submit_reg);
 677                 writel(lower_32_bits(desc), execlists->submit_reg);
 678         }
 679 }
 680
 681 static __maybe_unused void
 682 trace_ports(const struct intel_engine_execlists *execlists,
 683             const char *msg,
 684             struct i915_request * const *ports)
 685 {
 686         const struct intel_engine_cs *engine =
 687                 container_of(execlists, typeof(*engine), execlists);
 688
 689         GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
 690                   engine->name, msg,
 691                   ports[0]->fence.context,
 692                   ports[0]->fence.seqno,
 693                   i915_request_completed(ports[0]) ? "!" :
 694                   i915_request_started(ports[0]) ? "*" :
 695                   "",
 696                   ports[1] ? ports[1]->fence.context : 0,
 697                   ports[1] ? ports[1]->fence.seqno : 0);
 698 }
 699
 700 static __maybe_unused bool
 701 assert_pending_valid(const struct intel_engine_execlists *execlists,
 702                      const char *msg)
 703 {
 704         struct i915_request * const *port, *rq;
 705         struct intel_context *ce = NULL;
 706
 707         trace_ports(execlists, msg, execlists->pending);
 708
 709         if (!execlists->pending[0])
 710                 return false;
 711
 712         if (execlists->pending[execlists_num_ports(execlists)])
 713                 return false;
 714
 715         for (port = execlists->pending; (rq = *port); port++) {
 716                 if (ce == rq->hw_context)
 717                         return false;
 718
 719                 ce = rq->hw_context;
 720                 if (i915_request_completed(rq))
 721                         continue;
 722
 723                 if (i915_active_is_idle(&ce->active))
 724                         return false;
 725
 726                 if (!i915_vma_is_pinned(ce->state))
 727                         return false;
 728         }
 729
 730         return ce;
 731 }
 732
 733 static void execlists_submit_ports(struct intel_engine_cs *engine)
 734 {
 735         struct intel_engine_execlists *execlists = &engine->execlists;
 736         unsigned int n;
 737
 738         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
 739
 740         /*
 741          * We can skip acquiring intel_runtime_pm_get() here as it was taken
 742          * on our behalf by the request (see i915_gem_mark_busy()) and it will
 743          * not be relinquished until the device is idle (see
 744          * i915_gem_idle_work_handler()). As a precaution, we make sure
 745          * that all ELSP are drained i.e. we have processed the CSB,
 746          * before allowing ourselves to idle and calling intel_runtime_pm_put().
 747          */
 748         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
 749
 750         /*
 751          * ELSQ note: the submit queue is not cleared after being submitted
 752          * to the HW so we need to make sure we always clean it up. This is
 753          * currently ensured by the fact that we always write the same number
 754          * of elsq entries, keep this in mind before changing the loop below.
 755          */
 756         for (n = execlists_num_ports(execlists); n--; ) {
 757                 struct i915_request *rq = execlists->pending[n];
 758
 759                 write_desc(execlists,
 760                            rq ? execlists_update_context(rq) : 0,
 761                            n);
 762         }
 763
 764         /* we need to manually load the submit queue */
 765         if (execlists->ctrl_reg)
 766                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
 767 }
 768
 769 static bool ctx_single_port_submission(const struct intel_context *ce)
 770 {
 771         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
 772                 i915_gem_context_force_single_submission(ce->gem_context));
 773 }
 774
 775 static bool can_merge_ctx(const struct intel_context *prev,
 776                           const struct intel_context *next)
 777 {
 778         if (prev != next)
 779                 return false;
 780
 781         if (ctx_single_port_submission(prev))
 782                 return false;
 783
 784         return true;
 785 }
 786
 787 static bool can_merge_rq(const struct i915_request *prev,
 788                          const struct i915_request *next)
 789 {
 790         GEM_BUG_ON(prev == next);
 791         GEM_BUG_ON(!assert_priority_queue(prev, next));
 792
 793         if (!can_merge_ctx(prev->hw_context, next->hw_context))
 794                 return false;
 795
 796         return true;
 797 }
 798
 799 static void virtual_update_register_offsets(u32 *regs,
 800                                             struct intel_engine_cs *engine)
 801 {
 802         u32 base = engine->mmio_base;
 803
 804         /* Must match execlists_init_reg_state()! */
 805
 806         regs[CTX_CONTEXT_CONTROL] =
 807                 i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base));
 808         regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
 809         regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
 810         regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
 811         regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
 812
 813         regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
 814         regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
 815         regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
 816         regs[CTX_SECOND_BB_HEAD_U] =
 817                 i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
 818         regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
 819         regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
 820
 821         regs[CTX_CTX_TIMESTAMP] =
 822                 i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
 823         regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3));
 824         regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3));
 825         regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2));
 826         regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2));
 827         regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1));
 828         regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1));
 829         regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
 830         regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0));
 831
 832         if (engine->class == RENDER_CLASS) {
 833                 regs[CTX_RCS_INDIRECT_CTX] =
 834                         i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
 835                 regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
 836                         i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
 837                 regs[CTX_BB_PER_CTX_PTR] =
 838                         i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
 839
 840                 regs[CTX_R_PWR_CLK_STATE] =
 841                         i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
 842         }
 843 }
 844
 845 static bool virtual_matches(const struct virtual_engine *ve,
 846                             const struct i915_request *rq,
 847                             const struct intel_engine_cs *engine)
 848 {
 849         const struct intel_engine_cs *inflight;
 850
 851         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
 852                 return false;
 853
 854         /*
 855          * We track when the HW has completed saving the context image
 856          * (i.e. when we have seen the final CS event switching out of
 857          * the context) and must not overwrite the context image before
 858          * then. This restricts us to only using the active engine
 859          * while the previous virtualized request is inflight (so
 860          * we reuse the register offsets). This is a very small
 861          * hystersis on the greedy seelction algorithm.
 862          */
 863         inflight = intel_context_inflight(&ve->context);
 864         if (inflight && inflight != engine)
 865                 return false;
 866
 867         return true;
 868 }
 869
 870 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
 871                                      struct intel_engine_cs *engine)
 872 {
 873         struct intel_engine_cs *old = ve->siblings[0];
 874
 875         /* All unattached (rq->engine == old) must already be completed */
 876
 877         spin_lock(&old->breadcrumbs.irq_lock);
 878         if (!list_empty(&ve->context.signal_link)) {
 879                 list_move_tail(&ve->context.signal_link,
 880                                &engine->breadcrumbs.signalers);
 881                 intel_engine_queue_breadcrumbs(engine);
 882         }
 883         spin_unlock(&old->breadcrumbs.irq_lock);
 884 }
 885
 886 static struct i915_request *
 887 last_active(const struct intel_engine_execlists *execlists)
 888 {
 889         struct i915_request * const *last = execlists->active;
 890
 891         while (*last && i915_request_completed(*last))
 892                 last++;
 893
 894         return *last;
 895 }
 896
 897 static void defer_request(struct i915_request *rq, struct list_head * const pl)
 898 {
 899         LIST_HEAD(list);
 900
 901         /*
 902          * We want to move the interrupted request to the back of
 903          * the round-robin list (i.e. its priority level), but
 904          * in doing so, we must then move all requests that were in
 905          * flight and were waiting for the interrupted request to
 906          * be run after it again.
 907          */
 908         do {
 909                 struct i915_dependency *p;
 910
 911                 GEM_BUG_ON(i915_request_is_active(rq));
 912                 list_move_tail(&rq->sched.link, pl);
 913
 914                 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
 915                         struct i915_request *w =
 916                                 container_of(p->waiter, typeof(*w), sched);
 917
 918                         /* Leave semaphores spinning on the other engines */
 919                         if (w->engine != rq->engine)
 920                                 continue;
 921
 922                         /* No waiter should start before its signaler */
 923                         GEM_BUG_ON(i915_request_started(w) &&
 924                                    !i915_request_completed(rq));
 925
 926                         GEM_BUG_ON(i915_request_is_active(w));
 927                         if (list_empty(&w->sched.link))
 928                                 continue; /* Not yet submitted; unready */
 929
 930                         if (rq_prio(w) < rq_prio(rq))
 931                                 continue;
 932
 933                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
 934                         list_move_tail(&w->sched.link, &list);
 935                 }
 936
 937                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
 938         } while (rq);
 939 }
 940
 941 static void defer_active(struct intel_engine_cs *engine)
 942 {
 943         struct i915_request *rq;
 944
 945         rq = __unwind_incomplete_requests(engine);
 946         if (!rq)
 947                 return;
 948
 949         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
 950 }
 951
 952 static bool
 953 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
 954 {
 955         int hint;
 956
 957         if (!intel_engine_has_semaphores(engine))
 958                 return false;
 959
 960         if (list_is_last(&rq->sched.link, &engine->active.requests))
 961                 return false;
 962
 963         hint = max(rq_prio(list_next_entry(rq, sched.link)),
 964                    engine->execlists.queue_priority_hint);
 965
 966         return hint >= effective_prio(rq);
 967 }
 968
 969 static int
 970 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
 971 {
 972         if (list_is_last(&rq->sched.link, &engine->active.requests))
 973                 return INT_MIN;
 974
 975         return rq_prio(list_next_entry(rq, sched.link));
 976 }
 977
 978 static bool
 979 enable_timeslice(const struct intel_engine_execlists *execlists)
 980 {
 981         const struct i915_request *rq = *execlists->active;
 982
 983         if (i915_request_completed(rq))
 984                 return false;
 985
 986         return execlists->switch_priority_hint >= effective_prio(rq);
 987 }
 988
 989 static void record_preemption(struct intel_engine_execlists *execlists)
 990 {
 991         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
 992 }
 993
 994 static void execlists_dequeue(struct intel_engine_cs *engine)
 995 {
             /*
              * Choose what to run next on @engine: fill execlists->pending[]
              * from the virtual-engine tree and then from the priority queue,
              * and if anything was picked, write it out through
              * execlists_submit_ports(). Both early returns below happen
              * before anything has been stored in pending[].
              */
 996         struct intel_engine_execlists * const execlists = &engine->execlists;
 997         struct i915_request **port = execlists->pending;
 998         struct i915_request ** const last_port = port + execlists->port_mask;
 999         struct i915_request *last;
1000         struct rb_node *rb;
1001         bool submit = false;
1002
1003         /*
1004          * Hardware submission is through 2 ports. Conceptually each port
1005          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1006          * static for a context, and unique to each, so we only execute
1007          * requests belonging to a single context from each ring. RING_HEAD
1008          * is maintained by the CS in the context image, it marks the place
1009          * where it got up to last time, and through RING_TAIL we tell the CS
1010          * where we want to execute up to this time.
1011          *
1012          * In this list the requests are in order of execution. Consecutive
1013          * requests from the same context are adjacent in the ringbuffer. We
1014          * can combine these requests into a single RING_TAIL update:
1015          *
1016          *              RING_HEAD...req1...req2
1017          *                                    ^- RING_TAIL
1018          * since to execute req2 the CS must first execute req1.
1019          *
1020          * Our goal then is to point each port to the end of a consecutive
1021          * sequence of requests as being the most optimal (fewest wake ups
1022          * and context switches) submission.
1023          */
1024
             /*
              * Peek for the first virtual engine whose request this engine is
              * allowed to run, erasing nodes whose request has already gone.
              * rb is NULL afterwards if nothing matched. ve->request is read
              * here without ve->base.active.lock; it is read again under that
              * lock in the second loop further down.
              */
1025         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1026                 struct virtual_engine *ve =
1027                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1028                 struct i915_request *rq = READ_ONCE(ve->request);
1029
1030                 if (!rq) { /* lazily cleanup after another engine handled rq */
1031                         rb_erase_cached(rb, &execlists->virtual);
1032                         RB_CLEAR_NODE(rb);
1033                         rb = rb_first_cached(&execlists->virtual);
1034                         continue;
1035                 }
1036
1037                 if (!virtual_matches(ve, rq, engine)) {
1038                         rb = rb_next(rb);
1039                         continue;
1040                 }
1041
1042                 break;
1043         }
1044
1045         /*
1046          * If the queue is higher priority than the last
1047          * request in the currently active context, submit afresh.
1048          * We will resubmit again afterwards in case we need to split
1049          * the active context to interject the preemption request,
1050          * i.e. we will retrigger preemption following the ack in case
1051          * of trouble.
1052          */
1053         last = last_active(execlists);
1054         if (last) {
1055                 if (need_preempt(engine, last, rb)) {
1056                         GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
1057                                   engine->name,
1058                                   last->fence.context,
1059                                   last->fence.seqno,
1060                                   last->sched.attr.priority,
1061                                   execlists->queue_priority_hint);
1062                         record_preemption(execlists);
1063
1064                         /*
1065                          * Don't let the RING_HEAD advance past the breadcrumb
1066                          * as we unwind (and until we resubmit) so that we do
1067                          * not accidentally tell it to go backwards.
1068                          */
1069                         ring_set_paused(engine, 1);
1070
1071                         /*
1072                          * Note that we have not stopped the GPU at this point,
1073                          * so we are unwinding the incomplete requests as they
1074                          * remain inflight and so by the time we do complete
1075                          * the preemption, some of the unwound requests may
1076                          * complete!
1077                          */
1078                         __unwind_incomplete_requests(engine);
1079
1080                         /*
1081                          * If we need to return to the preempted context, we
1082                          * need to skip the lite-restore and force it to
1083                          * reload the RING_TAIL. Otherwise, the HW has a
1084                          * tendency to ignore us rewinding the TAIL to the
1085                          * end of an earlier request.
1086                          */
1087                         last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1088                         last = NULL;
1089                 } else if (need_timeslice(engine, last) &&
1090                            !timer_pending(&engine->execlists.timer)) {
1091                         GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
1092                                   engine->name,
1093                                   last->fence.context,
1094                                   last->fence.seqno,
1095                                   last->sched.attr.priority,
1096                                   execlists->queue_priority_hint);
1097
1098                         ring_set_paused(engine, 1);
1099                         defer_active(engine);
1100
1101                         /*
1102                          * Unlike for preemption, if we rewind and continue
1103                          * executing the same context as previously active,
1104                          * the order of execution will remain the same and
1105                          * the tail will only advance. We do not need to
1106                          * force a full context restore, as a lite-restore
1107                          * is sufficient to resample the monotonic TAIL.
1108                          *
1109                          * If we switch to any other context, similarly we
1110                          * will not rewind TAIL of current context, and
1111                          * normal save/restore will preserve state and allow
1112                          * us to later continue executing the same request.
1113                          */
1114                         last = NULL;
1115                 } else {
1116                         /*
1117                          * Otherwise if we already have a request pending
1118                          * for execution after the current one, we can
1119                          * just wait until the next CS event before
1120                          * queuing more. In either case we will force a
1121                          * lite-restore preemption event, but if we wait
1122                          * we hopefully coalesce several updates into a single
1123                          * submission.
1124                          */
1125                         if (!list_is_last(&last->sched.link,
1126                                           &engine->active.requests))
1127                                 return;
1128
1129                         /*
1130                          * WaIdleLiteRestore:bdw,skl
1131                          * Apply the wa NOOPs to prevent
1132                          * ring:HEAD == rq:TAIL as we resubmit the
1133                          * request. See gen8_emit_fini_breadcrumb() for
1134                          * where we prepare the padding after the
1135                          * end of the request.
1136                          */
1137                         last->tail = last->wa_tail;
1138                 }
1139         }
1140
1141         while (rb) { /* XXX virtual is always taking precedence */
1142                 struct virtual_engine *ve =
1143                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1144                 struct i915_request *rq;
1145
1146                 spin_lock(&ve->base.active.lock);
1147
1148                 rq = ve->request;
1149                 if (unlikely(!rq)) { /* lost the race to a sibling */
1150                         spin_unlock(&ve->base.active.lock);
1151                         rb_erase_cached(rb, &execlists->virtual);
1152                         RB_CLEAR_NODE(rb);
1153                         rb = rb_first_cached(&execlists->virtual);
1154                         continue;
1155                 }
1156
1157                 GEM_BUG_ON(rq != ve->request);
1158                 GEM_BUG_ON(rq->engine != &ve->base);
1159                 GEM_BUG_ON(rq->hw_context != &ve->context);
1160
                     /*
                      * Only take the virtual request if its priority is at
                      * least queue_prio(); otherwise drop through to the
                      * break and let the priority queue loop below run.
                      */
1161                 if (rq_prio(rq) >= queue_prio(execlists)) {
1162                         if (!virtual_matches(ve, rq, engine)) {
1163                                 spin_unlock(&ve->base.active.lock);
1164                                 rb = rb_next(rb);
1165                                 continue;
1166                         }
1167
                             /*
                              * Already completed: detach it from the virtual
                              * engine and submit it on this engine without
                              * updating last/submit, then rescan the tree
                              * from its first node.
                              */
1168                         if (i915_request_completed(rq)) {
1169                                 ve->request = NULL;
1170                                 ve->base.execlists.queue_priority_hint = INT_MIN;
1171                                 rb_erase_cached(rb, &execlists->virtual);
1172                                 RB_CLEAR_NODE(rb);
1173
1174                                 rq->engine = engine;
1175                                 __i915_request_submit(rq);
1176
1177                                 spin_unlock(&ve->base.active.lock);
1178
1179                                 rb = rb_first_cached(&execlists->virtual);
1180                                 continue;
1181                         }
1182
1183                         if (last && !can_merge_rq(last, rq)) {
1184                                 spin_unlock(&ve->base.active.lock);
1185                                 return; /* leave this for another */
1186                         }
1187
1188                         GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
1189                                   engine->name,
1190                                   rq->fence.context,
1191                                   rq->fence.seqno,
1192                                   i915_request_completed(rq) ? "!" :
1193                                   i915_request_started(rq) ? "*" :
1194                                   "",
1195                                   yesno(engine != ve->siblings[0]));
1196
1197                         ve->request = NULL;
1198                         ve->base.execlists.queue_priority_hint = INT_MIN;
1199                         rb_erase_cached(rb, &execlists->virtual);
1200                         RB_CLEAR_NODE(rb);
1201
1202                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1203                         rq->engine = engine;
1204
                             /*
                              * This engine is not siblings[0]: rewrite the
                              * register offsets in the context's register
                              * state for it, carry over any pending
                              * signalling, and swap it into siblings[0].
                              */
1205                         if (engine != ve->siblings[0]) {
1206                                 u32 *regs = ve->context.lrc_reg_state;
1207                                 unsigned int n;
1208
1209                                 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1210                                 virtual_update_register_offsets(regs, engine);
1211
1212                                 if (!list_empty(&ve->context.signals))
1213                                         virtual_xfer_breadcrumbs(ve, engine);
1214
1215                                 /*
1216                                  * Move the bound engine to the top of the list
1217                                  * for future execution. We then kick this
1218                                  * tasklet first before checking others, so that
1219                                  * we preferentially reuse this set of bound
1220                                  * registers.
1221                                  */
1222                                 for (n = 1; n < ve->num_siblings; n++) {
1223                                         if (ve->siblings[n] == engine) {
1224                                                 swap(ve->siblings[n],
1225                                                      ve->siblings[0]);
1226                                                 break;
1227                                         }
1228                                 }
1229
1230                                 GEM_BUG_ON(ve->siblings[0] != engine);
1231                         }
1232
                             /*
                              * The request only becomes 'last' (and so takes
                              * a port) if it has not completed by now.
                              */
1233                         __i915_request_submit(rq);
1234                         if (!i915_request_completed(rq)) {
1235                                 submit = true;
1236                                 last = rq;
1237                         }
1238                 }
1239
                     /* At most one virtual request is taken per call */
1240                 spin_unlock(&ve->base.active.lock);
1241                 break;
1242         }
1243
             /*
              * Walk the priority queue from its first node. A request that
              * can be merged with 'last' shares its port; otherwise 'last'
              * is scheduled into the current port and we advance to the
              * next one, stopping (goto done) once no further port may be
              * used.
              */
1244         while ((rb = rb_first_cached(&execlists->queue))) {
1245                 struct i915_priolist *p = to_priolist(rb);
1246                 struct i915_request *rq, *rn;
1247                 int i;
1248
1249                 priolist_for_each_request_consume(rq, rn, p, i) {
1250                         if (i915_request_completed(rq))
1251                                 goto skip;
1252
1253                         /*
1254                          * Can we combine this request with the current port?
1255                          * It has to be the same context/ringbuffer and not
1256                          * have any exceptions (e.g. GVT saying never to
1257                          * combine contexts).
1258                          *
1259                          * If we can combine the requests, we can execute both
1260                          * by updating the RING_TAIL to point to the end of the
1261                          * second request, and so we never need to tell the
1262                          * hardware about the first.
1263                          */
1264                         if (last && !can_merge_rq(last, rq)) {
1265                                 /*
1266                                  * If we are on the second port and cannot
1267                                  * combine this request with the last, then we
1268                                  * are done.
1269                                  */
1270                                 if (port == last_port)
1271                                         goto done;
1272
1273                                 /*
1274                                  * We must not populate both ELSP[] with the
1275                                  * same LRCA, i.e. we must submit 2 different
1276                                  * contexts if we submit 2 ELSP.
1277                                  */
1278                                 if (last->hw_context == rq->hw_context)
1279                                         goto done;
1280
1281                                 /*
1282                                  * If GVT overrides us we only ever submit
1283                                  * port[0], leaving port[1] empty. Note that we
1284                                  * also have to be careful that we don't queue
1285                                  * the same context (even though a different
1286                                  * request) to the second port.
1287                                  */
1288                                 if (ctx_single_port_submission(last->hw_context) ||
1289                                     ctx_single_port_submission(rq->hw_context))
1290                                         goto done;
1291
1292                                 *port = execlists_schedule_in(last, port - execlists->pending);
1293                                 port++;
1294                         }
1295
1296                         last = rq;
1297                         submit = true;
                             /*
                              * Completed requests jump straight here: they are
                              * still submitted but never become 'last'.
                              */
1298 skip:
1299                         __i915_request_submit(rq);
1300                 }
1301
                     /* Every request at this priority level was consumed */
1302                 rb_erase_cached(&p->node, &execlists->queue);
1303                 i915_priolist_free(p);
1304         }
1305
1306 done:
1307         /*
1308          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1309          *
1310          * We choose the priority hint such that if we add a request of greater
1311          * priority than this, we kick the submission tasklet to decide on
1312          * the right order of submitting the requests to hardware. We must
1313          * also be prepared to reorder requests as they are in-flight on the
1314          * HW. We derive the priority hint then as the first "hole" in
1315          * the HW submission ports and if there are no available slots,
1316          * the priority of the lowest executing request, i.e. last.
1317          *
1318          * When we do receive a higher priority request ready to run from the
1319          * user, see queue_request(), the priority hint is bumped to that
1320          * request triggering preemption on the next dequeue (or subsequent
1321          * interrupt for secondary ports).
1322          */
1323         execlists->queue_priority_hint = queue_prio(execlists);
1324         GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
1325                   engine->name, execlists->queue_priority_hint,
1326                   yesno(submit));
1327
             /*
              * On submit: schedule in the final request, zero the pending[]
              * slots after it up to last_port, write the ports out, and
              * refresh switch_priority_hint from pending[0]. Otherwise clear
              * any pause set by the preempt/timeslice branches above.
              */
1328         if (submit) {
1329                 *port = execlists_schedule_in(last, port - execlists->pending);
1330                 memset(port + 1, 0, (last_port - port) * sizeof(*port));
1331                 execlists_submit_ports(engine);
1332                 execlists->switch_priority_hint =
1333                         switch_prio(engine, *execlists->pending);
1334         } else {
1335                 ring_set_paused(engine, 0);
1336         }
1337 }
1338
1339 static void
1340 cancel_port_requests(struct intel_engine_execlists * const execlists)
1341 {
1342         struct i915_request * const *port, *rq;
1343
1344         for (port = execlists->pending; (rq = *port); port++)
1345                 execlists_schedule_out(rq);
1346         memset(execlists->pending, 0, sizeof(execlists->pending));
1347
1348         for (port = execlists->active; (rq = *port); port++)
1349                 execlists_schedule_out(rq);
1350         execlists->active =
1351                 memset(execlists->inflight, 0, sizeof(execlists->inflight));
1352 }
1353
1354 static inline void
1355 invalidate_csb_entries(const u32 *first, const u32 *last)
1356 {
1357         clflush((void *)first);
1358         clflush((void *)last);
1359 }
1360
1361 static inline bool
1362 reset_in_progress(const struct intel_engine_execlists *execlists)
1363 {
1364         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1365 }
1366
/* Classification of a context-status-buffer entry, see csb_parse() */
enum csb_step {
        CSB_NOP = 0,    /* none of the cases below applies */
        CSB_PROMOTE,    /* status has GEN8_CTX_STATUS_IDLE_ACTIVE set */
        CSB_PREEMPT,    /* status has GEN8_CTX_STATUS_PREEMPTED set */
        CSB_COMPLETE,   /* neither bit set, but a request is active */
};
1373
1374 static inline enum csb_step
1375 csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1376 {
1377         unsigned int status = *csb;
1378
1379         if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
1380                 return CSB_PROMOTE;
1381
1382         if (status & GEN8_CTX_STATUS_PREEMPTED)
1383                 return CSB_PREEMPT;
1384
1385         if (*execlists->active)
1386                 return CSB_COMPLETE;
1387
1388         return CSB_NOP;
1389 }
1390
1391 static void process_csb(struct intel_engine_cs *engine)
1392 {
1393         struct intel_engine_execlists * const execlists = &engine->execlists;
1394         const u32 * const buf = execlists->csb_status;
1395         const u8 num_entries = execlists->csb_size;
1396         u8 head, tail;
1397
1398         GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915));
1399
1400         /*
1401          * Note that csb_write, csb_status may be either in HWSP or mmio.
1402          * When reading from the csb_write mmio register, we have to be
1403          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
1404          * the low 4bits. As it happens we know the next 4bits are always
1405          * zero and so we can simply mask off the low u8 of the register
1406          * and treat it identically to reading from the HWSP (without having
1407          * to use explicit shifting and masking, and probably bifurcating
1408          * the code to handle the legacy mmio read).
1409          */
1410         head = execlists->csb_head;
1411         tail = READ_ONCE(*execlists->csb_write);
1412         GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
1413         if (unlikely(head == tail))
1414                 return;
1415
1416         /*
1417          * Hopefully paired with a wmb() in HW!
1418          *
1419          * We must complete the read of the write pointer before any reads
1420          * from the CSB, so that we do not see stale values. Without an rmb
1421          * (lfence) the HW may speculatively perform the CSB[] reads *before*
1422          * we perform the READ_ONCE(*csb_write).
1423          */
1424         rmb();
1425
1426         do {
1427                 if (++head == num_entries)
1428                         head = 0;
1429
1430                 /*
1431                  * We are flying near dragons again.
1432                  *
1433                  * We hold a reference to the request in execlist_port[]
1434                  * but no more than that. We are operating in softirq
1435                  * context and so cannot hold any mutex or sleep. That
1436                  * prevents us stopping the requests we are processing
1437                  * in port[] from being retired simultaneously (the
1438                  * breadcrumb will be complete before we see the
1439                  * context-switch). As we only hold the reference to the
1440                  * request, any pointer chasing underneath the request
1441                  * is subject to a potential use-after-free. Thus we
1442                  * store all of the bookkeeping within port[] as
1443                  * required, and avoid using unguarded pointers beneath
1444                  * request itself. The same applies to the atomic
1445                  * status notifier.
1446                  */
1447
1448                 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
1449                           engine->name, head,
1450                           buf[2 * head + 0], buf[2 * head + 1]);
1451
1452                 switch (csb_parse(execlists, buf + 2 * head)) {
1453                 case CSB_PREEMPT: /* cancel old inflight, prepare for switch */
1454                         trace_ports(execlists, "preempted", execlists->active);
1455
1456                         while (*execlists->active)
1457                                 execlists_schedule_out(*execlists->active++);
1458
1459                         /* fallthrough */
1460                 case CSB_PROMOTE: /* switch pending to inflight */
1461                         GEM_BUG_ON(*execlists->active);
1462                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
1463                         execlists->active =
1464                                 memcpy(execlists->inflight,
1465                                        execlists->pending,
1466                                        execlists_num_ports(execlists) *
1467                                        sizeof(*execlists->pending));
1468
1469                         if (enable_timeslice(execlists))
1470                                 mod_timer(&execlists->timer, jiffies + 1);
1471
1472                         if (!inject_preempt_hang(execlists))
1473                                 ring_set_paused(engine, 0);
1474
1475                         WRITE_ONCE(execlists->pending[0], NULL);
1476                         break;
1477
1478                 case CSB_COMPLETE: /* port0 completed, advanced to port1 */
1479                         trace_ports(execlists, "completed", execlists->active);
1480
1481                         /*
1482                          * We rely on the hardware being strongly
1483                          * ordered, that the breadcrumb write is
1484                          * coherent (visible from the CPU) before the
1485                          * user interrupt and CSB is processed.
1486                          */
1487                         GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
1488                                    !reset_in_progress(execlists));
1489                         execlists_schedule_out(*execlists->active++);
1490
1491                         GEM_BUG_ON(execlists->active - execlists->inflight >
1492                                    execlists_num_ports(execlists));
1493                         break;
1494
1495                 case CSB_NOP:
1496                         break;
1497                 }
1498         } while (head != tail);
1499
1500         execlists->csb_head = head;
1501
1502         /*
1503          * Gen11 has proven to fail wrt global observation point between
1504          * entry and tail update, failing on the ordering and thus
1505          * we see an old entry in the context status buffer.
1506          *
1507          * Forcibly evict out entries for the next gpu csb update,
1508          * to increase the odds that we get a fresh entries with non
1509          * working hardware. The cost for doing so comes out mostly with
1510          * the wash as hardware, working or not, will need to do the
1511          * invalidation before.
1512          */
1513         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
1514 }
1515
1516 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
1517 {
1518         lockdep_assert_held(&engine->active.lock);
1519         if (!engine->execlists.pending[0])
1520                 execlists_dequeue(engine);
1521 }
1522
1523 /*
1524  * Check the unread Context Status Buffers and manage the submission of new
1525  * contexts to the ELSP accordingly.
1526  */
1527 static void execlists_submission_tasklet(unsigned long data)
1528 {
1529         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
1530         unsigned long flags;
1531
1532         process_csb(engine);
1533         if (!READ_ONCE(engine->execlists.pending[0])) {
1534                 spin_lock_irqsave(&engine->active.lock, flags);
1535                 __execlists_submission_tasklet(engine);
1536                 spin_unlock_irqrestore(&engine->active.lock, flags);
1537         }
1538 }
1539
1540 static void execlists_submission_timer(struct timer_list *timer)
1541 {
1542         struct intel_engine_cs *engine =
1543                 from_timer(engine, timer, execlists.timer);
1544
1545         /* Kick the tasklet for some interrupt coalescing and reset handling */
1546         tasklet_hi_schedule(&engine->execlists.tasklet);
1547 }
1548
1549 static void queue_request(struct intel_engine_cs *engine,
1550                           struct i915_sched_node *node,
1551                           int prio)
1552 {
1553         GEM_BUG_ON(!list_empty(&node->link));
1554         list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
1555 }
1556
1557 static void __submit_queue_imm(struct intel_engine_cs *engine)
1558 {
1559         struct intel_engine_execlists * const execlists = &engine->execlists;
1560
1561         if (reset_in_progress(execlists))
1562                 return; /* defer until we restart the engine following reset */
1563
1564         if (execlists->tasklet.func == execlists_submission_tasklet)
1565                 __execlists_submission_tasklet(engine);
1566         else
1567                 tasklet_hi_schedule(&execlists->tasklet);
1568 }
1569
1570 static void submit_queue(struct intel_engine_cs *engine,
1571                          const struct i915_request *rq)
1572 {
1573         struct intel_engine_execlists *execlists = &engine->execlists;
1574
1575         if (rq_prio(rq) <= execlists->queue_priority_hint)
1576                 return;
1577
1578         execlists->queue_priority_hint = rq_prio(rq);
1579         __submit_queue_imm(engine);
1580 }
1581
1582 static void execlists_submit_request(struct i915_request *request)
1583 {
1584         struct intel_engine_cs *engine = request->engine;
1585         unsigned long flags;
1586
1587         /* Will be called from irq-context when using foreign fences. */
1588         spin_lock_irqsave(&engine->active.lock, flags);
1589
1590         queue_request(engine, &request->sched, rq_prio(request));
1591
1592         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1593         GEM_BUG_ON(list_empty(&request->sched.link));
1594
1595         submit_queue(engine, request);
1596
1597         spin_unlock_irqrestore(&engine->active.lock, flags);
1598 }
1599
1600 static void __execlists_context_fini(struct intel_context *ce)
1601 {
1602         intel_ring_put(ce->ring);
1603         i915_vma_put(ce->state);
1604 }
1605
1606 static void execlists_context_destroy(struct kref *kref)
1607 {
1608         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1609
1610         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1611         GEM_BUG_ON(intel_context_is_pinned(ce));
1612
1613         if (ce->state)
1614                 __execlists_context_fini(ce);
1615
1616         intel_context_fini(ce);
1617         intel_context_free(ce);
1618 }
1619
1620 static void
1621 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
1622 {
1623         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1624                 return;
1625
1626         vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
1627         vaddr += engine->context_size;
1628
1629         memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
1630 }
1631
1632 static void
1633 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
1634 {
1635         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1636                 return;
1637
1638         vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
1639         vaddr += engine->context_size;
1640
1641         if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
1642                 dev_err_once(engine->i915->drm.dev,
1643                              "%s context redzone overwritten!\n",
1644                              engine->name);
1645 }
1646
1647 static void execlists_context_unpin(struct intel_context *ce)
1648 {
1649         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
1650                       ce->engine);
1651
1652         i915_gem_context_unpin_hw_id(ce->gem_context);
1653         i915_gem_object_unpin_map(ce->state->obj);
1654         intel_ring_reset(ce->ring, ce->ring->tail);
1655 }
1656
1657 static void
1658 __execlists_update_reg_state(struct intel_context *ce,
1659                              struct intel_engine_cs *engine)
1660 {
1661         struct intel_ring *ring = ce->ring;
1662         u32 *regs = ce->lrc_reg_state;
1663
1664         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
1665         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1666
1667         regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma);
1668         regs[CTX_RING_HEAD + 1] = ring->head;
1669         regs[CTX_RING_TAIL + 1] = ring->tail;
1670
1671         /* RPCS */
1672         if (engine->class == RENDER_CLASS) {
1673                 regs[CTX_R_PWR_CLK_STATE + 1] =
1674                         intel_sseu_make_rpcs(engine->i915, &ce->sseu);
1675
1676                 i915_oa_init_reg_state(engine, ce, regs);
1677         }
1678 }
1679
1680 static int
1681 __execlists_context_pin(struct intel_context *ce,
1682                         struct intel_engine_cs *engine)
1683 {
1684         void *vaddr;
1685         int ret;
1686
1687         GEM_BUG_ON(!ce->state);
1688
1689         ret = intel_context_active_acquire(ce);
1690         if (ret)
1691                 goto err;
1692         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1693
1694         vaddr = i915_gem_object_pin_map(ce->state->obj,
1695                                         i915_coherent_map_type(engine->i915) |
1696                                         I915_MAP_OVERRIDE);
1697         if (IS_ERR(vaddr)) {
1698                 ret = PTR_ERR(vaddr);
1699                 goto unpin_active;
1700         }
1701
1702         ret = i915_gem_context_pin_hw_id(ce->gem_context);
1703         if (ret)
1704                 goto unpin_map;
1705
1706         ce->lrc_desc = lrc_descriptor(ce, engine);
1707         ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
1708         __execlists_update_reg_state(ce, engine);
1709
1710         return 0;
1711
1712 unpin_map:
1713         i915_gem_object_unpin_map(ce->state->obj);
1714 unpin_active:
1715         intel_context_active_release(ce);
1716 err:
1717         return ret;
1718 }
1719
1720 static int execlists_context_pin(struct intel_context *ce)
1721 {
1722         return __execlists_context_pin(ce, ce->engine);
1723 }
1724
1725 static int execlists_context_alloc(struct intel_context *ce)
1726 {
1727         return __execlists_context_alloc(ce, ce->engine);
1728 }
1729
1730 static void execlists_context_reset(struct intel_context *ce)
1731 {
1732         /*
1733          * Because we emit WA_TAIL_DWORDS there may be a disparity
1734          * between our bookkeeping in ce->ring->head and ce->ring->tail and
1735          * that stored in context. As we only write new commands from
1736          * ce->ring->tail onwards, everything before that is junk. If the GPU
1737          * starts reading from its RING_HEAD from the context, it may try to
1738          * execute that junk and die.
1739          *
1740          * The contexts that are stilled pinned on resume belong to the
1741          * kernel, and are local to each engine. All other contexts will
1742          * have their head/tail sanitized upon pinning before use, so they
1743          * will never see garbage,
1744          *
1745          * So to avoid that we reset the context images upon resume. For
1746          * simplicity, we just zero everything out.
1747          */
1748         intel_ring_reset(ce->ring, 0);
1749         __execlists_update_reg_state(ce, ce->engine);
1750 }
1751
1752 static const struct intel_context_ops execlists_context_ops = {
1753         .alloc = execlists_context_alloc,
1754
1755         .pin = execlists_context_pin,
1756         .unpin = execlists_context_unpin,
1757
1758         .enter = intel_context_enter_engine,
1759         .exit = intel_context_exit_engine,
1760
1761         .reset = execlists_context_reset,
1762         .destroy = execlists_context_destroy,
1763 };
1764
1765 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
1766 {
1767         u32 *cs;
1768
1769         GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb);
1770
1771         cs = intel_ring_begin(rq, 6);
1772         if (IS_ERR(cs))
1773                 return PTR_ERR(cs);
1774
1775         /*
1776          * Check if we have been preempted before we even get started.
1777          *
1778          * After this point i915_request_started() reports true, even if
1779          * we get preempted and so are no longer running.
1780          */
1781         *cs++ = MI_ARB_CHECK;
1782         *cs++ = MI_NOOP;
1783
1784         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1785         *cs++ = rq->timeline->hwsp_offset;
1786         *cs++ = 0;
1787         *cs++ = rq->fence.seqno - 1;
1788
1789         intel_ring_advance(rq, cs);
1790
1791         /* Record the updated position of the request's payload */
1792         rq->infix = intel_ring_offset(rq, cs);
1793
1794         return 0;
1795 }
1796
/*
 * emit_pdps - emit a reload of the page-directory pointer registers.
 *
 * Writes commands into @rq's ring that load the upper and lower dwords of
 * each of the GEN8_3LVL_PDPES PDP registers of rq->engine with the DMA
 * address of the matching page directory of the ppgtt attached to
 * rq->hw_context. The register load is bracketed by flush + invalidate
 * emissions both before and after.
 *
 * Returns 0 on success, or the error returned by engine->emit_flush() or
 * intel_ring_begin().
 */
1797 static int emit_pdps(struct i915_request *rq)
1798 {
1799         const struct intel_engine_cs * const engine = rq->engine;
1800         struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->hw_context->vm);
1801         int err, i;
1802         u32 *cs;
1803
1804         GEM_BUG_ON(intel_vgpu_active(rq->i915));
1805
1806         /*
1807          * Beware ye of the dragons, this sequence is magic!
1808          *
1809          * Small changes to this sequence can cause anything from
1810          * GPU hangs to forcewake errors and machine lockups!
1811          */
1812
1813         /* Flush any residual operations from the context load */
1814         err = engine->emit_flush(rq, EMIT_FLUSH);
1815         if (err)
1816                 return err;
1817
1818         /* Magic required to prevent forcewake errors! */
1819         err = engine->emit_flush(rq, EMIT_INVALIDATE);
1820         if (err)
1821                 return err;
1822
             /* 1 LRI header + 4 dwords (two reg/value pairs) per PDP + 1 MI_NOOP. */
1823         cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
1824         if (IS_ERR(cs))
1825                 return PTR_ERR(cs);
1826
1827         /* Ensure the LRI have landed before we invalidate & continue */
1828         *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
             /* Walks the PDPs from the highest index down to 0. */
1829         for (i = GEN8_3LVL_PDPES; i--; ) {
1830                 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
1831                 u32 base = engine->mmio_base;
1832
1833                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
1834                 *cs++ = upper_32_bits(pd_daddr);
1835                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
1836                 *cs++ = lower_32_bits(pd_daddr);
1837         }
1838         *cs++ = MI_NOOP;
1839
1840         intel_ring_advance(rq, cs);
1841
1842         /* Be doubly sure the LRI have landed before proceeding */
1843         err = engine->emit_flush(rq, EMIT_FLUSH);
1844         if (err)
1845                 return err;
1846
1847         /* Re-invalidate the TLB for luck */
1848         return engine->emit_flush(rq, EMIT_INVALIDATE);
1849 }
1850
1851 static int execlists_request_alloc(struct i915_request *request)
1852 {
1853         int ret;
1854
1855         GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1856
1857         /*
1858          * Flush enough space to reduce the likelihood of waiting after
1859          * we start building the request - in which case we will just
1860          * have to repeat work.
1861          */
1862         request->reserved_space += EXECLISTS_REQUEST_SIZE;
1863
1864         /*
1865          * Note that after this point, we have committed to using
1866          * this request as it is being used to both track the
1867          * state of engine initialisation and liveness of the
1868          * golden renderstate above. Think twice before you try
1869          * to cancel/unwind this request now.
1870          */
1871
1872         /* Unconditionally invalidate GPU caches and TLBs. */
1873         if (i915_vm_is_4lvl(request->hw_context->vm))
1874                 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1875         else
1876                 ret = emit_pdps(request);
1877         if (ret)
1878                 return ret;
1879
1880         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
1881         return 0;
1882 }
1883
1884 /*
1885  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1886  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1887  * but there is a slight complication as this is applied in WA batch where the
1888  * values are only initialized once so we cannot take register value at the
1889  * beginning and reuse it further; hence we save its value to memory, upload a
1890  * constant value with bit21 set and then we restore it back with the saved value.
1891  * To simplify the WA, a constant value is formed by using the default value
1892  * of this register. This shouldn't be a problem because we are only modifying
1893  * it for a short period and this batch in non-premptible. We can ofcourse
1894  * use additional instructions that read the actual value of the register
1895  * at that time and set our bit of interest but it makes the WA complicated.
1896  *
1897  * This WA is also required for Gen9 so extracting as a function avoids
1898  * code duplication.
1899  */
1900 static u32 *
1901 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1902 {
1903         /* NB no one else is allowed to scribble over scratch + 256! */
1904         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1905         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1906         *batch++ = intel_gt_scratch_offset(engine->gt,
1907                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1908         *batch++ = 0;
1909
1910         *batch++ = MI_LOAD_REGISTER_IMM(1);
1911         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1912         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1913
1914         batch = gen8_emit_pipe_control(batch,
1915                                        PIPE_CONTROL_CS_STALL |
1916                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1917                                        0);
1918
1919         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1920         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1921         *batch++ = intel_gt_scratch_offset(engine->gt,
1922                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1923         *batch++ = 0;
1924
1925         return batch;
1926 }
1927
1928 static u32 slm_offset(struct intel_engine_cs *engine)
1929 {
1930         return intel_gt_scratch_offset(engine->gt,
1931                                        INTEL_GT_SCRATCH_FIELD_CLEAR_SLM_WA);
1932 }
1933
1934 /*
1935  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1936  * initialized at the beginning and shared across all contexts but this field
1937  * helps us to have multiple batches at different offsets and select them based
1938  * on a criteria. At the moment this batch always start at the beginning of the page
1939  * and at this point we don't have multiple wa_ctx batch buffers.
1940  *
1941  * The number of WA applied are not known at the beginning; we use this field
1942  * to return the no of DWORDS written.
1943  *
1944  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1945  * so it adds NOOPs as padding to make it cacheline aligned.
1946  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1947  * makes a complete batch buffer.
1948  */
1949 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1950 {
1951         /* WaDisableCtxRestoreArbitration:bdw,chv */
1952         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1953
1954         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1955         if (IS_BROADWELL(engine->i915))
1956                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1957
1958         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1959         /* Actual scratch location is at 128 bytes offset */
1960         batch = gen8_emit_pipe_control(batch,
1961                                        PIPE_CONTROL_FLUSH_L3 |
1962                                        PIPE_CONTROL_GLOBAL_GTT_IVB |
1963                                        PIPE_CONTROL_CS_STALL |
1964                                        PIPE_CONTROL_QW_WRITE,
1965                                        slm_offset(engine));
1966
1967         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1968
1969         /* Pad to end of cacheline */
1970         while ((unsigned long)batch % CACHELINE_BYTES)
1971                 *batch++ = MI_NOOP;
1972
1973         /*
1974          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1975          * execution depends on the length specified in terms of cache lines
1976          * in the register CTX_RCS_INDIRECT_CTX
1977          */
1978
1979         return batch;
1980 }
1981
1982 struct lri {
1983         i915_reg_t reg;
1984         u32 value;
1985 };
1986
1987 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1988 {
1989         GEM_BUG_ON(!count || count > 63);
1990
1991         *batch++ = MI_LOAD_REGISTER_IMM(count);
1992         do {
1993                 *batch++ = i915_mmio_reg_offset(lri->reg);
1994                 *batch++ = lri->value;
1995         } while (lri++, --count);
1996         *batch++ = MI_NOOP;
1997
1998         return batch;
1999 }
2000
2001 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2002 {
2003         static const struct lri lri[] = {
2004                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2005                 {
2006                         COMMON_SLICE_CHICKEN2,
2007                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2008                                        0),
2009                 },
2010
2011                 /* BSpec: 11391 */
2012                 {
2013                         FF_SLICE_CHICKEN,
2014                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2015                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2016                 },
2017
2018                 /* BSpec: 11299 */
2019                 {
2020                         _3D_CHICKEN3,
2021                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2022                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2023                 }
2024         };
2025
2026         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2027
2028         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2029         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2030
2031         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2032
2033         /* WaMediaPoolStateCmdInWABB:bxt,glk */
2034         if (HAS_POOLED_EU(engine->i915)) {
2035                 /*
2036                  * EU pool configuration is setup along with golden context
2037                  * during context initialization. This value depends on
2038                  * device type (2x6 or 3x6) and needs to be updated based
2039                  * on which subslice is disabled especially for 2x6
2040                  * devices, however it is safe to load default
2041                  * configuration of 3x6 device instead of masking off
2042                  * corresponding bits because HW ignores bits of a disabled
2043                  * subslice and drops down to appropriate config. Please
2044                  * see render_state_setup() in i915_gem_render_state.c for
2045                  * possible configurations, to avoid duplication they are
2046                  * not shown here again.
2047                  */
2048                 *batch++ = GEN9_MEDIA_POOL_STATE;
2049                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
2050                 *batch++ = 0x00777000;
2051                 *batch++ = 0;
2052                 *batch++ = 0;
2053                 *batch++ = 0;
2054         }
2055
2056         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2057
2058         /* Pad to end of cacheline */
2059         while ((unsigned long)batch % CACHELINE_BYTES)
2060                 *batch++ = MI_NOOP;
2061
2062         return batch;
2063 }
2064
2065 static u32 *
2066 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2067 {
2068         int i;
2069
2070         /*
2071          * WaPipeControlBefore3DStateSamplePattern: cnl
2072          *
2073          * Ensure the engine is idle prior to programming a
2074          * 3DSTATE_SAMPLE_PATTERN during a context restore.
2075          */
2076         batch = gen8_emit_pipe_control(batch,
2077                                        PIPE_CONTROL_CS_STALL,
2078                                        0);
2079         /*
2080          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2081          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2082          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2083          * confusing. Since gen8_emit_pipe_control() already advances the
2084          * batch by 6 dwords, we advance the other 10 here, completing a
2085          * cacheline. It's not clear if the workaround requires this padding
2086          * before other commands, or if it's just the regular padding we would
2087          * already have for the workaround bb, so leave it here for now.
2088          */
2089         for (i = 0; i < 10; i++)
2090                 *batch++ = MI_NOOP;
2091
2092         /* Pad to end of cacheline */
2093         while ((unsigned long)batch % CACHELINE_BYTES)
2094                 *batch++ = MI_NOOP;
2095
2096         return batch;
2097 }
2098
2099 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2100
2101 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2102 {
2103         struct drm_i915_gem_object *obj;
2104         struct i915_vma *vma;
2105         int err;
2106
2107         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2108         if (IS_ERR(obj))
2109                 return PTR_ERR(obj);
2110
2111         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2112         if (IS_ERR(vma)) {
2113                 err = PTR_ERR(vma);
2114                 goto err;
2115         }
2116
2117         err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2118         if (err)
2119                 goto err;
2120
2121         engine->wa_ctx.vma = vma;
2122         return 0;
2123
2124 err:
2125         i915_gem_object_put(obj);
2126         return err;
2127 }
2128
2129 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2130 {
2131         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2132 }
2133
2134 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2135
2136 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2137 {
2138         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2139         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2140                                             &wa_ctx->per_ctx };
2141         wa_bb_func_t wa_bb_fn[2];
2142         struct page *page;
2143         void *batch, *batch_ptr;
2144         unsigned int i;
2145         int ret;
2146
2147         if (engine->class != RENDER_CLASS)
2148                 return 0;
2149
2150         switch (INTEL_GEN(engine->i915)) {
2151         case 11:
2152                 return 0;
2153         case 10:
2154                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
2155                 wa_bb_fn[1] = NULL;
2156                 break;
2157         case 9:
2158                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
2159                 wa_bb_fn[1] = NULL;
2160                 break;
2161         case 8:
2162                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
2163                 wa_bb_fn[1] = NULL;
2164                 break;
2165         default:
2166                 MISSING_CASE(INTEL_GEN(engine->i915));
2167                 return 0;
2168         }
2169
2170         ret = lrc_setup_wa_ctx(engine);
2171         if (ret) {
2172                 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2173                 return ret;
2174         }
2175
2176         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2177         batch = batch_ptr = kmap_atomic(page);
2178
2179         /*
2180          * Emit the two workaround batch buffers, recording the offset from the
2181          * start of the workaround batch buffer object for each and their
2182          * respective sizes.
2183          */
2184         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2185                 wa_bb[i]->offset = batch_ptr - batch;
2186                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2187                                                   CACHELINE_BYTES))) {
2188                         ret = -EINVAL;
2189                         break;
2190                 }
2191                 if (wa_bb_fn[i])
2192                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2193                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2194         }
2195
2196         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2197
2198         kunmap_atomic(batch);
2199         if (ret)
2200                 lrc_destroy_wa_ctx(engine);
2201
2202         return ret;
2203 }
2204
2205 static void enable_execlists(struct intel_engine_cs *engine)
2206 {
2207         u32 mode;
2208
2209         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2210
2211         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2212
2213         if (INTEL_GEN(engine->i915) >= 11)
2214                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2215         else
2216                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2217         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2218
2219         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2220
2221         ENGINE_WRITE_FW(engine,
2222                         RING_HWS_PGA,
2223                         i915_ggtt_offset(engine->status_page.vma));
2224         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2225 }
2226
2227 static bool unexpected_starting_state(struct intel_engine_cs *engine)
2228 {
2229         bool unexpected = false;
2230
2231         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2232                 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2233                 unexpected = true;
2234         }
2235
2236         return unexpected;
2237 }
2238
2239 static int execlists_resume(struct intel_engine_cs *engine)
2240 {
2241         intel_engine_apply_workarounds(engine);
2242         intel_engine_apply_whitelist(engine);
2243
2244         intel_mocs_init_engine(engine);
2245
2246         intel_engine_reset_breadcrumbs(engine);
2247
2248         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2249                 struct drm_printer p = drm_debug_printer(__func__);
2250
2251                 intel_engine_dump(engine, &p, NULL);
2252         }
2253
2254         enable_execlists(engine);
2255
2256         return 0;
2257 }
2258
2259 static void execlists_reset_prepare(struct intel_engine_cs *engine)
2260 {
2261         struct intel_engine_execlists * const execlists = &engine->execlists;
2262         unsigned long flags;
2263
2264         GEM_TRACE("%s: depth<-%d\n", engine->name,
2265                   atomic_read(&execlists->tasklet.count));
2266
2267         /*
2268          * Prevent request submission to the hardware until we have
2269          * completed the reset in i915_gem_reset_finish(). If a request
2270          * is completed by one engine, it may then queue a request
2271          * to a second via its execlists->tasklet *just* as we are
2272          * calling engine->resume() and also writing the ELSP.
2273          * Turning off the execlists->tasklet until the reset is over
2274          * prevents the race.
2275          */
2276         __tasklet_disable_sync_once(&execlists->tasklet);
2277         GEM_BUG_ON(!reset_in_progress(execlists));
2278
2279         /* And flush any current direct submission. */
2280         spin_lock_irqsave(&engine->active.lock, flags);
2281         spin_unlock_irqrestore(&engine->active.lock, flags);
2282
2283         /*
2284          * We stop engines, otherwise we might get failed reset and a
2285          * dead gpu (on elk). Also as modern gpu as kbl can suffer
2286          * from system hang if batchbuffer is progressing when
2287          * the reset is issued, regardless of READY_TO_RESET ack.
2288          * Thus assume it is best to stop engines on all gens
2289          * where we have a gpu reset.
2290          *
2291          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2292          *
2293          * FIXME: Wa for more modern gens needs to be validated
2294          */
2295         intel_engine_stop_cs(engine);
2296 }
2297
2298 static void reset_csb_pointers(struct intel_engine_cs *engine)
2299 {
2300         struct intel_engine_execlists * const execlists = &engine->execlists;
2301         const unsigned int reset_value = execlists->csb_size - 1;
2302
2303         ring_set_paused(engine, 0);
2304
2305         /*
2306          * After a reset, the HW starts writing into CSB entry [0]. We
2307          * therefore have to set our HEAD pointer back one entry so that
2308          * the *first* entry we check is entry 0. To complicate this further,
2309          * as we don't wait for the first interrupt after reset, we have to
2310          * fake the HW write to point back to the last entry so that our
2311          * inline comparison of our cached head position against the last HW
2312          * write works even before the first interrupt.
2313          */
2314         execlists->csb_head = reset_value;
2315         WRITE_ONCE(*execlists->csb_write, reset_value);
2316         wmb(); /* Make sure this is visible to HW (paranoia?) */
2317
2318         invalidate_csb_entries(&execlists->csb_status[0],
2319                                &execlists->csb_status[reset_value]);
2320 }
2321
2322 static struct i915_request *active_request(struct i915_request *rq)
2323 {
2324         const struct list_head * const list = &rq->timeline->requests;
2325         const struct intel_context * const ce = rq->hw_context;
2326         struct i915_request *active = NULL;
2327
2328         list_for_each_entry_from_reverse(rq, list, link) {
2329                 if (i915_request_completed(rq))
2330                         break;
2331
2332                 if (rq->hw_context != ce)
2333                         break;
2334
2335                 active = rq;
2336         }
2337
2338         return active;
2339 }
2340
2341 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2342 {
2343         struct intel_engine_execlists * const execlists = &engine->execlists;
2344         struct intel_context *ce;
2345         struct i915_request *rq;
2346         u32 *regs;
2347
2348         process_csb(engine); /* drain preemption events */
2349
2350         /* Following the reset, we need to reload the CSB read/write pointers */
2351         reset_csb_pointers(engine);
2352
2353         /*
2354          * Save the currently executing context, even if we completed
2355          * its request, it was still running at the time of the
2356          * reset and will have been clobbered.
2357          */
2358         rq = execlists_active(execlists);
2359         if (!rq)
2360                 goto unwind;
2361
2362         ce = rq->hw_context;
2363         GEM_BUG_ON(i915_active_is_idle(&ce->active));
2364         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2365         rq = active_request(rq);
2366         if (!rq) {
2367                 ce->ring->head = ce->ring->tail;
2368                 goto out_replay;
2369         }
2370
2371         ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
2372
2373         /*
2374          * If this request hasn't started yet, e.g. it is waiting on a
2375          * semaphore, we need to avoid skipping the request or else we
2376          * break the signaling chain. However, if the context is corrupt
2377          * the request will not restart and we will be stuck with a wedged
2378          * device. It is quite often the case that if we issue a reset
2379          * while the GPU is loading the context image, that the context
2380          * image becomes corrupt.
2381          *
2382          * Otherwise, if we have not started yet, the request should replay
2383          * perfectly and we do not need to flag the result as being erroneous.
2384          */
2385         if (!i915_request_started(rq))
2386                 goto out_replay;
2387
2388         /*
2389          * If the request was innocent, we leave the request in the ELSP
2390          * and will try to replay it on restarting. The context image may
2391          * have been corrupted by the reset, in which case we may have
2392          * to service a new GPU hang, but more likely we can continue on
2393          * without impact.
2394          *
2395          * If the request was guilty, we presume the context is corrupt
2396          * and have to at least restore the RING register in the context
2397          * image back to the expected values to skip over the guilty request.
2398          */
2399         __i915_request_reset(rq, stalled);
2400         if (!stalled)
2401                 goto out_replay;
2402
2403         /*
2404          * We want a simple context + ring to execute the breadcrumb update.
2405          * We cannot rely on the context being intact across the GPU hang,
2406          * so clear it and rebuild just what we need for the breadcrumb.
2407          * All pending requests for this context will be zapped, and any
2408          * future request will be after userspace has had the opportunity
2409          * to recreate its own state.
2410          */
2411         regs = ce->lrc_reg_state;
2412         if (engine->pinned_default_state) {
2413                 memcpy(regs, /* skip restoring the vanilla PPHWSP */
2414                        engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
2415                        engine->context_size - PAGE_SIZE);
2416         }
2417         execlists_init_reg_state(regs, ce, engine, ce->ring);
2418
2419 out_replay:
2420         GEM_TRACE("%s replay {head:%04x, tail:%04x\n",
2421                   engine->name, ce->ring->head, ce->ring->tail);
2422         intel_ring_update_space(ce->ring);
2423         __execlists_update_reg_state(ce, engine);
2424
2425 unwind:
2426         /* Push back any incomplete requests for replay after the reset. */
2427         cancel_port_requests(execlists);
2428         __unwind_incomplete_requests(engine);
2429 }
2430
2431 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
2432 {
2433         unsigned long flags;
2434
2435         GEM_TRACE("%s\n", engine->name);
2436
2437         spin_lock_irqsave(&engine->active.lock, flags);
2438
2439         __execlists_reset(engine, stalled);
2440
2441         spin_unlock_irqrestore(&engine->active.lock, flags);
2442 }
2443
/*
 * Tasklet callback that intentionally does nothing. It is installed by
 * execlists_cancel_requests() in place of the real submission tasklet;
 * @data is ignored.
 */
static void nop_submission_tasklet(unsigned long data)
{
        /* The driver is wedged; don't process any more events. */
}
2448
2449 static void execlists_cancel_requests(struct intel_engine_cs *engine)
2450 {
2451         struct intel_engine_execlists * const execlists = &engine->execlists;
2452         struct i915_request *rq, *rn;
2453         struct rb_node *rb;
2454         unsigned long flags;
2455
2456         GEM_TRACE("%s\n", engine->name);
2457
2458         /*
2459          * Before we call engine->cancel_requests(), we should have exclusive
2460          * access to the submission state. This is arranged for us by the
2461          * caller disabling the interrupt generation, the tasklet and other
2462          * threads that may then access the same state, giving us a free hand
2463          * to reset state. However, we still need to let lockdep be aware that
2464          * we know this state may be accessed in hardirq context, so we
2465          * disable the irq around this manipulation and we want to keep
2466          * the spinlock focused on its duties and not accidentally conflate
2467          * coverage to the submission's irq state. (Similarly, although we
2468          * shouldn't need to disable irq around the manipulation of the
2469          * submission's irq state, we also wish to remind ourselves that
2470          * it is irq state.)
2471          */
2472         spin_lock_irqsave(&engine->active.lock, flags);
2473
2474         __execlists_reset(engine, true);
2475
2476         /* Mark all executing requests as skipped. */
2477         list_for_each_entry(rq, &engine->active.requests, sched.link) {
2478                 if (!i915_request_signaled(rq))
2479                         dma_fence_set_error(&rq->fence, -EIO);
2480
2481                 i915_request_mark_complete(rq);
2482         }
2483
2484         /* Flush the queued requests to the timeline list (for retiring). */
2485         while ((rb = rb_first_cached(&execlists->queue))) {
2486                 struct i915_priolist *p = to_priolist(rb);
2487                 int i;
2488
2489                 priolist_for_each_request_consume(rq, rn, p, i) {
2490                         list_del_init(&rq->sched.link);
2491                         __i915_request_submit(rq);
2492                         dma_fence_set_error(&rq->fence, -EIO);
2493                         i915_request_mark_complete(rq);
2494                 }
2495
2496                 rb_erase_cached(&p->node, &execlists->queue);
2497                 i915_priolist_free(p);
2498         }
2499
2500         /* Cancel all attached virtual engines */
2501         while ((rb = rb_first_cached(&execlists->virtual))) {
2502                 struct virtual_engine *ve =
2503                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2504
2505                 rb_erase_cached(rb, &execlists->virtual);
2506                 RB_CLEAR_NODE(rb);
2507
2508                 spin_lock(&ve->base.active.lock);
2509                 if (ve->request) {
2510                         ve->request->engine = engine;
2511                         __i915_request_submit(ve->request);
2512                         dma_fence_set_error(&ve->request->fence, -EIO);
2513                         i915_request_mark_complete(ve->request);
2514                         ve->base.execlists.queue_priority_hint = INT_MIN;
2515                         ve->request = NULL;
2516                 }
2517                 spin_unlock(&ve->base.active.lock);
2518         }
2519
2520         /* Remaining _unready_ requests will be nop'ed when submitted */
2521
2522         execlists->queue_priority_hint = INT_MIN;
2523         execlists->queue = RB_ROOT_CACHED;
2524
2525         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
2526         execlists->tasklet.func = nop_submission_tasklet;
2527
2528         spin_unlock_irqrestore(&engine->active.lock, flags);
2529 }
2530
2531 static void execlists_reset_finish(struct intel_engine_cs *engine)
2532 {
2533         struct intel_engine_execlists * const execlists = &engine->execlists;
2534
2535         /*
2536          * After a GPU reset, we may have requests to replay. Do so now while
2537          * we still have the forcewake to be sure that the GPU is not allowed
2538          * to sleep before we restart and reload a context.
2539          */
2540         GEM_BUG_ON(!reset_in_progress(execlists));
2541         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
2542                 execlists->tasklet.func(execlists->tasklet.data);
2543
2544         if (__tasklet_enable(&execlists->tasklet))
2545                 /* And kick in case we missed a new request submission. */
2546                 tasklet_hi_schedule(&execlists->tasklet);
2547         GEM_TRACE("%s: depth->%d\n", engine->name,
2548                   atomic_read(&execlists->tasklet.count));
2549 }
2550
2551 static int gen8_emit_bb_start(struct i915_request *rq,
2552                               u64 offset, u32 len,
2553                               const unsigned int flags)
2554 {
2555         u32 *cs;
2556
2557         cs = intel_ring_begin(rq, 4);
2558         if (IS_ERR(cs))
2559                 return PTR_ERR(cs);
2560
2561         /*
2562          * WaDisableCtxRestoreArbitration:bdw,chv
2563          *
2564          * We don't need to perform MI_ARB_ENABLE as often as we do (in
2565          * particular all the gen that do not need the w/a at all!), if we
2566          * took care to make sure that on every switch into this context
2567          * (both ordinary and for preemption) that arbitrartion was enabled
2568          * we would be fine.  However, for gen8 there is another w/a that
2569          * requires us to not preempt inside GPGPU execution, so we keep
2570          * arbitration disabled for gen8 batches. Arbitration will be
2571          * re-enabled before we close the request
2572          * (engine->emit_fini_breadcrumb).
2573          */
2574         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2575
2576         /* FIXME(BDW+): Address space and security selectors. */
2577         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2578                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2579         *cs++ = lower_32_bits(offset);
2580         *cs++ = upper_32_bits(offset);
2581
2582         intel_ring_advance(rq, cs);
2583
2584         return 0;
2585 }
2586
2587 static int gen9_emit_bb_start(struct i915_request *rq,
2588                               u64 offset, u32 len,
2589                               const unsigned int flags)
2590 {
2591         u32 *cs;
2592
2593         cs = intel_ring_begin(rq, 6);
2594         if (IS_ERR(cs))
2595                 return PTR_ERR(cs);
2596
2597         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2598
2599         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2600                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2601         *cs++ = lower_32_bits(offset);
2602         *cs++ = upper_32_bits(offset);
2603
2604         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2605         *cs++ = MI_NOOP;
2606
2607         intel_ring_advance(rq, cs);
2608
2609         return 0;
2610 }
2611
2612 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
2613 {
2614         ENGINE_WRITE(engine, RING_IMR,
2615                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
2616         ENGINE_POSTING_READ(engine, RING_IMR);
2617 }
2618
2619 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
2620 {
2621         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
2622 }
2623
2624 static int gen8_emit_flush(struct i915_request *request, u32 mode)
2625 {
2626         u32 cmd, *cs;
2627
2628         cs = intel_ring_begin(request, 4);
2629         if (IS_ERR(cs))
2630                 return PTR_ERR(cs);
2631
2632         cmd = MI_FLUSH_DW + 1;
2633
2634         /* We always require a command barrier so that subsequent
2635          * commands, such as breadcrumb interrupts, are strictly ordered
2636          * wrt the contents of the write cache being flushed to memory
2637          * (and thus being coherent from the CPU).
2638          */
2639         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2640
2641         if (mode & EMIT_INVALIDATE) {
2642                 cmd |= MI_INVALIDATE_TLB;
2643                 if (request->engine->class == VIDEO_DECODE_CLASS)
2644                         cmd |= MI_INVALIDATE_BSD;
2645         }
2646
2647         *cs++ = cmd;
2648         *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2649         *cs++ = 0; /* upper addr */
2650         *cs++ = 0; /* value */
2651         intel_ring_advance(request, cs);
2652
2653         return 0;
2654 }
2655
2656 static int gen8_emit_flush_render(struct i915_request *request,
2657                                   u32 mode)
2658 {
2659         struct intel_engine_cs *engine = request->engine;
2660         u32 scratch_addr =
2661                 intel_gt_scratch_offset(engine->gt,
2662                                         INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
2663         bool vf_flush_wa = false, dc_flush_wa = false;
2664         u32 *cs, flags = 0;
2665         int len;
2666
2667         flags |= PIPE_CONTROL_CS_STALL;
2668
2669         if (mode & EMIT_FLUSH) {
2670                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2671                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2672                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2673                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2674         }
2675
2676         if (mode & EMIT_INVALIDATE) {
2677                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2678                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2679                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2680                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2681                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2682                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2683                 flags |= PIPE_CONTROL_QW_WRITE;
2684                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2685
2686                 /*
2687                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
2688                  * pipe control.
2689                  */
2690                 if (IS_GEN(request->i915, 9))
2691                         vf_flush_wa = true;
2692
2693                 /* WaForGAMHang:kbl */
2694                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
2695                         dc_flush_wa = true;
2696         }
2697
2698         len = 6;
2699
2700         if (vf_flush_wa)
2701                 len += 6;
2702
2703         if (dc_flush_wa)
2704                 len += 12;
2705
2706         cs = intel_ring_begin(request, len);
2707         if (IS_ERR(cs))
2708                 return PTR_ERR(cs);
2709
2710         if (vf_flush_wa)
2711                 cs = gen8_emit_pipe_control(cs, 0, 0);
2712
2713         if (dc_flush_wa)
2714                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
2715                                             0);
2716
2717         cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2718
2719         if (dc_flush_wa)
2720                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
2721
2722         intel_ring_advance(request, cs);
2723
2724         return 0;
2725 }
2726
2727 static int gen11_emit_flush_render(struct i915_request *request,
2728                                    u32 mode)
2729 {
2730         struct intel_engine_cs *engine = request->engine;
2731         const u32 scratch_addr =
2732                 intel_gt_scratch_offset(engine->gt,
2733                                         INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
2734
2735         if (mode & EMIT_FLUSH) {
2736                 u32 *cs;
2737                 u32 flags = 0;
2738
2739                 flags |= PIPE_CONTROL_CS_STALL;
2740
2741                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
2742                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2743                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2744                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2745                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2746                 flags |= PIPE_CONTROL_QW_WRITE;
2747                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2748
2749                 cs = intel_ring_begin(request, 6);
2750                 if (IS_ERR(cs))
2751                         return PTR_ERR(cs);
2752
2753                 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2754                 intel_ring_advance(request, cs);
2755         }
2756
2757         if (mode & EMIT_INVALIDATE) {
2758                 u32 *cs;
2759                 u32 flags = 0;
2760
2761                 flags |= PIPE_CONTROL_CS_STALL;
2762
2763                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
2764                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2765                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2766                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2767                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2768                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2769                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2770                 flags |= PIPE_CONTROL_QW_WRITE;
2771                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2772
2773                 cs = intel_ring_begin(request, 6);
2774                 if (IS_ERR(cs))
2775                         return PTR_ERR(cs);
2776
2777                 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2778                 intel_ring_advance(request, cs);
2779         }
2780
2781         return 0;
2782 }
2783
2784 /*
2785  * Reserve space for 2 NOOPs at the end of each request to be
2786  * used as a workaround for not being allowed to do lite
2787  * restore with HEAD==TAIL (WaIdleLiteRestore).
2788  */
2789 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
2790 {
2791         /* Ensure there's always at least one preemption point per-request. */
2792         *cs++ = MI_ARB_CHECK;
2793         *cs++ = MI_NOOP;
2794         request->wa_tail = intel_ring_offset(request, cs);
2795
2796         return cs;
2797 }
2798
2799 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
2800 {
2801         *cs++ = MI_SEMAPHORE_WAIT |
2802                 MI_SEMAPHORE_GLOBAL_GTT |
2803                 MI_SEMAPHORE_POLL |
2804                 MI_SEMAPHORE_SAD_EQ_SDD;
2805         *cs++ = 0;
2806         *cs++ = intel_hws_preempt_address(request->engine);
2807         *cs++ = 0;
2808
2809         return cs;
2810 }
2811
2812 static __always_inline u32*
2813 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
2814                                  u32 *cs)
2815 {
2816         *cs++ = MI_USER_INTERRUPT;
2817
2818         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2819         if (intel_engine_has_semaphores(request->engine))
2820                 cs = emit_preempt_busywait(request, cs);
2821
2822         request->tail = intel_ring_offset(request, cs);
2823         assert_ring_tail_valid(request->ring, request->tail);
2824
2825         return gen8_emit_wa_tail(request, cs);
2826 }
2827
2828 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
2829 {
2830         cs = gen8_emit_ggtt_write(cs,
2831                                   request->fence.seqno,
2832                                   request->timeline->hwsp_offset,
2833                                   0);
2834
2835         return gen8_emit_fini_breadcrumb_footer(request, cs);
2836 }
2837
2838 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
2839 {
2840         cs = gen8_emit_ggtt_write_rcs(cs,
2841                                       request->fence.seqno,
2842                                       request->timeline->hwsp_offset,
2843                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2844                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2845                                       PIPE_CONTROL_DC_FLUSH_ENABLE);
2846
2847         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
2848         cs = gen8_emit_pipe_control(cs,
2849                                     PIPE_CONTROL_FLUSH_ENABLE |
2850                                     PIPE_CONTROL_CS_STALL,
2851                                     0);
2852
2853         return gen8_emit_fini_breadcrumb_footer(request, cs);
2854 }
2855
2856 static u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *request,
2857                                            u32 *cs)
2858 {
2859         cs = gen8_emit_ggtt_write_rcs(cs,
2860                                       request->fence.seqno,
2861                                       request->timeline->hwsp_offset,
2862                                       PIPE_CONTROL_CS_STALL |
2863                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
2864                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2865                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2866                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
2867                                       PIPE_CONTROL_FLUSH_ENABLE);
2868
2869         return gen8_emit_fini_breadcrumb_footer(request, cs);
2870 }
2871
2872 static void execlists_park(struct intel_engine_cs *engine)
2873 {
2874         del_timer(&engine->execlists.timer);
2875 }
2876
2877 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
2878 {
2879         engine->submit_request = execlists_submit_request;
2880         engine->cancel_requests = execlists_cancel_requests;
2881         engine->schedule = i915_schedule;
2882         engine->execlists.tasklet.func = execlists_submission_tasklet;
2883
2884         engine->reset.prepare = execlists_reset_prepare;
2885         engine->reset.reset = execlists_reset;
2886         engine->reset.finish = execlists_reset_finish;
2887
2888         engine->park = execlists_park;
2889         engine->unpark = NULL;
2890
2891         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
2892         if (!intel_vgpu_active(engine->i915)) {
2893                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
2894                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
2895                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
2896         }
2897 }
2898
/*
 * engine->destroy() hook: run the common engine cleanup, destroy the
 * workaround context and free the engine itself. @engine must not be used
 * after this returns.
 */
static void execlists_destroy(struct intel_engine_cs *engine)
{
        intel_engine_cleanup_common(engine);

        lrc_destroy_wa_ctx(engine);

        kfree(engine);
}
2905
2906 static void
2907 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
2908 {
2909         /* Default vfuncs which can be overriden by each engine. */
2910
2911         engine->destroy = execlists_destroy;
2912         engine->resume = execlists_resume;
2913
2914         engine->reset.prepare = execlists_reset_prepare;
2915         engine->reset.reset = execlists_reset;
2916         engine->reset.finish = execlists_reset_finish;
2917
2918         engine->cops = &execlists_context_ops;
2919         engine->request_alloc = execlists_request_alloc;
2920
2921         engine->emit_flush = gen8_emit_flush;
2922         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
2923         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
2924
2925         engine->set_default_submission = intel_execlists_set_default_submission;
2926
2927         if (INTEL_GEN(engine->i915) < 11) {
2928                 engine->irq_enable = gen8_logical_ring_enable_irq;
2929                 engine->irq_disable = gen8_logical_ring_disable_irq;
2930         } else {
2931                 /*
2932                  * TODO: On Gen11 interrupt masks need to be clear
2933                  * to allow C6 entry. Keep interrupts enabled at
2934                  * and take the hit of generating extra interrupts
2935                  * until a more refined solution exists.
2936                  */
2937         }
2938         if (IS_GEN(engine->i915, 8))
2939                 engine->emit_bb_start = gen8_emit_bb_start;
2940         else
2941                 engine->emit_bb_start = gen9_emit_bb_start;
2942 }
2943
2944 static inline void
2945 logical_ring_default_irqs(struct intel_engine_cs *engine)
2946 {
2947         unsigned int shift = 0;
2948
2949         if (INTEL_GEN(engine->i915) < 11) {
2950                 const u8 irq_shifts[] = {
2951                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
2952                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
2953                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
2954                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
2955                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
2956                 };
2957
2958                 shift = irq_shifts[engine->id];
2959         }
2960
2961         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
2962         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
2963 }
2964
2965 static void rcs_submission_override(struct intel_engine_cs *engine)
2966 {
2967         switch (INTEL_GEN(engine->i915)) {
2968         case 12:
2969         case 11:
2970                 engine->emit_flush = gen11_emit_flush_render;
2971                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
2972                 break;
2973         default:
2974                 engine->emit_flush = gen8_emit_flush_render;
2975                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
2976                 break;
2977         }
2978 }
2979
2980 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
2981 {
2982         tasklet_init(&engine->execlists.tasklet,
2983                      execlists_submission_tasklet, (unsigned long)engine);
2984         timer_setup(&engine->execlists.timer, execlists_submission_timer, 0);
2985
2986         logical_ring_default_vfuncs(engine);
2987         logical_ring_default_irqs(engine);
2988
2989         if (engine->class == RENDER_CLASS)
2990                 rcs_submission_override(engine);
2991
2992         return 0;
2993 }
2994
2995 int intel_execlists_submission_init(struct intel_engine_cs *engine)
2996 {
2997         struct intel_engine_execlists * const execlists = &engine->execlists;
2998         struct drm_i915_private *i915 = engine->i915;
2999         struct intel_uncore *uncore = engine->uncore;
3000         u32 base = engine->mmio_base;
3001         int ret;
3002
3003         ret = intel_engine_init_common(engine);
3004         if (ret)
3005                 return ret;
3006
3007         if (intel_init_workaround_bb(engine))
3008                 /*
3009                  * We continue even if we fail to initialize WA batch
3010                  * because we only expect rare glitches but nothing
3011                  * critical to prevent us from using GPU
3012                  */
3013                 DRM_ERROR("WA batch buffer initialization failed\n");
3014
3015         if (HAS_LOGICAL_RING_ELSQ(i915)) {
3016                 execlists->submit_reg = uncore->regs +
3017                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3018                 execlists->ctrl_reg = uncore->regs +
3019                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3020         } else {
3021                 execlists->submit_reg = uncore->regs +
3022                         i915_mmio_reg_offset(RING_ELSP(base));
3023         }
3024
3025         execlists->csb_status =
3026                 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3027
3028         execlists->csb_write =
3029                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
3030
3031         if (INTEL_GEN(i915) < 11)
3032                 execlists->csb_size = GEN8_CSB_ENTRIES;
3033         else
3034                 execlists->csb_size = GEN11_CSB_ENTRIES;
3035
3036         reset_csb_pointers(engine);
3037
3038         return 0;
3039 }
3040
3041 static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
3042 {
3043         u32 indirect_ctx_offset;
3044
3045         switch (INTEL_GEN(engine->i915)) {
3046         default:
3047                 MISSING_CASE(INTEL_GEN(engine->i915));
3048                 /* fall through */
3049         case 11:
3050                 indirect_ctx_offset =
3051                         GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3052                 break;
3053         case 10:
3054                 indirect_ctx_offset =
3055                         GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3056                 break;
3057         case 9:
3058                 indirect_ctx_offset =
3059                         GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3060                 break;
3061         case 8:
3062                 indirect_ctx_offset =
3063                         GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3064                 break;
3065         }
3066
3067         return indirect_ctx_offset;
3068 }
3069
3070 static void execlists_init_reg_state(u32 *regs,
3071                                      struct intel_context *ce,
3072                                      struct intel_engine_cs *engine,
3073                                      struct intel_ring *ring)
3074 {
3075         struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(ce->vm);
3076         bool rcs = engine->class == RENDER_CLASS;
3077         u32 base = engine->mmio_base;
3078
3079         /*
3080          * A context is actually a big batch buffer with several
3081          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3082          * values we are setting here are only for the first context restore:
3083          * on a subsequent save, the GPU will recreate this batchbuffer with new
3084          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3085          * we are not initializing here).
3086          *
3087          * Must keep consistent with virtual_update_register_offsets().
3088          */
3089         regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
3090                                  MI_LRI_FORCE_POSTED;
3091
3092         CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
3093                 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3094                 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
3095         if (INTEL_GEN(engine->i915) < 11) {
3096                 regs[CTX_CONTEXT_CONTROL + 1] |=
3097                         _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3098                                             CTX_CTRL_RS_CTX_ENABLE);
3099         }
3100         CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
3101         CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
3102         CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
3103         CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
3104                 RING_CTL_SIZE(ring->size) | RING_VALID);
3105         CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
3106         CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
3107         CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
3108         CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
3109         CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
3110         CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
3111         if (rcs) {
3112                 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3113
3114                 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
3115                 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
3116                         RING_INDIRECT_CTX_OFFSET(base), 0);
3117                 if (wa_ctx->indirect_ctx.size) {
3118                         u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3119
3120                         regs[CTX_RCS_INDIRECT_CTX + 1] =
3121                                 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
3122                                 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3123
3124                         regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
3125                                 intel_lr_indirect_ctx_offset(engine) << 6;
3126                 }
3127
3128                 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
3129                 if (wa_ctx->per_ctx.size) {
3130                         u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3131
3132                         regs[CTX_BB_PER_CTX_PTR + 1] =
3133                                 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3134                 }
3135         }
3136
3137         regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
3138
3139         CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
3140         /* PDP values well be assigned later if needed */
3141         CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(base, 3), 0);
3142         CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(base, 3), 0);
3143         CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(base, 2), 0);
3144         CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(base, 2), 0);
3145         CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(base, 1), 0);
3146         CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(base, 1), 0);
3147         CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(base, 0), 0);
3148         CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(base, 0), 0);
3149
3150         if (i915_vm_is_4lvl(&ppgtt->vm)) {
3151                 /* 64b PPGTT (48bit canonical)
3152                  * PDP0_DESCRIPTOR contains the base address to PML4 and
3153                  * other PDP Descriptors are ignored.
3154                  */
3155                 ASSIGN_CTX_PML4(ppgtt, regs);
3156         } else {
3157                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
3158                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
3159                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
3160                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
3161         }
3162
3163         if (rcs) {
3164                 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
3165                 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
3166         }
3167
3168         regs[CTX_END] = MI_BATCH_BUFFER_END;
3169         if (INTEL_GEN(engine->i915) >= 10)
3170                 regs[CTX_END] |= BIT(0);
3171 }
3172
3173 static int
3174 populate_lr_context(struct intel_context *ce,
3175                     struct drm_i915_gem_object *ctx_obj,
3176                     struct intel_engine_cs *engine,
3177                     struct intel_ring *ring)
3178 {
3179         void *vaddr;
3180         u32 *regs;
3181         int ret;
3182
3183         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
3184         if (IS_ERR(vaddr)) {
3185                 ret = PTR_ERR(vaddr);
3186                 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
3187                 return ret;
3188         }
3189
3190         set_redzone(vaddr, engine);
3191
3192         if (engine->default_state) {
3193                 /*
3194                  * We only want to copy over the template context state;
3195                  * skipping over the headers reserved for GuC communication,
3196                  * leaving those as zero.
3197                  */
3198                 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
3199                 void *defaults;
3200
3201                 defaults = i915_gem_object_pin_map(engine->default_state,
3202                                                    I915_MAP_WB);
3203                 if (IS_ERR(defaults)) {
3204                         ret = PTR_ERR(defaults);
3205                         goto err_unpin_ctx;
3206                 }
3207
3208                 memcpy(vaddr + start, defaults + start, engine->context_size);
3209                 i915_gem_object_unpin_map(engine->default_state);
3210         }
3211
3212         /* The second page of the context object contains some fields which must
3213          * be set up prior to the first execution. */
3214         regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
3215         execlists_init_reg_state(regs, ce, engine, ring);
3216         if (!engine->default_state)
3217                 regs[CTX_CONTEXT_CONTROL + 1] |=
3218                         _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
3219
3220         ret = 0;
3221 err_unpin_ctx:
3222         __i915_gem_object_flush_map(ctx_obj,
3223                                     LRC_HEADER_PAGES * PAGE_SIZE,
3224                                     engine->context_size);
3225         i915_gem_object_unpin_map(ctx_obj);
3226         return ret;
3227 }
3228
3229 static int __execlists_context_alloc(struct intel_context *ce,
3230                                      struct intel_engine_cs *engine)
3231 {
3232         struct drm_i915_gem_object *ctx_obj;
3233         struct intel_ring *ring;
3234         struct i915_vma *vma;
3235         u32 context_size;
3236         int ret;
3237
3238         GEM_BUG_ON(ce->state);
3239         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
3240
3241         /*
3242          * Before the actual start of the context image, we insert a few pages
3243          * for our own use and for sharing with the GuC.
3244          */
3245         context_size += LRC_HEADER_PAGES * PAGE_SIZE;
3246         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3247                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
3248
3249         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
3250         if (IS_ERR(ctx_obj))
3251                 return PTR_ERR(ctx_obj);
3252
3253         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
3254         if (IS_ERR(vma)) {
3255                 ret = PTR_ERR(vma);
3256                 goto error_deref_obj;
3257         }
3258
3259         if (!ce->timeline) {
3260                 struct intel_timeline *tl;
3261
3262                 tl = intel_timeline_create(engine->gt, NULL);
3263                 if (IS_ERR(tl)) {
3264                         ret = PTR_ERR(tl);
3265                         goto error_deref_obj;
3266                 }
3267
3268                 ce->timeline = tl;
3269         }
3270
3271         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
3272         if (IS_ERR(ring)) {
3273                 ret = PTR_ERR(ring);
3274                 goto error_deref_obj;
3275         }
3276
3277         ret = populate_lr_context(ce, ctx_obj, engine, ring);
3278         if (ret) {
3279                 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
3280                 goto error_ring_free;
3281         }
3282
3283         ce->ring = ring;
3284         ce->state = vma;
3285
3286         return 0;
3287
3288 error_ring_free:
3289         intel_ring_put(ring);
3290 error_deref_obj:
3291         i915_gem_object_put(ctx_obj);
3292         return ret;
3293 }
3294
3295 static struct list_head *virtual_queue(struct virtual_engine *ve)
3296 {
3297         return &ve->base.execlists.default_priolist.requests[0];
3298 }
3299
3300 static void virtual_context_destroy(struct kref *kref)
3301 {
3302         struct virtual_engine *ve =
3303                 container_of(kref, typeof(*ve), context.ref);
3304         unsigned int n;
3305
3306         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3307         GEM_BUG_ON(ve->request);
3308         GEM_BUG_ON(ve->context.inflight);
3309
3310         for (n = 0; n < ve->num_siblings; n++) {
3311                 struct intel_engine_cs *sibling = ve->siblings[n];
3312                 struct rb_node *node = &ve->nodes[sibling->id].rb;
3313
3314                 if (RB_EMPTY_NODE(node))
3315                         continue;
3316
3317                 spin_lock_irq(&sibling->active.lock);
3318
3319                 /* Detachment is lazily performed in the execlists tasklet */
3320                 if (!RB_EMPTY_NODE(node))
3321                         rb_erase_cached(node, &sibling->execlists.virtual);
3322
3323                 spin_unlock_irq(&sibling->active.lock);
3324         }
3325         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
3326
3327         if (ve->context.state)
3328                 __execlists_context_fini(&ve->context);
3329         intel_context_fini(&ve->context);
3330
3331         kfree(ve->bonds);
3332         kfree(ve);
3333 }
3334
3335 static void virtual_engine_initial_hint(struct virtual_engine *ve)
3336 {
3337         int swp;
3338
3339         /*
3340          * Pick a random sibling on starting to help spread the load around.
3341          *
3342          * New contexts are typically created with exactly the same order
3343          * of siblings, and often started in batches. Due to the way we iterate
3344          * the array of sibling when submitting requests, sibling[0] is
3345          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
3346          * randomised across the system, we also help spread the load by the
3347          * first engine we inspect being different each time.
3348          *
3349          * NB This does not force us to execute on this engine, it will just
3350          * typically be the first we inspect for submission.
3351          */
3352         swp = prandom_u32_max(ve->num_siblings);
3353         if (!swp)
3354                 return;
3355
3356         swap(ve->siblings[swp], ve->siblings[0]);
3357         virtual_update_register_offsets(ve->context.lrc_reg_state,
3358                                         ve->siblings[0]);
3359 }
3360
3361 static int virtual_context_pin(struct intel_context *ce)
3362 {
3363         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3364         int err;
3365
3366         /* Note: we must use a real engine class for setting up reg state */
3367         err = __execlists_context_pin(ce, ve->siblings[0]);
3368         if (err)
3369                 return err;
3370
3371         virtual_engine_initial_hint(ve);
3372         return 0;
3373 }
3374
3375 static void virtual_context_enter(struct intel_context *ce)
3376 {
3377         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3378         unsigned int n;
3379
3380         for (n = 0; n < ve->num_siblings; n++)
3381                 intel_engine_pm_get(ve->siblings[n]);
3382
3383         intel_timeline_enter(ce->timeline);
3384 }
3385
3386 static void virtual_context_exit(struct intel_context *ce)
3387 {
3388         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3389         unsigned int n;
3390
3391         intel_timeline_exit(ce->timeline);
3392
3393         for (n = 0; n < ve->num_siblings; n++)
3394                 intel_engine_pm_put(ve->siblings[n]);
3395 }
3396
3397 static const struct intel_context_ops virtual_context_ops = {
3398         .pin = virtual_context_pin,
3399         .unpin = execlists_context_unpin,
3400
3401         .enter = virtual_context_enter,
3402         .exit = virtual_context_exit,
3403
3404         .destroy = virtual_context_destroy,
3405 };
3406
3407 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
3408 {
3409         struct i915_request *rq;
3410         intel_engine_mask_t mask;
3411
3412         rq = READ_ONCE(ve->request);
3413         if (!rq)
3414                 return 0;
3415
3416         /* The rq is ready for submission; rq->execution_mask is now stable. */
3417         mask = rq->execution_mask;
3418         if (unlikely(!mask)) {
3419                 /* Invalid selection, submit to a random engine in error */
3420                 i915_request_skip(rq, -ENODEV);
3421                 mask = ve->siblings[0]->mask;
3422         }
3423
3424         GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
3425                   ve->base.name,
3426                   rq->fence.context, rq->fence.seqno,
3427                   mask, ve->base.execlists.queue_priority_hint);
3428
3429         return mask;
3430 }
3431
3432 static void virtual_submission_tasklet(unsigned long data)
3433 {
3434         struct virtual_engine * const ve = (struct virtual_engine *)data;
3435         const int prio = ve->base.execlists.queue_priority_hint;
3436         intel_engine_mask_t mask;
3437         unsigned int n;
3438
3439         rcu_read_lock();
3440         mask = virtual_submission_mask(ve);
3441         rcu_read_unlock();
3442         if (unlikely(!mask))
3443                 return;
3444
3445         local_irq_disable();
3446         for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
3447                 struct intel_engine_cs *sibling = ve->siblings[n];
3448                 struct ve_node * const node = &ve->nodes[sibling->id];
3449                 struct rb_node **parent, *rb;
3450                 bool first;
3451
3452                 if (unlikely(!(mask & sibling->mask))) {
3453                         if (!RB_EMPTY_NODE(&node->rb)) {
3454                                 spin_lock(&sibling->active.lock);
3455                                 rb_erase_cached(&node->rb,
3456                                                 &sibling->execlists.virtual);
3457                                 RB_CLEAR_NODE(&node->rb);
3458                                 spin_unlock(&sibling->active.lock);
3459                         }
3460                         continue;
3461                 }
3462
3463                 spin_lock(&sibling->active.lock);
3464
3465                 if (!RB_EMPTY_NODE(&node->rb)) {
3466                         /*
3467                          * Cheat and avoid rebalancing the tree if we can
3468                          * reuse this node in situ.
3469                          */
3470                         first = rb_first_cached(&sibling->execlists.virtual) ==
3471                                 &node->rb;
3472                         if (prio == node->prio || (prio > node->prio && first))
3473                                 goto submit_engine;
3474
3475                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
3476                 }
3477
3478                 rb = NULL;
3479                 first = true;
3480                 parent = &sibling->execlists.virtual.rb_root.rb_node;
3481                 while (*parent) {
3482                         struct ve_node *other;
3483
3484                         rb = *parent;
3485                         other = rb_entry(rb, typeof(*other), rb);
3486                         if (prio > other->prio) {
3487                                 parent = &rb->rb_left;
3488                         } else {
3489                                 parent = &rb->rb_right;
3490                                 first = false;
3491                         }
3492                 }
3493
3494                 rb_link_node(&node->rb, rb, parent);
3495                 rb_insert_color_cached(&node->rb,
3496                                        &sibling->execlists.virtual,
3497                                        first);
3498
3499 submit_engine:
3500                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
3501                 node->prio = prio;
3502                 if (first && prio > sibling->execlists.queue_priority_hint) {
3503                         sibling->execlists.queue_priority_hint = prio;
3504                         tasklet_hi_schedule(&sibling->execlists.tasklet);
3505                 }
3506
3507                 spin_unlock(&sibling->active.lock);
3508         }
3509         local_irq_enable();
3510 }
3511
3512 static void virtual_submit_request(struct i915_request *rq)
3513 {
3514         struct virtual_engine *ve = to_virtual_engine(rq->engine);
3515
3516         GEM_TRACE("%s: rq=%llx:%lld\n",
3517                   ve->base.name,
3518                   rq->fence.context,
3519                   rq->fence.seqno);
3520
3521         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
3522
3523         GEM_BUG_ON(ve->request);
3524         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3525
3526         ve->base.execlists.queue_priority_hint = rq_prio(rq);
3527         WRITE_ONCE(ve->request, rq);
3528
3529         list_move_tail(&rq->sched.link, virtual_queue(ve));
3530
3531         tasklet_schedule(&ve->base.execlists.tasklet);
3532 }
3533
3534 static struct ve_bond *
3535 virtual_find_bond(struct virtual_engine *ve,
3536                   const struct intel_engine_cs *master)
3537 {
3538         int i;
3539
3540         for (i = 0; i < ve->num_bonds; i++) {
3541                 if (ve->bonds[i].master == master)
3542                         return &ve->bonds[i];
3543         }
3544
3545         return NULL;
3546 }
3547
3548 static void
3549 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
3550 {
3551         struct virtual_engine *ve = to_virtual_engine(rq->engine);
3552         struct ve_bond *bond;
3553
3554         bond = virtual_find_bond(ve, to_request(signal)->engine);
3555         if (bond) {
3556                 intel_engine_mask_t old, new, cmp;
3557
3558                 cmp = READ_ONCE(rq->execution_mask);
3559                 do {
3560                         old = cmp;
3561                         new = cmp & bond->sibling_mask;
3562                 } while ((cmp = cmpxchg(&rq->execution_mask, old, new)) != old);
3563         }
3564 }
3565
3566 struct intel_context *
3567 intel_execlists_create_virtual(struct i915_gem_context *ctx,
3568                                struct intel_engine_cs **siblings,
3569                                unsigned int count)
3570 {
3571         struct virtual_engine *ve;
3572         unsigned int n;
3573         int err;
3574
3575         if (count == 0)
3576                 return ERR_PTR(-EINVAL);
3577
3578         if (count == 1)
3579                 return intel_context_create(ctx, siblings[0]);
3580
3581         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
3582         if (!ve)
3583                 return ERR_PTR(-ENOMEM);
3584
3585         ve->base.i915 = ctx->i915;
3586         ve->base.gt = siblings[0]->gt;
3587         ve->base.id = -1;
3588         ve->base.class = OTHER_CLASS;
3589         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
3590         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
3591
3592         /*
3593          * The decision on whether to submit a request using semaphores
3594          * depends on the saturated state of the engine. We only compute
3595          * this during HW submission of the request, and we need for this
3596          * state to be globally applied to all requests being submitted
3597          * to this engine. Virtual engines encompass more than one physical
3598          * engine and so we cannot accurately tell in advance if one of those
3599          * engines is already saturated and so cannot afford to use a semaphore
3600          * and be pessimized in priority for doing so -- if we are the only
3601          * context using semaphores after all other clients have stopped, we
3602          * will be starved on the saturated system. Such a global switch for
3603          * semaphores is less than ideal, but alas is the current compromise.
3604          */
3605         ve->base.saturated = ALL_ENGINES;
3606
3607         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
3608
3609         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
3610
3611         intel_engine_init_execlists(&ve->base);
3612
3613         ve->base.cops = &virtual_context_ops;
3614         ve->base.request_alloc = execlists_request_alloc;
3615
3616         ve->base.schedule = i915_schedule;
3617         ve->base.submit_request = virtual_submit_request;
3618         ve->base.bond_execute = virtual_bond_execute;
3619
3620         INIT_LIST_HEAD(virtual_queue(ve));
3621         ve->base.execlists.queue_priority_hint = INT_MIN;
3622         tasklet_init(&ve->base.execlists.tasklet,
3623                      virtual_submission_tasklet,
3624                      (unsigned long)ve);
3625
3626         intel_context_init(&ve->context, ctx, &ve->base);
3627
3628         for (n = 0; n < count; n++) {
3629                 struct intel_engine_cs *sibling = siblings[n];
3630
3631                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
3632                 if (sibling->mask & ve->base.mask) {
3633                         DRM_DEBUG("duplicate %s entry in load balancer\n",
3634                                   sibling->name);
3635                         err = -EINVAL;
3636                         goto err_put;
3637                 }
3638
3639                 /*
3640                  * The virtual engine implementation is tightly coupled to
3641                  * the execlists backend -- we push out request directly
3642                  * into a tree inside each physical engine. We could support
3643                  * layering if we handle cloning of the requests and
3644                  * submitting a copy into each backend.
3645                  */
3646                 if (sibling->execlists.tasklet.func !=
3647                     execlists_submission_tasklet) {
3648                         err = -ENODEV;
3649                         goto err_put;
3650                 }
3651
3652                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
3653                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
3654
3655                 ve->siblings[ve->num_siblings++] = sibling;
3656                 ve->base.mask |= sibling->mask;
3657
3658                 /*
3659                  * All physical engines must be compatible for their emission
3660                  * functions (as we build the instructions during request
3661                  * construction and do not alter them before submission
3662                  * on the physical engine). We use the engine class as a guide
3663                  * here, although that could be refined.
3664                  */
3665                 if (ve->base.class != OTHER_CLASS) {
3666                         if (ve->base.class != sibling->class) {
3667                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
3668                                           sibling->class, ve->base.class);
3669                                 err = -EINVAL;
3670                                 goto err_put;
3671                         }
3672                         continue;
3673                 }
3674
3675                 ve->base.class = sibling->class;
3676                 ve->base.uabi_class = sibling->uabi_class;
3677                 snprintf(ve->base.name, sizeof(ve->base.name),
3678                          "v%dx%d", ve->base.class, count);
3679                 ve->base.context_size = sibling->context_size;
3680
3681                 ve->base.emit_bb_start = sibling->emit_bb_start;
3682                 ve->base.emit_flush = sibling->emit_flush;
3683                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
3684                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
3685                 ve->base.emit_fini_breadcrumb_dw =
3686                         sibling->emit_fini_breadcrumb_dw;
3687
3688                 ve->base.flags = sibling->flags;
3689         }
3690
3691         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
3692
3693         err = __execlists_context_alloc(&ve->context, siblings[0]);
3694         if (err)
3695                 goto err_put;
3696
3697         __set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);
3698
3699         return &ve->context;
3700
3701 err_put:
3702         intel_context_put(&ve->context);
3703         return ERR_PTR(err);
3704 }
3705
3706 struct intel_context *
3707 intel_execlists_clone_virtual(struct i915_gem_context *ctx,
3708                               struct intel_engine_cs *src)
3709 {
3710         struct virtual_engine *se = to_virtual_engine(src);
3711         struct intel_context *dst;
3712
3713         dst = intel_execlists_create_virtual(ctx,
3714                                              se->siblings,
3715                                              se->num_siblings);
3716         if (IS_ERR(dst))
3717                 return dst;
3718
3719         if (se->num_bonds) {
3720                 struct virtual_engine *de = to_virtual_engine(dst->engine);
3721
3722                 de->bonds = kmemdup(se->bonds,
3723                                     sizeof(*se->bonds) * se->num_bonds,
3724                                     GFP_KERNEL);
3725                 if (!de->bonds) {
3726                         intel_context_put(dst);
3727                         return ERR_PTR(-ENOMEM);
3728                 }
3729
3730                 de->num_bonds = se->num_bonds;
3731         }
3732
3733         return dst;
3734 }
3735
3736 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
3737                                      const struct intel_engine_cs *master,
3738                                      const struct intel_engine_cs *sibling)
3739 {
3740         struct virtual_engine *ve = to_virtual_engine(engine);
3741         struct ve_bond *bond;
3742         int n;
3743
3744         /* Sanity check the sibling is part of the virtual engine */
3745         for (n = 0; n < ve->num_siblings; n++)
3746                 if (sibling == ve->siblings[n])
3747                         break;
3748         if (n == ve->num_siblings)
3749                 return -EINVAL;
3750
3751         bond = virtual_find_bond(ve, master);
3752         if (bond) {
3753                 bond->sibling_mask |= sibling->mask;
3754                 return 0;
3755         }
3756
3757         bond = krealloc(ve->bonds,
3758                         sizeof(*bond) * (ve->num_bonds + 1),
3759                         GFP_KERNEL);
3760         if (!bond)
3761                 return -ENOMEM;
3762
3763         bond[ve->num_bonds].master = master;
3764         bond[ve->num_bonds].sibling_mask = sibling->mask;
3765
3766         ve->bonds = bond;
3767         ve->num_bonds++;
3768
3769         return 0;
3770 }
3771
3772 void intel_execlists_show_requests(struct intel_engine_cs *engine,
3773                                    struct drm_printer *m,
3774                                    void (*show_request)(struct drm_printer *m,
3775                                                         struct i915_request *rq,
3776                                                         const char *prefix),
3777                                    unsigned int max)
3778 {
3779         const struct intel_engine_execlists *execlists = &engine->execlists;
3780         struct i915_request *rq, *last;
3781         unsigned long flags;
3782         unsigned int count;
3783         struct rb_node *rb;
3784
3785         spin_lock_irqsave(&engine->active.lock, flags);
3786
3787         last = NULL;
3788         count = 0;
3789         list_for_each_entry(rq, &engine->active.requests, sched.link) {
3790                 if (count++ < max - 1)
3791                         show_request(m, rq, "\t\tE ");
3792                 else
3793                         last = rq;
3794         }
3795         if (last) {
3796                 if (count > max) {
3797                         drm_printf(m,
3798                                    "\t\t...skipping %d executing requests...\n",
3799                                    count - max);
3800                 }
3801                 show_request(m, last, "\t\tE ");
3802         }
3803
3804         last = NULL;
3805         count = 0;
3806         if (execlists->queue_priority_hint != INT_MIN)
3807                 drm_printf(m, "\t\tQueue priority hint: %d\n",
3808                            execlists->queue_priority_hint);
3809         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
3810                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
3811                 int i;
3812
3813                 priolist_for_each_request(rq, p, i) {
3814                         if (count++ < max - 1)
3815                                 show_request(m, rq, "\t\tQ ");
3816                         else
3817                                 last = rq;
3818                 }
3819         }
3820         if (last) {
3821                 if (count > max) {
3822                         drm_printf(m,
3823                                    "\t\t...skipping %d queued requests...\n",
3824                                    count - max);
3825                 }
3826                 show_request(m, last, "\t\tQ ");
3827         }
3828
3829         last = NULL;
3830         count = 0;
3831         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
3832                 struct virtual_engine *ve =
3833                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3834                 struct i915_request *rq = READ_ONCE(ve->request);
3835
3836                 if (rq) {
3837                         if (count++ < max - 1)
3838                                 show_request(m, rq, "\t\tV ");
3839                         else
3840                                 last = rq;
3841                 }
3842         }
3843         if (last) {
3844                 if (count > max) {
3845                         drm_printf(m,
3846                                    "\t\t...skipping %d virtual requests...\n",
3847                                    count - max);
3848                 }
3849                 show_request(m, last, "\t\tV ");
3850         }
3851
3852         spin_unlock_irqrestore(&engine->active.lock, flags);
3853 }
3854
3855 void intel_lr_context_reset(struct intel_engine_cs *engine,
3856                             struct intel_context *ce,
3857                             u32 head,
3858                             bool scrub)
3859 {
3860         /*
3861          * We want a simple context + ring to execute the breadcrumb update.
3862          * We cannot rely on the context being intact across the GPU hang,
3863          * so clear it and rebuild just what we need for the breadcrumb.
3864          * All pending requests for this context will be zapped, and any
3865          * future request will be after userspace has had the opportunity
3866          * to recreate its own state.
3867          */
3868         if (scrub) {
3869                 u32 *regs = ce->lrc_reg_state;
3870
3871                 if (engine->pinned_default_state) {
3872                         memcpy(regs, /* skip restoring the vanilla PPHWSP */
3873                                engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
3874                                engine->context_size - PAGE_SIZE);
3875                 }
3876                 execlists_init_reg_state(regs, ce, engine, ce->ring);
3877         }
3878
3879         /* Rerun the request; its payload has been neutered (if guilty). */
3880         ce->ring->head = head;
3881         intel_ring_update_space(ce->ring);
3882
3883         __execlists_update_reg_state(ce, engine);
3884 }
3885
3886 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3887 #include "selftest_lrc.c"
3888 #endif