1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences from the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need a set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before), we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use it. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
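
/*
 * Illustrative example of the coalescing rule described above (not code
 * the driver runs): with requests A1, A2 (both for context A) and B1
 * (context B) queued in that order, A1 and A2 are combined into a single
 * ELSP element for context A using A2's tail, so the execution list
 * written to the ELSP becomes {A, B}. Had only A1 and A2 been queued,
 * the second element would simply be left NULL.
 */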
134 #include <linux/interrupt.h>
135
136 #include "gem/i915_gem_context.h"
137
138 #include "i915_drv.h"
139 #include "i915_perf.h"
140 #include "i915_trace.h"
141 #include "i915_vgpu.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_workarounds.h"
149
150 #define RING_EXECLIST_QFULL             (1 << 0x2)
151 #define RING_EXECLIST1_VALID            (1 << 0x3)
152 #define RING_EXECLIST0_VALID            (1 << 0x4)
153 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
154 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
155 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
156
157 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
158 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
159 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
160 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
161 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
162 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
163
164 #define GEN8_CTX_STATUS_COMPLETED_MASK \
165          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
166
167 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
168
169 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE  (0x1) /* lower csb dword */
170 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
171 #define GEN12_CSB_SW_CTX_ID_MASK                GENMASK(25, 15)
172 #define GEN12_IDLE_CTX_ID               0x7FF
173 #define GEN12_CSB_CTX_VALID(csb_dw) \
174         (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
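
/*
 * Hedged sketch (hypothetical helper, not part of the driver flow): the
 * SW context ID field defined above can be unpacked from a CSB dword
 * with FIELD_GET() and compared against GEN12_IDLE_CTX_ID, which is all
 * that GEN12_CSB_CTX_VALID() does.
 */
static __maybe_unused u32 gen12_csb_sw_ctx_id(u32 csb_dw)
{
        return FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw);
}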
175
176 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
177 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
178 #define WA_TAIL_DWORDS 2
179 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
180
181 struct virtual_engine {
182         struct intel_engine_cs base;
183         struct intel_context context;
184
185         /*
186          * We allow only a single request through the virtual engine at a time
187          * (each request in the timeline waits for the completion fence of
188          * the previous before being submitted). By restricting ourselves to
189          * only submitting a single request, each request is placed onto a
190          * physical engine to maximise load spreading (by virtue of the late greedy
191          * scheduling -- each real engine takes the next available request
192          * upon idling).
193          */
194         struct i915_request *request;
195
196         /*
197          * We keep a rbtree of available virtual engines inside each physical
198          * engine, sorted by priority. Here we preallocate the nodes we need
199          * for the virtual engine, indexed by physical_engine->id.
200          */
201         struct ve_node {
202                 struct rb_node rb;
203                 int prio;
204         } nodes[I915_NUM_ENGINES];
205
206         /*
207          * Keep track of bonded pairs -- restrictions upon our selection
208          * of physical engines any particular request may be submitted to.
209          * If we receive a submit-fence from a master engine, we will only
210          * use one of the sibling_mask physical engines.
211          */
212         struct ve_bond {
213                 const struct intel_engine_cs *master;
214                 intel_engine_mask_t sibling_mask;
215         } *bonds;
216         unsigned int num_bonds;
217
218         /* And finally, which physical engines this virtual engine maps onto. */
219         unsigned int num_siblings;
220         struct intel_engine_cs *siblings[0];
221 };
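
/*
 * Purely illustrative sketch (hypothetical helper, not the driver's own
 * lookup) of how the bond table above is meant to be consulted: scan
 * ve->bonds[] for the entry whose master matches the engine that
 * signalled the submit-fence and restrict submission to that entry's
 * sibling_mask. A return of 0 here would mean no bond was recorded and
 * the caller falls back to the full set of siblings.
 */
static __maybe_unused intel_engine_mask_t
virtual_bond_mask_sketch(const struct virtual_engine *ve,
                         const struct intel_engine_cs *master)
{
        unsigned int i;

        for (i = 0; i < ve->num_bonds; i++) {
                if (ve->bonds[i].master == master)
                        return ve->bonds[i].sibling_mask;
        }

        return 0;
}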
222
223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224 {
225         GEM_BUG_ON(!intel_engine_is_virtual(engine));
226         return container_of(engine, struct virtual_engine, base);
227 }
228
229 static int __execlists_context_alloc(struct intel_context *ce,
230                                      struct intel_engine_cs *engine);
231
232 static void execlists_init_reg_state(u32 *reg_state,
233                                      const struct intel_context *ce,
234                                      const struct intel_engine_cs *engine,
235                                      const struct intel_ring *ring,
236                                      bool close);
237
238 static void __context_pin_acquire(struct intel_context *ce)
239 {
240         mutex_acquire(&ce->pin_mutex.dep_map, 2, 0, _RET_IP_);
241 }
242
243 static void __context_pin_release(struct intel_context *ce)
244 {
245         mutex_release(&ce->pin_mutex.dep_map, 0, _RET_IP_);
246 }
247
248 static void mark_eio(struct i915_request *rq)
249 {
250         if (!i915_request_signaled(rq))
251                 dma_fence_set_error(&rq->fence, -EIO);
252         i915_request_mark_complete(rq);
253 }
254
255 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
256 {
257         return (i915_ggtt_offset(engine->status_page.vma) +
258                 I915_GEM_HWS_PREEMPT_ADDR);
259 }
260
261 static inline void
262 ring_set_paused(const struct intel_engine_cs *engine, int state)
263 {
264         /*
265          * We inspect HWS_PREEMPT with a semaphore inside
266          * engine->emit_fini_breadcrumb. If the dword is true,
267          * the ring is paused as the semaphore will busywait
268          * until the dword is false.
269          */
270         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
271         if (state)
272                 wmb();
273 }
274
275 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
276 {
277         return rb_entry(rb, struct i915_priolist, node);
278 }
279
280 static inline int rq_prio(const struct i915_request *rq)
281 {
282         return rq->sched.attr.priority;
283 }
284
285 static int effective_prio(const struct i915_request *rq)
286 {
287         int prio = rq_prio(rq);
288
289         /*
290          * If this request is special and must not be interrupted at any
291          * cost, so be it. Note we are only checking the most recent request
292          * in the context and so may be masking an earlier vip request. It
293          * is hoped that under the conditions where nopreempt is used, this
294          * will not matter (i.e. all requests to that context will be
295          * nopreempt for as long as desired).
296          */
297         if (i915_request_has_nopreempt(rq))
298                 prio = I915_PRIORITY_UNPREEMPTABLE;
299
300         /*
301          * On unwinding the active request, we give it a priority bump
302          * if it has completed waiting on any semaphore. If we know that
303          * the request has already started, we can prevent an unwanted
304          * preempt-to-idle cycle by taking that into account now.
305          */
306         if (__i915_request_has_started(rq))
307                 prio |= I915_PRIORITY_NOSEMAPHORE;
308
309         /* Restrict mere WAIT boosts from triggering preemption */
310         BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
311         return prio | __NO_PREEMPTION;
312 }
313
314 static int queue_prio(const struct intel_engine_execlists *execlists)
315 {
316         struct i915_priolist *p;
317         struct rb_node *rb;
318
319         rb = rb_first_cached(&execlists->queue);
320         if (!rb)
321                 return INT_MIN;
322
323         /*
324          * As the priolist[] is inverted, with the highest priority in [0],
325          * we have to flip the index value to recover the priority.
326          */
327         p = to_priolist(rb);
328         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
329 }
330
331 static inline bool need_preempt(const struct intel_engine_cs *engine,
332                                 const struct i915_request *rq,
333                                 struct rb_node *rb)
334 {
335         int last_prio;
336
337         if (!intel_engine_has_semaphores(engine))
338                 return false;
339
340         /*
341          * Check if the current priority hint merits a preemption attempt.
342          *
343          * We record the highest value priority we saw during rescheduling
344          * prior to this dequeue, therefore we know that if it is strictly
345          * less than the current tail of ELSP[0], we do not need to force
346          * a preempt-to-idle cycle.
347          *
348          * However, the priority hint is a mere hint that we may need to
349          * preempt. If that hint is stale or we may be trying to preempt
350          * ourselves, ignore the request.
351          */
352         last_prio = effective_prio(rq);
353         if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
354                                          last_prio))
355                 return false;
356
357         /*
358          * Check against the first request in ELSP[1]; it will, thanks to the
359          * power of PI, be the highest priority of that context.
360          */
361         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
362             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
363                 return true;
364
365         if (rb) {
366                 struct virtual_engine *ve =
367                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
368                 bool preempt = false;
369
370                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
371                         struct i915_request *next;
372
373                         rcu_read_lock();
374                         next = READ_ONCE(ve->request);
375                         if (next)
376                                 preempt = rq_prio(next) > last_prio;
377                         rcu_read_unlock();
378                 }
379
380                 if (preempt)
381                         return preempt;
382         }
383
384         /*
385          * If the inflight context did not trigger the preemption, then maybe
386          * it was the set of queued requests? Pick the highest priority in
387          * the queue (the first active priolist) and see if it deserves to be
388          * running instead of ELSP[0].
389          *
390          * The highest priority request in the queue cannot be either
391          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
392          * context, its priority would not exceed ELSP[0] aka last_prio.
393          */
394         return queue_prio(&engine->execlists) > last_prio;
395 }
396
397 __maybe_unused static inline bool
398 assert_priority_queue(const struct i915_request *prev,
399                       const struct i915_request *next)
400 {
401         /*
402          * Without preemption, the prev may refer to the still active element
403          * which we refuse to let go.
404          *
405          * Even with preemption, there are times when we think it is better not
406          * to preempt and leave an ostensibly lower priority request in flight.
407          */
408         if (i915_request_is_active(prev))
409                 return true;
410
411         return rq_prio(prev) >= rq_prio(next);
412 }
413
414 /*
415  * The context descriptor encodes various attributes of a context,
416  * including its GTT address and some flags. Because it's fairly
417  * expensive to calculate, we'll just do it once and cache the result,
418  * which remains valid until the context is unpinned.
419  *
420  * This is what a descriptor looks like, from LSB to MSB::
421  *
422  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
423  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
424  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
425  *      bits 53-54:    mbz, reserved for use by hardware
426  *      bits 55-63:    group ID, currently unused and set to 0
427  *
428  * Starting from Gen11, the upper dword of the descriptor has a new format:
429  *
430  *      bits 32-36:    reserved
431  *      bits 37-47:    SW context ID
432  *      bits 48-53:    engine instance
433  *      bit 54:        mbz, reserved for use by hardware
434  *      bits 55-60:    SW counter
435  *      bits 61-63:    engine class
436  *
437  * engine info, SW context ID and SW counter need to form a unique number
438  * (Context ID) per lrc.
439  */
440 static u64
441 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
442 {
443         u64 desc;
444
445         desc = INTEL_LEGACY_32B_CONTEXT;
446         if (i915_vm_is_4lvl(ce->vm))
447                 desc = INTEL_LEGACY_64B_CONTEXT;
448         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
449
450         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
451         if (IS_GEN(engine->i915, 8))
452                 desc |= GEN8_CTX_L3LLC_COHERENT;
453
454         desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
455                                                                 /* bits 12-31 */
456         /*
457          * The following 32bits are copied into the OA reports (dword 2).
458          * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
459          * anything below.
460          */
461         if (INTEL_GEN(engine->i915) >= 11) {
462                 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
463                                                                 /* bits 48-53 */
464
465                 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
466                                                                 /* bits 61-63 */
467         }
468
469         return desc;
470 }
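
/*
 * Hedged sketch (hypothetical helper, unused by the driver): the gen11+
 * SW context ID occupies bits 37-47 of the descriptor built above, so it
 * can be recovered with the same mask and shift that
 * __execlists_schedule_in() uses when assigning it.
 */
static __maybe_unused u32 lrc_descriptor_sw_ctx_id(u64 desc)
{
        return (desc & GENMASK_ULL(47, 37)) >> GEN11_SW_CTX_ID_SHIFT;
}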
471
472 static u32 *set_offsets(u32 *regs,
473                         const u8 *data,
474                         const struct intel_engine_cs *engine)
475 #define NOP(x) (BIT(7) | (x))
476 #define LRI(count, flags) ((flags) << 6 | (count))
477 #define POSTED BIT(0)
478 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
479 #define REG16(x) \
480         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
481         (((x) >> 2) & 0x7f)
482 #define END() 0
483 {
484         const u32 base = engine->mmio_base;
485
486         while (*data) {
487                 u8 count, flags;
488
489                 if (*data & BIT(7)) { /* skip */
490                         regs += *data++ & ~BIT(7);
491                         continue;
492                 }
493
494                 count = *data & 0x3f;
495                 flags = *data >> 6;
496                 data++;
497
498                 *regs = MI_LOAD_REGISTER_IMM(count);
499                 if (flags & POSTED)
500                         *regs |= MI_LRI_FORCE_POSTED;
501                 if (INTEL_GEN(engine->i915) >= 11)
502                         *regs |= MI_LRI_CS_MMIO;
503                 regs++;
504
505                 GEM_BUG_ON(!count);
506                 do {
507                         u32 offset = 0;
508                         u8 v;
509
510                         do {
511                                 v = *data++;
512                                 offset <<= 7;
513                                 offset |= v & ~BIT(7);
514                         } while (v & BIT(7));
515
516                         *regs = base + (offset << 2);
517                         regs += 2;
518                 } while (--count);
519         }
520
521         return regs;
522 }
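
/*
 * Worked example (illustration only) of the encoding consumed by
 * set_offsets(): the table fragment
 *
 *      LRI(2, POSTED), REG16(0x3a8), REG(0x028), END()
 *
 * expands to one command dword, MI_LOAD_REGISTER_IMM(2) |
 * MI_LRI_FORCE_POSTED (plus MI_LRI_CS_MMIO on gen11+), followed by the
 * two register offsets 0x3a8 and 0x028 added to engine->mmio_base, with
 * "regs += 2" leaving a slot after each offset for the register value
 * to be filled in separately.
 */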
523
524 static const u8 gen8_xcs_offsets[] = {
525         NOP(1),
526         LRI(11, 0),
527         REG16(0x244),
528         REG(0x034),
529         REG(0x030),
530         REG(0x038),
531         REG(0x03c),
532         REG(0x168),
533         REG(0x140),
534         REG(0x110),
535         REG(0x11c),
536         REG(0x114),
537         REG(0x118),
538
539         NOP(9),
540         LRI(9, 0),
541         REG16(0x3a8),
542         REG16(0x28c),
543         REG16(0x288),
544         REG16(0x284),
545         REG16(0x280),
546         REG16(0x27c),
547         REG16(0x278),
548         REG16(0x274),
549         REG16(0x270),
550
551         NOP(13),
552         LRI(2, 0),
553         REG16(0x200),
554         REG(0x028),
555
556         END(),
557 };
558
559 static const u8 gen9_xcs_offsets[] = {
560         NOP(1),
561         LRI(14, POSTED),
562         REG16(0x244),
563         REG(0x034),
564         REG(0x030),
565         REG(0x038),
566         REG(0x03c),
567         REG(0x168),
568         REG(0x140),
569         REG(0x110),
570         REG(0x11c),
571         REG(0x114),
572         REG(0x118),
573         REG(0x1c0),
574         REG(0x1c4),
575         REG(0x1c8),
576
577         NOP(3),
578         LRI(9, POSTED),
579         REG16(0x3a8),
580         REG16(0x28c),
581         REG16(0x288),
582         REG16(0x284),
583         REG16(0x280),
584         REG16(0x27c),
585         REG16(0x278),
586         REG16(0x274),
587         REG16(0x270),
588
589         NOP(13),
590         LRI(1, POSTED),
591         REG16(0x200),
592
593         NOP(13),
594         LRI(44, POSTED),
595         REG(0x028),
596         REG(0x09c),
597         REG(0x0c0),
598         REG(0x178),
599         REG(0x17c),
600         REG16(0x358),
601         REG(0x170),
602         REG(0x150),
603         REG(0x154),
604         REG(0x158),
605         REG16(0x41c),
606         REG16(0x600),
607         REG16(0x604),
608         REG16(0x608),
609         REG16(0x60c),
610         REG16(0x610),
611         REG16(0x614),
612         REG16(0x618),
613         REG16(0x61c),
614         REG16(0x620),
615         REG16(0x624),
616         REG16(0x628),
617         REG16(0x62c),
618         REG16(0x630),
619         REG16(0x634),
620         REG16(0x638),
621         REG16(0x63c),
622         REG16(0x640),
623         REG16(0x644),
624         REG16(0x648),
625         REG16(0x64c),
626         REG16(0x650),
627         REG16(0x654),
628         REG16(0x658),
629         REG16(0x65c),
630         REG16(0x660),
631         REG16(0x664),
632         REG16(0x668),
633         REG16(0x66c),
634         REG16(0x670),
635         REG16(0x674),
636         REG16(0x678),
637         REG16(0x67c),
638         REG(0x068),
639
640         END(),
641 };
642
643 static const u8 gen12_xcs_offsets[] = {
644         NOP(1),
645         LRI(13, POSTED),
646         REG16(0x244),
647         REG(0x034),
648         REG(0x030),
649         REG(0x038),
650         REG(0x03c),
651         REG(0x168),
652         REG(0x140),
653         REG(0x110),
654         REG(0x1c0),
655         REG(0x1c4),
656         REG(0x1c8),
657         REG(0x180),
658         REG16(0x2b4),
659
660         NOP(5),
661         LRI(9, POSTED),
662         REG16(0x3a8),
663         REG16(0x28c),
664         REG16(0x288),
665         REG16(0x284),
666         REG16(0x280),
667         REG16(0x27c),
668         REG16(0x278),
669         REG16(0x274),
670         REG16(0x270),
671
672         END(),
673 };
674
675 static const u8 gen8_rcs_offsets[] = {
676         NOP(1),
677         LRI(14, POSTED),
678         REG16(0x244),
679         REG(0x034),
680         REG(0x030),
681         REG(0x038),
682         REG(0x03c),
683         REG(0x168),
684         REG(0x140),
685         REG(0x110),
686         REG(0x11c),
687         REG(0x114),
688         REG(0x118),
689         REG(0x1c0),
690         REG(0x1c4),
691         REG(0x1c8),
692
693         NOP(3),
694         LRI(9, POSTED),
695         REG16(0x3a8),
696         REG16(0x28c),
697         REG16(0x288),
698         REG16(0x284),
699         REG16(0x280),
700         REG16(0x27c),
701         REG16(0x278),
702         REG16(0x274),
703         REG16(0x270),
704
705         NOP(13),
706         LRI(1, 0),
707         REG(0x0c8),
708
709         END(),
710 };
711
712 static const u8 gen11_rcs_offsets[] = {
713         NOP(1),
714         LRI(15, POSTED),
715         REG16(0x244),
716         REG(0x034),
717         REG(0x030),
718         REG(0x038),
719         REG(0x03c),
720         REG(0x168),
721         REG(0x140),
722         REG(0x110),
723         REG(0x11c),
724         REG(0x114),
725         REG(0x118),
726         REG(0x1c0),
727         REG(0x1c4),
728         REG(0x1c8),
729         REG(0x180),
730
731         NOP(1),
732         LRI(9, POSTED),
733         REG16(0x3a8),
734         REG16(0x28c),
735         REG16(0x288),
736         REG16(0x284),
737         REG16(0x280),
738         REG16(0x27c),
739         REG16(0x278),
740         REG16(0x274),
741         REG16(0x270),
742
743         LRI(1, POSTED),
744         REG(0x1b0),
745
746         NOP(10),
747         LRI(1, 0),
748         REG(0x0c8),
749
750         END(),
751 };
752
753 static const u8 gen12_rcs_offsets[] = {
754         NOP(1),
755         LRI(13, POSTED),
756         REG16(0x244),
757         REG(0x034),
758         REG(0x030),
759         REG(0x038),
760         REG(0x03c),
761         REG(0x168),
762         REG(0x140),
763         REG(0x110),
764         REG(0x1c0),
765         REG(0x1c4),
766         REG(0x1c8),
767         REG(0x180),
768         REG16(0x2b4),
769
770         NOP(5),
771         LRI(9, POSTED),
772         REG16(0x3a8),
773         REG16(0x28c),
774         REG16(0x288),
775         REG16(0x284),
776         REG16(0x280),
777         REG16(0x27c),
778         REG16(0x278),
779         REG16(0x274),
780         REG16(0x270),
781
782         LRI(3, POSTED),
783         REG(0x1b0),
784         REG16(0x5a8),
785         REG16(0x5ac),
786
787         NOP(6),
788         LRI(1, 0),
789         REG(0x0c8),
790
791         END(),
792 };
793
794 #undef END
795 #undef REG16
796 #undef REG
797 #undef LRI
798 #undef NOP
799
800 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
801 {
802         /*
803          * The gen12+ lists only have the registers we program in the basic
804          * default state. We rely on the context image using relative
805          * addressing to automatically fix up the register state between the
806          * physical engines for the virtual engine.
807          */
808         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
809                    !intel_engine_has_relative_mmio(engine));
810
811         if (engine->class == RENDER_CLASS) {
812                 if (INTEL_GEN(engine->i915) >= 12)
813                         return gen12_rcs_offsets;
814                 else if (INTEL_GEN(engine->i915) >= 11)
815                         return gen11_rcs_offsets;
816                 else
817                         return gen8_rcs_offsets;
818         } else {
819                 if (INTEL_GEN(engine->i915) >= 12)
820                         return gen12_xcs_offsets;
821                 else if (INTEL_GEN(engine->i915) >= 9)
822                         return gen9_xcs_offsets;
823                 else
824                         return gen8_xcs_offsets;
825         }
826 }
827
828 static void unwind_wa_tail(struct i915_request *rq)
829 {
830         rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
831         assert_ring_tail_valid(rq->ring, rq->tail);
832 }
833
834 static struct i915_request *
835 __unwind_incomplete_requests(struct intel_engine_cs *engine)
836 {
837         struct i915_request *rq, *rn, *active = NULL;
838         struct list_head *uninitialized_var(pl);
839         int prio = I915_PRIORITY_INVALID;
840
841         lockdep_assert_held(&engine->active.lock);
842
843         list_for_each_entry_safe_reverse(rq, rn,
844                                          &engine->active.requests,
845                                          sched.link) {
846                 struct intel_engine_cs *owner;
847
848                 if (i915_request_completed(rq))
849                         continue; /* XXX */
850
851                 __i915_request_unsubmit(rq);
852                 unwind_wa_tail(rq);
853
854                 /*
855                  * Push the request back into the queue for later resubmission.
856                  * If this request is not native to this physical engine (i.e.
857                  * it came from a virtual source), push it back onto the virtual
858                  * engine so that it can be moved across onto another physical
859                  * engine as load dictates.
860                  */
861                 owner = rq->hw_context->engine;
862                 if (likely(owner == engine)) {
863                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
864                         if (rq_prio(rq) != prio) {
865                                 prio = rq_prio(rq);
866                                 pl = i915_sched_lookup_priolist(engine, prio);
867                         }
868                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
869
870                         list_move(&rq->sched.link, pl);
871                         active = rq;
872                 } else {
873                         /*
874                          * Decouple the virtual breadcrumb before moving it
875                          * back to the virtual engine -- we don't want the
876                          * request to complete in the background and try
877                          * and cancel the breadcrumb on the virtual engine
878                          * (instead of the old engine where it is linked)!
879                          */
880                         if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
881                                      &rq->fence.flags)) {
882                                 spin_lock_nested(&rq->lock,
883                                                  SINGLE_DEPTH_NESTING);
884                                 i915_request_cancel_breadcrumb(rq);
885                                 spin_unlock(&rq->lock);
886                         }
887                         rq->engine = owner;
888                         owner->submit_request(rq);
889                         active = NULL;
890                 }
891         }
892
893         return active;
894 }
895
896 struct i915_request *
897 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
898 {
899         struct intel_engine_cs *engine =
900                 container_of(execlists, typeof(*engine), execlists);
901
902         return __unwind_incomplete_requests(engine);
903 }
904
905 static inline void
906 execlists_context_status_change(struct i915_request *rq, unsigned long status)
907 {
908         /*
909          * Only used when GVT-g is enabled now. When GVT-g is disabled,
910          * the compiler should eliminate this function as dead code.
911          */
912         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
913                 return;
914
915         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
916                                    status, rq);
917 }
918
919 static inline struct intel_engine_cs *
920 __execlists_schedule_in(struct i915_request *rq)
921 {
922         struct intel_engine_cs * const engine = rq->engine;
923         struct intel_context * const ce = rq->hw_context;
924
925         intel_context_get(ce);
926
927         if (ce->tag) {
928                 /* Use a fixed tag for OA and friends */
929                 ce->lrc_desc |= (u64)ce->tag << 32;
930         } else {
931                 /* We don't need a strict matching tag, just different values */
932                 ce->lrc_desc &= ~GENMASK_ULL(47, 37);
933                 ce->lrc_desc |=
934                         (u64)(engine->context_tag++ % NUM_CONTEXT_TAG) <<
935                         GEN11_SW_CTX_ID_SHIFT;
936                 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
937         }
938
939         intel_gt_pm_get(engine->gt);
940         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
941         intel_engine_context_in(engine);
942
943         return engine;
944 }
945
946 static inline struct i915_request *
947 execlists_schedule_in(struct i915_request *rq, int idx)
948 {
949         struct intel_context * const ce = rq->hw_context;
950         struct intel_engine_cs *old;
951
952         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
953         trace_i915_request_in(rq, idx);
954
955         old = READ_ONCE(ce->inflight);
956         do {
957                 if (!old) {
958                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
959                         break;
960                 }
961         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
962
963         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
964         return i915_request_get(rq);
965 }
966
967 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
968 {
969         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
970         struct i915_request *next = READ_ONCE(ve->request);
971
972         if (next && next->execution_mask & ~rq->execution_mask)
973                 tasklet_schedule(&ve->base.execlists.tasklet);
974 }
975
976 static inline void
977 __execlists_schedule_out(struct i915_request *rq,
978                          struct intel_engine_cs * const engine)
979 {
980         struct intel_context * const ce = rq->hw_context;
981
982         intel_engine_context_out(engine);
983         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
984         intel_gt_pm_put(engine->gt);
985
986         /*
987          * If this is part of a virtual engine, its next request may
988          * have been blocked waiting for access to the active context.
989          * We have to kick all the siblings again in case we need to
990          * switch (e.g. the next request is not runnable on this
991          * engine). Hopefully, we will already have submitted the next
992          * request before the tasklet runs and do not need to rebuild
993          * each virtual tree and kick everyone again.
994          */
995         if (ce->engine != engine)
996                 kick_siblings(rq, ce);
997
998         intel_context_put(ce);
999 }
1000
1001 static inline void
1002 execlists_schedule_out(struct i915_request *rq)
1003 {
1004         struct intel_context * const ce = rq->hw_context;
1005         struct intel_engine_cs *cur, *old;
1006
1007         trace_i915_request_out(rq);
1008
1009         old = READ_ONCE(ce->inflight);
1010         do
1011                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1012         while (!try_cmpxchg(&ce->inflight, &old, cur));
1013         if (!cur)
1014                 __execlists_schedule_out(rq, old);
1015
1016         i915_request_put(rq);
1017 }
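
/*
 * Note on ce->inflight (illustrative summary of the helpers above): the
 * engine pointer is stored with a small occupancy count packed into its
 * two low bits. The first submission stores the bare engine pointer,
 * each additional ELSP occupancy ptr_inc()'s it, and each completion
 * either ptr_dec()'s it or, once the low bits are already zero, clears
 * it to NULL, at which point __execlists_schedule_out() runs.
 */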
1018
1019 static u64 execlists_update_context(const struct i915_request *rq)
1020 {
1021         struct intel_context *ce = rq->hw_context;
1022         u64 desc;
1023
1024         ce->lrc_reg_state[CTX_RING_TAIL] =
1025                 intel_ring_set_tail(rq->ring, rq->tail);
1026
1027         /*
1028          * Make sure the context image is complete before we submit it to HW.
1029          *
1030          * Ostensibly, writes (including the WCB) should be flushed prior to
1031          * an uncached write such as our mmio register access; however, the empirical
1032          * evidence (esp. on Braswell) suggests that the WC write into memory
1033          * may not be visible to the HW prior to the completion of the UC
1034          * register write and that we may begin execution from the context
1035          * before its image is complete leading to invalid PD chasing.
1036          *
1037          * Furthermore, Braswell, at least, wants a full mb to be sure that
1038          * the writes are coherent in memory (visible to the GPU) prior to
1039          * execution, and not just visible to other CPUs (as is the result of
1040          * wmb).
1041          */
1042         mb();
1043
1044         desc = ce->lrc_desc;
1045         ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1046
1047         return desc;
1048 }
1049
1050 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1051 {
1052         if (execlists->ctrl_reg) {
1053                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1054                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1055         } else {
1056                 writel(upper_32_bits(desc), execlists->submit_reg);
1057                 writel(lower_32_bits(desc), execlists->submit_reg);
1058         }
1059 }
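
/*
 * Sketch of the two submission paths above (descriptive only): on
 * gen11+, where execlists->ctrl_reg is set, each descriptor half is
 * written into the ELSQ slot for its port and nothing takes effect
 * until EL_CTRL_LOAD is written by execlists_submit_ports(); on gen8/9
 * the descriptor is written directly to the ELSP, upper dword first,
 * with the final ELSP write kicking off execution.
 */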
1060
1061 static __maybe_unused void
1062 trace_ports(const struct intel_engine_execlists *execlists,
1063             const char *msg,
1064             struct i915_request * const *ports)
1065 {
1066         const struct intel_engine_cs *engine =
1067                 container_of(execlists, typeof(*engine), execlists);
1068
1069         if (!ports[0])
1070                 return;
1071
1072         GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
1073                   engine->name, msg,
1074                   ports[0]->fence.context,
1075                   ports[0]->fence.seqno,
1076                   i915_request_completed(ports[0]) ? "!" :
1077                   i915_request_started(ports[0]) ? "*" :
1078                   "",
1079                   ports[1] ? ports[1]->fence.context : 0,
1080                   ports[1] ? ports[1]->fence.seqno : 0);
1081 }
1082
1083 static __maybe_unused bool
1084 assert_pending_valid(const struct intel_engine_execlists *execlists,
1085                      const char *msg)
1086 {
1087         struct i915_request * const *port, *rq;
1088         struct intel_context *ce = NULL;
1089
1090         trace_ports(execlists, msg, execlists->pending);
1091
1092         if (!execlists->pending[0]) {
1093                 GEM_TRACE_ERR("Nothing pending for promotion!\n");
1094                 return false;
1095         }
1096
1097         if (execlists->pending[execlists_num_ports(execlists)]) {
1098                 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1099                               execlists_num_ports(execlists));
1100                 return false;
1101         }
1102
1103         for (port = execlists->pending; (rq = *port); port++) {
1104                 if (ce == rq->hw_context) {
1105                         GEM_TRACE_ERR("Duplicate context in pending[%zd]\n",
1106                                       port - execlists->pending);
1107                         return false;
1108                 }
1109
1110                 ce = rq->hw_context;
1111                 if (i915_request_completed(rq))
1112                         continue;
1113
1114                 if (i915_active_is_idle(&ce->active)) {
1115                         GEM_TRACE_ERR("Inactive context in pending[%zd]\n",
1116                                       port - execlists->pending);
1117                         return false;
1118                 }
1119
1120                 if (!i915_vma_is_pinned(ce->state)) {
1121                         GEM_TRACE_ERR("Unpinned context in pending[%zd]\n",
1122                                       port - execlists->pending);
1123                         return false;
1124                 }
1125
1126                 if (!i915_vma_is_pinned(ce->ring->vma)) {
1127                         GEM_TRACE_ERR("Unpinned ringbuffer in pending[%zd]\n",
1128                                       port - execlists->pending);
1129                         return false;
1130                 }
1131         }
1132
1133         return ce;
1134 }
1135
1136 static void execlists_submit_ports(struct intel_engine_cs *engine)
1137 {
1138         struct intel_engine_execlists *execlists = &engine->execlists;
1139         unsigned int n;
1140
1141         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1142
1143         /*
1144          * We can skip acquiring intel_runtime_pm_get() here as it was taken
1145          * on our behalf by the request (see i915_gem_mark_busy()) and it will
1146          * not be relinquished until the device is idle (see
1147          * i915_gem_idle_work_handler()). As a precaution, we make sure
1148          * that all ELSP are drained, i.e. we have processed the CSB,
1149          * before allowing ourselves to idle and calling intel_runtime_pm_put().
1150          */
1151         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1152
1153         /*
1154          * ELSQ note: the submit queue is not cleared after being submitted
1155          * to the HW so we need to make sure we always clean it up. This is
1156          * currently ensured by the fact that we always write the same number
1157          * of elsq entries, keep this in mind before changing the loop below.
1158          */
1159         for (n = execlists_num_ports(execlists); n--; ) {
1160                 struct i915_request *rq = execlists->pending[n];
1161
1162                 write_desc(execlists,
1163                            rq ? execlists_update_context(rq) : 0,
1164                            n);
1165         }
1166
1167         /* we need to manually load the submit queue */
1168         if (execlists->ctrl_reg)
1169                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1170 }
1171
1172 static bool ctx_single_port_submission(const struct intel_context *ce)
1173 {
1174         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1175                 i915_gem_context_force_single_submission(ce->gem_context));
1176 }
1177
1178 static bool can_merge_ctx(const struct intel_context *prev,
1179                           const struct intel_context *next)
1180 {
1181         if (prev != next)
1182                 return false;
1183
1184         if (ctx_single_port_submission(prev))
1185                 return false;
1186
1187         return true;
1188 }
1189
1190 static bool can_merge_rq(const struct i915_request *prev,
1191                          const struct i915_request *next)
1192 {
1193         GEM_BUG_ON(prev == next);
1194         GEM_BUG_ON(!assert_priority_queue(prev, next));
1195
1196         /*
1197          * We do not submit known completed requests. Therefore if the next
1198          * request is already completed, we can pretend to merge it in
1199          * with the previous context (and we will skip updating the ELSP
1200          * and tracking). Thus hopefully keeping the ELSP full with active
1201          * contexts, despite the best efforts of preempt-to-busy to confuse
1202          * us.
1203          */
1204         if (i915_request_completed(next))
1205                 return true;
1206
1207         if (!can_merge_ctx(prev->hw_context, next->hw_context))
1208                 return false;
1209
1210         return true;
1211 }
1212
1213 static void virtual_update_register_offsets(u32 *regs,
1214                                             struct intel_engine_cs *engine)
1215 {
1216         set_offsets(regs, reg_offsets(engine), engine);
1217 }
1218
1219 static bool virtual_matches(const struct virtual_engine *ve,
1220                             const struct i915_request *rq,
1221                             const struct intel_engine_cs *engine)
1222 {
1223         const struct intel_engine_cs *inflight;
1224
1225         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1226                 return false;
1227
1228         /*
1229          * We track when the HW has completed saving the context image
1230          * (i.e. when we have seen the final CS event switching out of
1231          * the context) and must not overwrite the context image before
1232          * then. This restricts us to only using the active engine
1233          * while the previous virtualized request is inflight (so
1234          * we reuse the register offsets). This is a very small
1235          * hysteresis on the greedy selection algorithm.
1236          */
1237         inflight = intel_context_inflight(&ve->context);
1238         if (inflight && inflight != engine)
1239                 return false;
1240
1241         return true;
1242 }
1243
1244 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1245                                      struct intel_engine_cs *engine)
1246 {
1247         struct intel_engine_cs *old = ve->siblings[0];
1248
1249         /* All unattached (rq->engine == old) must already be completed */
1250
1251         spin_lock(&old->breadcrumbs.irq_lock);
1252         if (!list_empty(&ve->context.signal_link)) {
1253                 list_move_tail(&ve->context.signal_link,
1254                                &engine->breadcrumbs.signalers);
1255                 intel_engine_queue_breadcrumbs(engine);
1256         }
1257         spin_unlock(&old->breadcrumbs.irq_lock);
1258 }
1259
1260 static struct i915_request *
1261 last_active(const struct intel_engine_execlists *execlists)
1262 {
1263         struct i915_request * const *last = READ_ONCE(execlists->active);
1264
1265         while (*last && i915_request_completed(*last))
1266                 last++;
1267
1268         return *last;
1269 }
1270
1271 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1272 {
1273         LIST_HEAD(list);
1274
1275         /*
1276          * We want to move the interrupted request to the back of
1277          * the round-robin list (i.e. its priority level), but
1278          * in doing so, we must also move all requests that were in
1279          * flight and were waiting for the interrupted request, so
1280          * that they run after it again.
1281          */
1282         do {
1283                 struct i915_dependency *p;
1284
1285                 GEM_BUG_ON(i915_request_is_active(rq));
1286                 list_move_tail(&rq->sched.link, pl);
1287
1288                 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
1289                         struct i915_request *w =
1290                                 container_of(p->waiter, typeof(*w), sched);
1291
1292                         /* Leave semaphores spinning on the other engines */
1293                         if (w->engine != rq->engine)
1294                                 continue;
1295
1296                         /* No waiter should start before its signaler */
1297                         GEM_BUG_ON(i915_request_started(w) &&
1298                                    !i915_request_completed(rq));
1299
1300                         GEM_BUG_ON(i915_request_is_active(w));
1301                         if (list_empty(&w->sched.link))
1302                                 continue; /* Not yet submitted; unready */
1303
1304                         if (rq_prio(w) < rq_prio(rq))
1305                                 continue;
1306
1307                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1308                         list_move_tail(&w->sched.link, &list);
1309                 }
1310
1311                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1312         } while (rq);
1313 }
1314
1315 static void defer_active(struct intel_engine_cs *engine)
1316 {
1317         struct i915_request *rq;
1318
1319         rq = __unwind_incomplete_requests(engine);
1320         if (!rq)
1321                 return;
1322
1323         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1324 }
1325
1326 static bool
1327 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1328 {
1329         int hint;
1330
1331         if (!intel_engine_has_semaphores(engine))
1332                 return false;
1333
1334         if (list_is_last(&rq->sched.link, &engine->active.requests))
1335                 return false;
1336
1337         hint = max(rq_prio(list_next_entry(rq, sched.link)),
1338                    engine->execlists.queue_priority_hint);
1339
1340         return hint >= effective_prio(rq);
1341 }
1342
1343 static int
1344 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1345 {
1346         if (list_is_last(&rq->sched.link, &engine->active.requests))
1347                 return INT_MIN;
1348
1349         return rq_prio(list_next_entry(rq, sched.link));
1350 }
1351
1352 static bool
1353 enable_timeslice(const struct intel_engine_execlists *execlists)
1354 {
1355         const struct i915_request *rq = *execlists->active;
1356
1357         if (i915_request_completed(rq))
1358                 return false;
1359
1360         return execlists->switch_priority_hint >= effective_prio(rq);
1361 }
1362
1363 static void record_preemption(struct intel_engine_execlists *execlists)
1364 {
1365         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1366 }
1367
1368 static void execlists_dequeue(struct intel_engine_cs *engine)
1369 {
1370         struct intel_engine_execlists * const execlists = &engine->execlists;
1371         struct i915_request **port = execlists->pending;
1372         struct i915_request ** const last_port = port + execlists->port_mask;
1373         struct i915_request *last;
1374         struct rb_node *rb;
1375         bool submit = false;
1376
1377         /*
1378          * Hardware submission is through 2 ports. Conceptually each port
1379          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1380          * static for a context, and unique to each, so we only execute
1381          * requests belonging to a single context from each ring. RING_HEAD
1382          * is maintained by the CS in the context image, it marks the place
1383          * where it got up to last time, and through RING_TAIL we tell the CS
1384          * where we want to execute up to this time.
1385          *
1386          * In this list the requests are in order of execution. Consecutive
1387          * requests from the same context are adjacent in the ringbuffer. We
1388          * can combine these requests into a single RING_TAIL update:
1389          *
1390          *              RING_HEAD...req1...req2
1391          *                                    ^- RING_TAIL
1392          * since to execute req2 the CS must first execute req1.
1393          *
1394          * Our goal then is to point each port at the end of a consecutive
1395          * sequence of requests, as that is the optimal (fewest wake-ups
1396          * and context switches) submission.
1397          */
1398
1399         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1400                 struct virtual_engine *ve =
1401                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1402                 struct i915_request *rq = READ_ONCE(ve->request);
1403
1404                 if (!rq) { /* lazily cleanup after another engine handled rq */
1405                         rb_erase_cached(rb, &execlists->virtual);
1406                         RB_CLEAR_NODE(rb);
1407                         rb = rb_first_cached(&execlists->virtual);
1408                         continue;
1409                 }
1410
1411                 if (!virtual_matches(ve, rq, engine)) {
1412                         rb = rb_next(rb);
1413                         continue;
1414                 }
1415
1416                 break;
1417         }
1418
1419         /*
1420          * If the queue is higher priority than the last
1421          * request in the currently active context, submit afresh.
1422          * We will resubmit again afterwards in case we need to split
1423          * the active context to interject the preemption request,
1424          * i.e. we will retrigger preemption following the ack in case
1425          * of trouble.
1426          */
1427         last = last_active(execlists);
1428         if (last) {
1429                 if (need_preempt(engine, last, rb)) {
1430                         GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
1431                                   engine->name,
1432                                   last->fence.context,
1433                                   last->fence.seqno,
1434                                   last->sched.attr.priority,
1435                                   execlists->queue_priority_hint);
1436                         record_preemption(execlists);
1437
1438                         /*
1439                          * Don't let the RING_HEAD advance past the breadcrumb
1440                          * as we unwind (and until we resubmit) so that we do
1441                          * not accidentally tell it to go backwards.
1442                          */
1443                         ring_set_paused(engine, 1);
1444
1445                         /*
1446                          * Note that we have not stopped the GPU at this point,
1447                          * so we are unwinding the incomplete requests as they
1448                          * remain inflight and so by the time we do complete
1449                          * the preemption, some of the unwound requests may
1450                          * complete!
1451                          */
1452                         __unwind_incomplete_requests(engine);
1453
1454                         /*
1455                          * If we need to return to the preempted context, we
1456                          * need to skip the lite-restore and force it to
1457                          * reload the RING_TAIL. Otherwise, the HW has a
1458                          * tendency to ignore us rewinding the TAIL to the
1459                          * end of an earlier request.
1460                          */
1461                         last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1462                         last = NULL;
1463                 } else if (need_timeslice(engine, last) &&
1464                            !timer_pending(&engine->execlists.timer)) {
1465                         GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
1466                                   engine->name,
1467                                   last->fence.context,
1468                                   last->fence.seqno,
1469                                   last->sched.attr.priority,
1470                                   execlists->queue_priority_hint);
1471
1472                         ring_set_paused(engine, 1);
1473                         defer_active(engine);
1474
1475                         /*
1476                          * Unlike for preemption, if we rewind and continue
1477                          * executing the same context as previously active,
1478                          * the order of execution will remain the same and
1479                          * the tail will only advance. We do not need to
1480                          * force a full context restore, as a lite-restore
1481                          * is sufficient to resample the monotonic TAIL.
1482                          *
1483                          * If we switch to any other context, similarly we
1484          * will not rewind the TAIL of the current context, and
1485                          * normal save/restore will preserve state and allow
1486                          * us to later continue executing the same request.
1487                          */
1488                         last = NULL;
1489                 } else {
1490                         /*
1491                          * Otherwise if we already have a request pending
1492                          * for execution after the current one, we can
1493                          * just wait until the next CS event before
1494                          * queuing more. In either case we will force a
1495                          * lite-restore preemption event, but if we wait
1496                          * we hopefully coalesce several updates into a single
1497                          * submission.
1498                          */
1499                         if (!list_is_last(&last->sched.link,
1500                                           &engine->active.requests))
1501                                 return;
1502
1503                         /*
1504                          * WaIdleLiteRestore:bdw,skl
1505                          * Apply the wa NOOPs to prevent
1506                          * ring:HEAD == rq:TAIL as we resubmit the
1507                          * request. See gen8_emit_fini_breadcrumb() for
1508                          * where we prepare the padding after the
1509                          * end of the request.
1510                          */
1511                         last->tail = last->wa_tail;
1512                 }
1513         }
1514
1515         while (rb) { /* XXX virtual is always taking precedence */
1516                 struct virtual_engine *ve =
1517                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1518                 struct i915_request *rq;
1519
1520                 spin_lock(&ve->base.active.lock);
1521
1522                 rq = ve->request;
1523                 if (unlikely(!rq)) { /* lost the race to a sibling */
1524                         spin_unlock(&ve->base.active.lock);
1525                         rb_erase_cached(rb, &execlists->virtual);
1526                         RB_CLEAR_NODE(rb);
1527                         rb = rb_first_cached(&execlists->virtual);
1528                         continue;
1529                 }
1530
1531                 GEM_BUG_ON(rq != ve->request);
1532                 GEM_BUG_ON(rq->engine != &ve->base);
1533                 GEM_BUG_ON(rq->hw_context != &ve->context);
1534
1535                 if (rq_prio(rq) >= queue_prio(execlists)) {
1536                         if (!virtual_matches(ve, rq, engine)) {
1537                                 spin_unlock(&ve->base.active.lock);
1538                                 rb = rb_next(rb);
1539                                 continue;
1540                         }
1541
1542                         if (last && !can_merge_rq(last, rq)) {
1543                                 spin_unlock(&ve->base.active.lock);
1544                                 return; /* leave this for another */
1545                         }
1546
1547                         GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
1548                                   engine->name,
1549                                   rq->fence.context,
1550                                   rq->fence.seqno,
1551                                   i915_request_completed(rq) ? "!" :
1552                                   i915_request_started(rq) ? "*" :
1553                                   "",
1554                                   yesno(engine != ve->siblings[0]));
1555
1556                         ve->request = NULL;
1557                         ve->base.execlists.queue_priority_hint = INT_MIN;
1558                         rb_erase_cached(rb, &execlists->virtual);
1559                         RB_CLEAR_NODE(rb);
1560
1561                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1562                         rq->engine = engine;
1563
1564                         if (engine != ve->siblings[0]) {
1565                                 u32 *regs = ve->context.lrc_reg_state;
1566                                 unsigned int n;
1567
1568                                 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1569
1570                                 if (!intel_engine_has_relative_mmio(engine))
1571                                         virtual_update_register_offsets(regs,
1572                                                                         engine);
1573
1574                                 if (!list_empty(&ve->context.signals))
1575                                         virtual_xfer_breadcrumbs(ve, engine);
1576
1577                                 /*
1578                                  * Move the bound engine to the top of the list
1579                                  * for future execution. We then kick this
1580                                  * tasklet first before checking others, so that
1581                                  * we preferentially reuse this set of bound
1582                                  * registers.
1583                                  */
1584                                 for (n = 1; n < ve->num_siblings; n++) {
1585                                         if (ve->siblings[n] == engine) {
1586                                                 swap(ve->siblings[n],
1587                                                      ve->siblings[0]);
1588                                                 break;
1589                                         }
1590                                 }
1591
1592                                 GEM_BUG_ON(ve->siblings[0] != engine);
1593                         }
1594
1595                         if (__i915_request_submit(rq)) {
1596                                 submit = true;
1597                                 last = rq;
1598                         }
1599                         i915_request_put(rq);
1600
1601                         /*
1602                          * Hmm, we have a bunch of virtual engine requests,
1603                          * but the first one was already completed (thanks
1604                          * preempt-to-busy!). Keep looking at the veng queue
1605                          * until we have no more relevant requests (i.e.
1606                          * the normal submit queue has higher priority).
1607                          */
1608                         if (!submit) {
1609                                 spin_unlock(&ve->base.active.lock);
1610                                 rb = rb_first_cached(&execlists->virtual);
1611                                 continue;
1612                         }
1613                 }
1614
1615                 spin_unlock(&ve->base.active.lock);
1616                 break;
1617         }
1618
1619         while ((rb = rb_first_cached(&execlists->queue))) {
1620                 struct i915_priolist *p = to_priolist(rb);
1621                 struct i915_request *rq, *rn;
1622                 int i;
1623
1624                 priolist_for_each_request_consume(rq, rn, p, i) {
1625                         bool merge = true;
1626
1627                         /*
1628                          * Can we combine this request with the current port?
1629                          * It has to be the same context/ringbuffer and not
1630                          * have any exceptions (e.g. GVT saying never to
1631                          * combine contexts).
1632                          *
1633                          * If we can combine the requests, we can execute both
1634                          * by updating the RING_TAIL to point to the end of the
1635                          * second request, and so we never need to tell the
1636                          * hardware about the first.
1637                          */
1638                         if (last && !can_merge_rq(last, rq)) {
1639                                 /*
1640                                  * If we are on the second port and cannot
1641                                  * combine this request with the last, then we
1642                                  * are done.
1643                                  */
1644                                 if (port == last_port)
1645                                         goto done;
1646
1647                                 /*
1648                                  * We must not populate both ELSP[] with the
1649                                  * same LRCA, i.e. we must submit 2 different
1650                                  * contexts if we submit 2 ELSP.
1651                                  */
1652                                 if (last->hw_context == rq->hw_context)
1653                                         goto done;
1654
1655                                 /*
1656                                  * If GVT overrides us we only ever submit
1657                                  * port[0], leaving port[1] empty. Note that we
1658                                  * also have to be careful that we don't queue
1659                                  * the same context (even though a different
1660                                  * request) to the second port.
1661                                  */
1662                                 if (ctx_single_port_submission(last->hw_context) ||
1663                                     ctx_single_port_submission(rq->hw_context))
1664                                         goto done;
1665
1666                                 merge = false;
1667                         }
1668
1669                         if (__i915_request_submit(rq)) {
1670                                 if (!merge) {
1671                                         *port = execlists_schedule_in(last, port - execlists->pending);
1672                                         port++;
1673                                         last = NULL;
1674                                 }
1675
1676                                 GEM_BUG_ON(last &&
1677                                            !can_merge_ctx(last->hw_context,
1678                                                           rq->hw_context));
1679
1680                                 submit = true;
1681                                 last = rq;
1682                         }
1683                 }
1684
1685                 rb_erase_cached(&p->node, &execlists->queue);
1686                 i915_priolist_free(p);
1687         }
1688
1689 done:
1690         /*
1691          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1692          *
1693          * We choose the priority hint such that if we add a request of greater
1694          * priority than this, we kick the submission tasklet to decide on
1695          * the right order of submitting the requests to hardware. We must
1696          * also be prepared to reorder requests as they are in-flight on the
1697          * HW. We derive the priority hint then as the first "hole" in
1698          * the HW submission ports and if there are no available slots,
1699          * the priority of the lowest executing request, i.e. last.
1700          *
1701          * When we do receive a higher priority request ready to run from the
1702          * user, see queue_request(), the priority hint is bumped to that
1703          * request triggering preemption on the next dequeue (or subsequent
1704          * interrupt for secondary ports).
1705          */
1706         execlists->queue_priority_hint = queue_prio(execlists);
1707         GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
1708                   engine->name, execlists->queue_priority_hint,
1709                   yesno(submit));
1710
1711         if (submit) {
1712                 *port = execlists_schedule_in(last, port - execlists->pending);
1713                 execlists->switch_priority_hint =
1714                         switch_prio(engine, *execlists->pending);
1715
1716                 /*
1717                  * Skip if we ended up with exactly the same set of requests,
1718                  * e.g. trying to timeslice a pair of ordered contexts
1719                  */
1720                 if (!memcmp(execlists->active, execlists->pending,
1721                             (port - execlists->pending + 1) * sizeof(*port))) {
1722                         do
1723                                 execlists_schedule_out(fetch_and_zero(port));
1724                         while (port-- != execlists->pending);
1725
1726                         goto skip_submit;
1727                 }
1728
1729                 memset(port + 1, 0, (last_port - port) * sizeof(*port));
1730                 execlists_submit_ports(engine);
1731         } else {
1732 skip_submit:
1733                 ring_set_paused(engine, 0);
1734         }
1735 }
1736
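/*
 * Drop our references to any requests still sitting in the pending[] and
 * active ports (e.g. after a reset), clearing both arrays and pointing
 * execlists->active back at the emptied inflight[] array.
 */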
1737 static void
1738 cancel_port_requests(struct intel_engine_execlists * const execlists)
1739 {
1740         struct i915_request * const *port, *rq;
1741
1742         for (port = execlists->pending; (rq = *port); port++)
1743                 execlists_schedule_out(rq);
1744         memset(execlists->pending, 0, sizeof(execlists->pending));
1745
1746         for (port = execlists->active; (rq = *port); port++)
1747                 execlists_schedule_out(rq);
1748         execlists->active =
1749                 memset(execlists->inflight, 0, sizeof(execlists->inflight));
1750 }
1751
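/*
 * Evict the cachelines holding the CSB entries so that the next read
 * observes fresh data written by the HW rather than stale cached values.
 */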
1752 static inline void
1753 invalidate_csb_entries(const u32 *first, const u32 *last)
1754 {
1755         clflush((void *)first);
1756         clflush((void *)last);
1757 }
1758
1759 static inline bool
1760 reset_in_progress(const struct intel_engine_execlists *execlists)
1761 {
1762         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1763 }
1764
1765 /*
1766  * Starting with Gen12, the status has a new format:
1767  *
1768  *     bit  0:     switched to new queue
1769  *     bit  1:     reserved
1770  *     bit  2:     semaphore wait mode (poll or signal), only valid when
1771  *                 switch detail is set to "wait on semaphore"
1772  *     bits 3-5:   engine class
1773  *     bits 6-11:  engine instance
1774  *     bits 12-14: reserved
1775  *     bits 15-25: sw context id of the lrc the GT switched to
1776  *     bits 26-31: sw counter of the lrc the GT switched to
1777  *     bits 32-35: context switch detail
1778  *                  - 0: ctx complete
1779  *                  - 1: wait on sync flip
1780  *                  - 2: wait on vblank
1781  *                  - 3: wait on scanline
1782  *                  - 4: wait on semaphore
1783  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
1784  *                       WAIT_FOR_EVENT)
1785  *     bit  36:    reserved
1786  *     bits 37-43: wait detail (for switch detail 1 to 4)
1787  *     bits 44-46: reserved
1788  *     bits 47-57: sw context id of the lrc the GT switched away from
1789  *     bits 58-63: sw counter of the lrc the GT switched away from
1790  */
1791 static inline bool
1792 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1793 {
1794         u32 lower_dw = csb[0];
1795         u32 upper_dw = csb[1];
1796         bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
1797         bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
1798         bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
1799
1800         /*
1801          * The context switch detail is not guaranteed to be 5 when a preemption
1802          * occurs, so we can't just check for that. The check below works for
1803          * all the cases we care about, including preemptions of WAIT
1804          * instructions and lite-restore. Preempt-to-idle via the CTRL register
1805          * would require some extra handling, but we don't support that.
1806          */
1807         if (!ctx_away_valid || new_queue) {
1808                 GEM_BUG_ON(!ctx_to_valid);
1809                 return true;
1810         }
1811
1812         /*
1813          * switch detail = 5 is covered by the case above and we do not expect a
1814          * context switch on an unsuccessful wait instruction since we always
1815          * use polling mode.
1816          */
1817         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
1818         return false;
1819 }
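
/*
 * Illustrative sketch only (not used by the driver): given the Gen12 CSB
 * layout documented above, the sw context id of the lrc the GT switched to
 * lives in bits 15-25 of the lower dword, and the switch detail in bits
 * 32-35, i.e. bits 0-3 of the upper dword (cf. GEN12_CTX_SWITCH_DETAIL).
 * The helper below is a hypothetical example of extracting the former;
 * gen12_csb_parse() above only consumes the valid bits, the new-queue flag
 * and the switch detail via the existing macros.
 */
static inline u32 gen12_csb_to_ctx_id_example(const u32 *csb)
{
        return (csb[0] >> 15) & GENMASK(10, 0); /* bits 15-25: sw ctx id */
}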
1820
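/*
 * Prior to Gen12 the status dword flags the interesting events directly:
 * any idle->active transition or preemption means the pending ELSP
 * submission has been promoted to the active ports.
 */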
1821 static inline bool
1822 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1823 {
1824         return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
1825 }
1826
1827 static void process_csb(struct intel_engine_cs *engine)
1828 {
1829         struct intel_engine_execlists * const execlists = &engine->execlists;
1830         const u32 * const buf = execlists->csb_status;
1831         const u8 num_entries = execlists->csb_size;
1832         u8 head, tail;
1833
1834         GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915));
1835
1836         /*
1837          * Note that csb_write, csb_status may be either in HWSP or mmio.
1838          * When reading from the csb_write mmio register, we have to be
1839          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
1840          * the low 4 bits. As it happens we know the next 4 bits are always
1841          * zero and so we can simply mask off the low u8 of the register
1842          * and treat it identically to reading from the HWSP (without having
1843          * to use explicit shifting and masking, and probably bifurcating
1844          * the code to handle the legacy mmio read).
1845          */
1846         head = execlists->csb_head;
1847         tail = READ_ONCE(*execlists->csb_write);
1848         GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
1849         if (unlikely(head == tail))
1850                 return;
1851
1852         /*
1853          * Hopefully paired with a wmb() in HW!
1854          *
1855          * We must complete the read of the write pointer before any reads
1856          * from the CSB, so that we do not see stale values. Without an rmb
1857          * (lfence) the HW may speculatively perform the CSB[] reads *before*
1858          * we perform the READ_ONCE(*csb_write).
1859          */
1860         rmb();
1861
1862         do {
1863                 bool promote;
1864
1865                 if (++head == num_entries)
1866                         head = 0;
1867
1868                 /*
1869                  * We are flying near dragons again.
1870                  *
1871                  * We hold a reference to the request in execlist_port[]
1872                  * but no more than that. We are operating in softirq
1873          * context and so cannot hold any mutex or sleep. That means
1874          * we cannot stop the requests we are processing in port[]
1875          * from being retired underneath us (the breadcrumb will be
1876          * complete before we see the
1877                  * context-switch). As we only hold the reference to the
1878                  * request, any pointer chasing underneath the request
1879                  * is subject to a potential use-after-free. Thus we
1880                  * store all of the bookkeeping within port[] as
1881                  * required, and avoid using unguarded pointers beneath
1882                  * request itself. The same applies to the atomic
1883                  * status notifier.
1884                  */
1885
1886                 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
1887                           engine->name, head,
1888                           buf[2 * head + 0], buf[2 * head + 1]);
1889
1890                 if (INTEL_GEN(engine->i915) >= 12)
1891                         promote = gen12_csb_parse(execlists, buf + 2 * head);
1892                 else
1893                         promote = gen8_csb_parse(execlists, buf + 2 * head);
1894                 if (promote) {
1895                         /* cancel old inflight, prepare for switch */
1896                         trace_ports(execlists, "preempted", execlists->active);
1897                         while (*execlists->active)
1898                                 execlists_schedule_out(*execlists->active++);
1899
1900                         /* switch pending to inflight */
1901                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
1902                         execlists->active =
1903                                 memcpy(execlists->inflight,
1904                                        execlists->pending,
1905                                        execlists_num_ports(execlists) *
1906                                        sizeof(*execlists->pending));
1907
1908                         if (enable_timeslice(execlists))
1909                                 mod_timer(&execlists->timer, jiffies + 1);
1910
1911                         if (!inject_preempt_hang(execlists))
1912                                 ring_set_paused(engine, 0);
1913
1914                         WRITE_ONCE(execlists->pending[0], NULL);
1915                 } else {
1916                         GEM_BUG_ON(!*execlists->active);
1917
1918                         /* port0 completed, advanced to port1 */
1919                         trace_ports(execlists, "completed", execlists->active);
1920
1921                         /*
1922                          * We rely on the hardware being strongly
1923                          * ordered, that the breadcrumb write is
1924                          * coherent (visible from the CPU) before the
1925                          * user interrupt and CSB is processed.
1926                          */
1927                         GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
1928                                    !reset_in_progress(execlists));
1929                         execlists_schedule_out(*execlists->active++);
1930
1931                         GEM_BUG_ON(execlists->active - execlists->inflight >
1932                                    execlists_num_ports(execlists));
1933                 }
1934         } while (head != tail);
1935
1936         execlists->csb_head = head;
1937
1938         /*
1939          * Gen11 has proven to fail wrt the global observation point
1940          * between entry and tail update, breaking the ordering and thus
1941          * we see an old entry in the context status buffer.
1942          *
1943          * Forcibly evict the entries for the next gpu csb update, to
1944          * increase the odds that we get fresh entries even with
1945          * non-working hardware. The cost of doing so comes out mostly
1946          * in the wash as the hardware, working or not, will need to do
1947          * the invalidation beforehand.
1948          */
1949         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
1950 }
1951
1952 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
1953 {
1954         lockdep_assert_held(&engine->active.lock);
1955         if (!engine->execlists.pending[0]) {
1956                 rcu_read_lock(); /* protect peeking at execlists->active */
1957                 execlists_dequeue(engine);
1958                 rcu_read_unlock();
1959         }
1960 }
1961
1962 /*
1963  * Check the unread Context Status Buffers and manage the submission of new
1964  * contexts to the ELSP accordingly.
1965  */
1966 static void execlists_submission_tasklet(unsigned long data)
1967 {
1968         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
1969         unsigned long flags;
1970
1971         process_csb(engine);
1972         if (!READ_ONCE(engine->execlists.pending[0])) {
1973                 spin_lock_irqsave(&engine->active.lock, flags);
1974                 __execlists_submission_tasklet(engine);
1975                 spin_unlock_irqrestore(&engine->active.lock, flags);
1976         }
1977 }
1978
1979 static void execlists_submission_timer(struct timer_list *timer)
1980 {
1981         struct intel_engine_cs *engine =
1982                 from_timer(engine, timer, execlists.timer);
1983
1984         /* Kick the tasklet for some interrupt coalescing and reset handling */
1985         tasklet_hi_schedule(&engine->execlists.tasklet);
1986 }
1987
1988 static void queue_request(struct intel_engine_cs *engine,
1989                           struct i915_sched_node *node,
1990                           int prio)
1991 {
1992         GEM_BUG_ON(!list_empty(&node->link));
1993         list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
1994 }
1995
1996 static void __submit_queue_imm(struct intel_engine_cs *engine)
1997 {
1998         struct intel_engine_execlists * const execlists = &engine->execlists;
1999
2000         if (reset_in_progress(execlists))
2001                 return; /* defer until we restart the engine following reset */
2002
2003         if (execlists->tasklet.func == execlists_submission_tasklet)
2004                 __execlists_submission_tasklet(engine);
2005         else
2006                 tasklet_hi_schedule(&execlists->tasklet);
2007 }
2008
2009 static void submit_queue(struct intel_engine_cs *engine,
2010                          const struct i915_request *rq)
2011 {
2012         struct intel_engine_execlists *execlists = &engine->execlists;
2013
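        /*
         * Only kick the submission if the new request may affect what we
         * have already decided to run, i.e. its priority exceeds the
         * current queue hint.
         */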
2014         if (rq_prio(rq) <= execlists->queue_priority_hint)
2015                 return;
2016
2017         execlists->queue_priority_hint = rq_prio(rq);
2018         __submit_queue_imm(engine);
2019 }
2020
2021 static void execlists_submit_request(struct i915_request *request)
2022 {
2023         struct intel_engine_cs *engine = request->engine;
2024         unsigned long flags;
2025
2026         /* Will be called from irq-context when using foreign fences. */
2027         spin_lock_irqsave(&engine->active.lock, flags);
2028
2029         queue_request(engine, &request->sched, rq_prio(request));
2030
2031         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2032         GEM_BUG_ON(list_empty(&request->sched.link));
2033
2034         submit_queue(engine, request);
2035
2036         spin_unlock_irqrestore(&engine->active.lock, flags);
2037 }
2038
2039 static void __execlists_context_fini(struct intel_context *ce)
2040 {
2041         intel_ring_put(ce->ring);
2042         i915_vma_put(ce->state);
2043 }
2044
2045 static void execlists_context_destroy(struct kref *kref)
2046 {
2047         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2048
2049         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2050         GEM_BUG_ON(intel_context_is_pinned(ce));
2051
2052         if (ce->state)
2053                 __execlists_context_fini(ce);
2054
2055         intel_context_fini(ce);
2056         intel_context_free(ce);
2057 }
2058
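/*
 * Under CONFIG_DRM_I915_DEBUG_GEM we poison the page following the context
 * image so that check_redzone() can detect anything scribbling past the end
 * of the context allocation when the context is unpinned.
 */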
2059 static void
2060 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2061 {
2062         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2063                 return;
2064
2065         vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
2066         vaddr += engine->context_size;
2067
2068         memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
2069 }
2070
2071 static void
2072 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2073 {
2074         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2075                 return;
2076
2077         vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
2078         vaddr += engine->context_size;
2079
2080         if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
2081                 dev_err_once(engine->i915->drm.dev,
2082                              "%s context redzone overwritten!\n",
2083                              engine->name);
2084 }
2085
2086 static void execlists_context_unpin(struct intel_context *ce)
2087 {
2088         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2089                       ce->engine);
2090
2091         i915_gem_object_unpin_map(ce->state->obj);
2092         intel_ring_reset(ce->ring, ce->ring->tail);
2093 }
2094
2095 static void
2096 __execlists_update_reg_state(const struct intel_context *ce,
2097                              const struct intel_engine_cs *engine)
2098 {
2099         struct intel_ring *ring = ce->ring;
2100         u32 *regs = ce->lrc_reg_state;
2101
2102         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
2103         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2104
2105         regs[CTX_RING_BUFFER_START] = i915_ggtt_offset(ring->vma);
2106         regs[CTX_RING_HEAD] = ring->head;
2107         regs[CTX_RING_TAIL] = ring->tail;
2108
2109         /* RPCS */
2110         if (engine->class == RENDER_CLASS) {
2111                 regs[CTX_R_PWR_CLK_STATE] =
2112                         intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2113
2114                 i915_oa_init_reg_state(ce, engine);
2115         }
2116 }
2117
2118 static int
2119 __execlists_context_pin(struct intel_context *ce,
2120                         struct intel_engine_cs *engine)
2121 {
2122         void *vaddr;
2123         int ret;
2124
2125         GEM_BUG_ON(!ce->state);
2126
2127         ret = intel_context_active_acquire(ce);
2128         if (ret)
2129                 goto err;
2130         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2131
2132         vaddr = i915_gem_object_pin_map(ce->state->obj,
2133                                         i915_coherent_map_type(engine->i915) |
2134                                         I915_MAP_OVERRIDE);
2135         if (IS_ERR(vaddr)) {
2136                 ret = PTR_ERR(vaddr);
2137                 goto unpin_active;
2138         }
2139
2140         ce->lrc_desc = lrc_descriptor(ce, engine);
2141         ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2142         __execlists_update_reg_state(ce, engine);
2143
2144         return 0;
2145
2146 unpin_active:
2147         intel_context_active_release(ce);
2148 err:
2149         return ret;
2150 }
2151
2152 static int execlists_context_pin(struct intel_context *ce)
2153 {
2154         return __execlists_context_pin(ce, ce->engine);
2155 }
2156
2157 static int execlists_context_alloc(struct intel_context *ce)
2158 {
2159         return __execlists_context_alloc(ce, ce->engine);
2160 }
2161
2162 static void execlists_context_reset(struct intel_context *ce)
2163 {
2164         /*
2165          * Because we emit WA_TAIL_DWORDS there may be a disparity
2166          * between our bookkeeping in ce->ring->head and ce->ring->tail and
2167          * that stored in the context image. As we only write new commands
2168          * from ce->ring->tail onwards, everything before that is junk. If the
2169          * GPU starts reading its RING_HEAD from the context, it may try to
2170          * execute that junk and die.
2171          *
2172          * The contexts that are still pinned on resume belong to the
2173          * kernel, and are local to each engine. All other contexts will
2174          * have their head/tail sanitized upon pinning before use, so they
2175          * will never see garbage.
2176          *
2177          * So to avoid that we reset the context images upon resume. For
2178          * simplicity, we just zero everything out.
2179          */
2180         intel_ring_reset(ce->ring, 0);
2181         __execlists_update_reg_state(ce, ce->engine);
2182 }
2183
2184 static const struct intel_context_ops execlists_context_ops = {
2185         .alloc = execlists_context_alloc,
2186
2187         .pin = execlists_context_pin,
2188         .unpin = execlists_context_unpin,
2189
2190         .enter = intel_context_enter_engine,
2191         .exit = intel_context_exit_engine,
2192
2193         .reset = execlists_context_reset,
2194         .destroy = execlists_context_destroy,
2195 };
2196
2197 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2198 {
2199         u32 *cs;
2200
2201         GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2202
2203         cs = intel_ring_begin(rq, 6);
2204         if (IS_ERR(cs))
2205                 return PTR_ERR(cs);
2206
2207         /*
2208          * Check if we have been preempted before we even get started.
2209          *
2210          * After this point i915_request_started() reports true, even if
2211          * we get preempted and so are no longer running.
2212          */
2213         *cs++ = MI_ARB_CHECK;
2214         *cs++ = MI_NOOP;
2215
2216         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2217         *cs++ = i915_request_timeline(rq)->hwsp_offset;
2218         *cs++ = 0;
2219         *cs++ = rq->fence.seqno - 1;
2220
2221         intel_ring_advance(rq, cs);
2222
2223         /* Record the updated position of the request's payload */
2224         rq->infix = intel_ring_offset(rq, cs);
2225
2226         return 0;
2227 }
2228
2229 static int execlists_request_alloc(struct i915_request *request)
2230 {
2231         int ret;
2232
2233         GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
2234
2235         /*
2236          * Flush enough space to reduce the likelihood of waiting after
2237          * we start building the request - in which case we will just
2238          * have to repeat work.
2239          */
2240         request->reserved_space += EXECLISTS_REQUEST_SIZE;
2241
2242         /*
2243          * Note that after this point, we have committed to using
2244          * this request as it is being used to both track the
2245          * state of engine initialisation and liveness of the
2246          * golden renderstate above. Think twice before you try
2247          * to cancel/unwind this request now.
2248          */
2249
2250         /* Unconditionally invalidate GPU caches and TLBs. */
2251         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2252         if (ret)
2253                 return ret;
2254
2255         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2256         return 0;
2257 }
2258
2259 /*
2260  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
2261  * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
2262  * but there is a slight complication as this is applied in a WA batch where the
2263  * values are only initialized once, so we cannot take the register value at the
2264  * beginning and reuse it further; hence we save its value to memory, upload a
2265  * constant value with bit21 set and then restore it with the saved value.
2266  * To simplify the WA, a constant value is formed by using the default value
2267  * of this register. This shouldn't be a problem because we are only modifying
2268  * it for a short period and this batch is non-preemptible. We could of course
2269  * use additional instructions that read the actual value of the register
2270  * at that time and set our bit of interest, but that makes the WA complicated.
2271  *
2272  * This WA is also required for Gen9 so extracting as a function avoids
2273  * code duplication.
2274  */
2275 static u32 *
2276 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2277 {
2278         /* NB no one else is allowed to scribble over scratch + 256! */
2279         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2280         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2281         *batch++ = intel_gt_scratch_offset(engine->gt,
2282                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2283         *batch++ = 0;
2284
2285         *batch++ = MI_LOAD_REGISTER_IMM(1);
2286         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2287         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2288
2289         batch = gen8_emit_pipe_control(batch,
2290                                        PIPE_CONTROL_CS_STALL |
2291                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
2292                                        0);
2293
2294         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2295         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2296         *batch++ = intel_gt_scratch_offset(engine->gt,
2297                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2298         *batch++ = 0;
2299
2300         return batch;
2301 }
2302
2303 /*
2304  * Typically we only have one indirect_ctx and per_ctx batch buffer, which are
2305  * initialized at the beginning and shared across all contexts, but this field
2306  * helps us to have multiple batches at different offsets and select them based
2307  * on a criterion. At the moment this batch always starts at the beginning of the
2308  * page and at this point we don't have multiple wa_ctx batch buffers.
2309  *
2310  * The number of WAs applied is not known at the beginning; we use this field
2311  * to return the number of DWORDs written.
2312  *
2313  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
2314  * so it adds NOOPs as padding to make it cacheline aligned.
2315  * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
2316  * together make a complete batch buffer.
2317  */
2318 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2319 {
2320         /* WaDisableCtxRestoreArbitration:bdw,chv */
2321         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2322
2323         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2324         if (IS_BROADWELL(engine->i915))
2325                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2326
2327         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2328         /* Actual scratch location is at 128 bytes offset */
2329         batch = gen8_emit_pipe_control(batch,
2330                                        PIPE_CONTROL_FLUSH_L3 |
2331                                        PIPE_CONTROL_STORE_DATA_INDEX |
2332                                        PIPE_CONTROL_CS_STALL |
2333                                        PIPE_CONTROL_QW_WRITE,
2334                                        LRC_PPHWSP_SCRATCH_ADDR);
2335
2336         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2337
2338         /* Pad to end of cacheline */
2339         while ((unsigned long)batch % CACHELINE_BYTES)
2340                 *batch++ = MI_NOOP;
2341
2342         /*
2343          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2344          * execution depends on the length specified in terms of cache lines
2345          * in the register CTX_RCS_INDIRECT_CTX
2346          */
2347
2348         return batch;
2349 }
2350
2351 struct lri {
2352         i915_reg_t reg;
2353         u32 value;
2354 };
2355
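/*
 * Emit a single MI_LOAD_REGISTER_IMM packet programming @count register/value
 * pairs, with a trailing MI_NOOP so the packet occupies an even number of
 * dwords.
 */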
2356 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2357 {
2358         GEM_BUG_ON(!count || count > 63);
2359
2360         *batch++ = MI_LOAD_REGISTER_IMM(count);
2361         do {
2362                 *batch++ = i915_mmio_reg_offset(lri->reg);
2363                 *batch++ = lri->value;
2364         } while (lri++, --count);
2365         *batch++ = MI_NOOP;
2366
2367         return batch;
2368 }
2369
2370 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2371 {
2372         static const struct lri lri[] = {
2373                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2374                 {
2375                         COMMON_SLICE_CHICKEN2,
2376                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2377                                        0),
2378                 },
2379
2380                 /* BSpec: 11391 */
2381                 {
2382                         FF_SLICE_CHICKEN,
2383                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2384                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2385                 },
2386
2387                 /* BSpec: 11299 */
2388                 {
2389                         _3D_CHICKEN3,
2390                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2391                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2392                 }
2393         };
2394
2395         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2396
2397         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2398         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2399
2400         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2401
2402         /* WaMediaPoolStateCmdInWABB:bxt,glk */
2403         if (HAS_POOLED_EU(engine->i915)) {
2404                 /*
2405                  * EU pool configuration is set up along with the golden
2406                  * context during context initialization. This value depends
2407                  * on the device type (2x6 or 3x6) and needs to be updated
2408                  * based on which subslice is disabled, especially for 2x6
2409                  * devices. However, it is safe to load the default
2410                  * configuration of a 3x6 device instead of masking off the
2411                  * corresponding bits, because the HW ignores the bits of a
2412                  * disabled subslice and drops down to the appropriate config.
2413                  * Please see render_state_setup() in i915_gem_render_state.c
2414                  * for the possible configurations; to avoid duplication they
2415                  * are not shown here again.
2416                  */
2417                 *batch++ = GEN9_MEDIA_POOL_STATE;
2418                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
2419                 *batch++ = 0x00777000;
2420                 *batch++ = 0;
2421                 *batch++ = 0;
2422                 *batch++ = 0;
2423         }
2424
2425         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2426
2427         /* Pad to end of cacheline */
2428         while ((unsigned long)batch % CACHELINE_BYTES)
2429                 *batch++ = MI_NOOP;
2430
2431         return batch;
2432 }
2433
2434 static u32 *
2435 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2436 {
2437         int i;
2438
2439         /*
2440          * WaPipeControlBefore3DStateSamplePattern: cnl
2441          *
2442          * Ensure the engine is idle prior to programming a
2443          * 3DSTATE_SAMPLE_PATTERN during a context restore.
2444          */
2445         batch = gen8_emit_pipe_control(batch,
2446                                        PIPE_CONTROL_CS_STALL,
2447                                        0);
2448         /*
2449          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2450          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2451          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2452          * confusing. Since gen8_emit_pipe_control() already advances the
2453          * batch by 6 dwords, we advance the other 10 here, completing a
2454          * cacheline. It's not clear if the workaround requires this padding
2455          * before other commands, or if it's just the regular padding we would
2456          * already have for the workaround bb, so leave it here for now.
2457          */
2458         for (i = 0; i < 10; i++)
2459                 *batch++ = MI_NOOP;
2460
2461         /* Pad to end of cacheline */
2462         while ((unsigned long)batch % CACHELINE_BYTES)
2463                 *batch++ = MI_NOOP;
2464
2465         return batch;
2466 }
2467
2468 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2469
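/*
 * Allocate a page-sized shmem object to back the per-engine workaround
 * batch buffers and pin it high in the global GTT.
 */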
2470 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2471 {
2472         struct drm_i915_gem_object *obj;
2473         struct i915_vma *vma;
2474         int err;
2475
2476         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2477         if (IS_ERR(obj))
2478                 return PTR_ERR(obj);
2479
2480         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2481         if (IS_ERR(vma)) {
2482                 err = PTR_ERR(vma);
2483                 goto err;
2484         }
2485
2486         err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2487         if (err)
2488                 goto err;
2489
2490         engine->wa_ctx.vma = vma;
2491         return 0;
2492
2493 err:
2494         i915_gem_object_put(obj);
2495         return err;
2496 }
2497
2498 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2499 {
2500         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2501 }
2502
2503 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2504
2505 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2506 {
2507         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2508         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2509                                             &wa_ctx->per_ctx };
2510         wa_bb_func_t wa_bb_fn[2];
2511         struct page *page;
2512         void *batch, *batch_ptr;
2513         unsigned int i;
2514         int ret;
2515
2516         if (engine->class != RENDER_CLASS)
2517                 return 0;
2518
2519         switch (INTEL_GEN(engine->i915)) {
2520         case 12:
2521         case 11:
2522                 return 0;
2523         case 10:
2524                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
2525                 wa_bb_fn[1] = NULL;
2526                 break;
2527         case 9:
2528                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
2529                 wa_bb_fn[1] = NULL;
2530                 break;
2531         case 8:
2532                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
2533                 wa_bb_fn[1] = NULL;
2534                 break;
2535         default:
2536                 MISSING_CASE(INTEL_GEN(engine->i915));
2537                 return 0;
2538         }
2539
2540         ret = lrc_setup_wa_ctx(engine);
2541         if (ret) {
2542                 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2543                 return ret;
2544         }
2545
2546         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2547         batch = batch_ptr = kmap_atomic(page);
2548
2549         /*
2550          * Emit the two workaround batch buffers, recording the offset from the
2551          * start of the workaround batch buffer object for each and their
2552          * respective sizes.
2553          */
2554         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2555                 wa_bb[i]->offset = batch_ptr - batch;
2556                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2557                                                   CACHELINE_BYTES))) {
2558                         ret = -EINVAL;
2559                         break;
2560                 }
2561                 if (wa_bb_fn[i])
2562                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2563                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2564         }
2565
2566         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2567
2568         kunmap_atomic(batch);
2569         if (ret)
2570                 lrc_destroy_wa_ctx(engine);
2571
2572         return ret;
2573 }
2574
2575 static void enable_execlists(struct intel_engine_cs *engine)
2576 {
2577         u32 mode;
2578
2579         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2580
2581         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2582
2583         if (INTEL_GEN(engine->i915) >= 11)
2584                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2585         else
2586                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2587         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2588
2589         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2590
2591         ENGINE_WRITE_FW(engine,
2592                         RING_HWS_PGA,
2593                         i915_ggtt_offset(engine->status_page.vma));
2594         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2595 }
2596
2597 static bool unexpected_starting_state(struct intel_engine_cs *engine)
2598 {
2599         bool unexpected = false;
2600
2601         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2602                 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2603                 unexpected = true;
2604         }
2605
2606         return unexpected;
2607 }
2608
2609 static int execlists_resume(struct intel_engine_cs *engine)
2610 {
2611         intel_engine_apply_workarounds(engine);
2612         intel_engine_apply_whitelist(engine);
2613
2614         intel_mocs_init_engine(engine);
2615
2616         intel_engine_reset_breadcrumbs(engine);
2617
2618         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2619                 struct drm_printer p = drm_debug_printer(__func__);
2620
2621                 intel_engine_dump(engine, &p, NULL);
2622         }
2623
2624         enable_execlists(engine);
2625
2626         return 0;
2627 }
2628
2629 static void execlists_reset_prepare(struct intel_engine_cs *engine)
2630 {
2631         struct intel_engine_execlists * const execlists = &engine->execlists;
2632         unsigned long flags;
2633
2634         GEM_TRACE("%s: depth<-%d\n", engine->name,
2635                   atomic_read(&execlists->tasklet.count));
2636
2637         /*
2638          * Prevent request submission to the hardware until we have
2639          * completed the reset in i915_gem_reset_finish(). If a request
2640          * is completed by one engine, it may then queue a request
2641          * to a second via its execlists->tasklet *just* as we are
2642          * calling engine->resume() and also writing the ELSP.
2643          * Turning off the execlists->tasklet until the reset is over
2644          * prevents the race.
2645          */
2646         __tasklet_disable_sync_once(&execlists->tasklet);
2647         GEM_BUG_ON(!reset_in_progress(execlists));
2648
2649         /* And flush any current direct submission. */
2650         spin_lock_irqsave(&engine->active.lock, flags);
2651         spin_unlock_irqrestore(&engine->active.lock, flags);
2652
2653         /*
2654          * We stop the engines, otherwise we might get a failed reset and
2655          * a dead gpu (on elk). Even a gpu as modern as kbl can suffer a
2656          * system hang if a batchbuffer is progressing when the reset is
2657          * issued, regardless of the READY_TO_RESET ack. Thus we assume
2658          * it is best to stop the engines on all gens where we have a
2659          * gpu reset.
2660          *
2661          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2662          *
2663          * FIXME: Wa for more modern gens needs to be validated
2664          */
2665         intel_engine_stop_cs(engine);
2666 }
2667
2668 static void reset_csb_pointers(struct intel_engine_cs *engine)
2669 {
2670         struct intel_engine_execlists * const execlists = &engine->execlists;
2671         const unsigned int reset_value = execlists->csb_size - 1;
2672
2673         ring_set_paused(engine, 0);
2674
2675         /*
2676          * After a reset, the HW starts writing into CSB entry [0]. We
2677          * therefore have to set our HEAD pointer back one entry so that
2678          * the *first* entry we check is entry 0. To complicate this further,
2679          * as we don't wait for the first interrupt after reset, we have to
2680          * fake the HW write to point back to the last entry so that our
2681          * inline comparison of our cached head position against the last HW
2682          * write works even before the first interrupt.
2683          */
2684         execlists->csb_head = reset_value;
2685         WRITE_ONCE(*execlists->csb_write, reset_value);
2686         wmb(); /* Make sure this is visible to HW (paranoia?) */
2687
2688         invalidate_csb_entries(&execlists->csb_status[0],
2689                                &execlists->csb_status[reset_value]);
2690 }
2691
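/*
 * Walk backwards along the timeline from rq to find the oldest incomplete
 * request still belonging to the same context: that is the request the
 * engine was actually executing when the reset was triggered.
 */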
2692 static struct i915_request *active_request(struct i915_request *rq)
2693 {
2694         const struct intel_context * const ce = rq->hw_context;
2695         struct i915_request *active = NULL;
2696         struct list_head *list;
2697
2698         if (!i915_request_is_active(rq)) /* unwound, but incomplete! */
2699                 return rq;
2700
2701         list = &i915_request_active_timeline(rq)->requests;
2702         list_for_each_entry_from_reverse(rq, list, link) {
2703                 if (i915_request_completed(rq))
2704                         break;
2705
2706                 if (rq->hw_context != ce)
2707                         break;
2708
2709                 active = rq;
2710         }
2711
2712         return active;
2713 }
2714
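/*
 * RING_MI_MODE in the context image is a masked register: the upper 16 bits
 * select which bits to update. Clear STOP_RING so that the engine resumes
 * execution once the context is restored after the reset.
 */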
2715 static void __execlists_reset_reg_state(const struct intel_context *ce,
2716                                         const struct intel_engine_cs *engine)
2717 {
2718         u32 *regs = ce->lrc_reg_state;
2719
2720         if (INTEL_GEN(engine->i915) >= 9) {
2721                 regs[GEN9_CTX_RING_MI_MODE + 1] &= ~STOP_RING;
2722                 regs[GEN9_CTX_RING_MI_MODE + 1] |= STOP_RING << 16;
2723         }
2724 }
2725
2726 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2727 {
2728         struct intel_engine_execlists * const execlists = &engine->execlists;
2729         struct intel_context *ce;
2730         struct i915_request *rq;
2731         u32 *regs;
2732
2733         mb(); /* paranoia: read the CSB pointers from after the reset */
2734         clflush(execlists->csb_write);
2735         mb();
2736
2737         process_csb(engine); /* drain preemption events */
2738
2739         /* Following the reset, we need to reload the CSB read/write pointers */
2740         reset_csb_pointers(engine);
2741
2742          * Save the currently executing context; even if we completed
2743          * its request, it was still running at the time of the
2744          * reset and will have been clobbered.
2745          * reset and will have been clobbered.
2746          */
2747         rq = execlists_active(execlists);
2748         if (!rq)
2749                 goto unwind;
2750
2751         /* We still have requests in-flight; the engine should be active */
2752         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
2753
2754         ce = rq->hw_context;
2755         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2756
2757         /* Proclaim we have exclusive access to the context image! */
2758         __context_pin_acquire(ce);
2759
2760         rq = active_request(rq);
2761         if (!rq) {
2762                 /* Idle context; tidy up the ring so we can restart afresh */
2763                 ce->ring->head = ce->ring->tail;
2764                 goto out_replay;
2765         }
2766
2767         /* Context has requests still in-flight; it should not be idle! */
2768         GEM_BUG_ON(i915_active_is_idle(&ce->active));
2769         ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
2770
2771         /*
2772          * If this request hasn't started yet, e.g. it is waiting on a
2773          * semaphore, we need to avoid skipping the request or else we
2774          * break the signaling chain. However, if the context is corrupt
2775          * the request will not restart and we will be stuck with a wedged
2776          * device. It is quite often the case that if we issue a reset
2777          * while the GPU is loading the context image, the context
2778          * image becomes corrupt.
2779          *
2780          * Otherwise, if we have not started yet, the request should replay
2781          * perfectly and we do not need to flag the result as being erroneous.
2782          */
2783         if (!i915_request_started(rq))
2784                 goto out_replay;
2785
2786         /*
2787          * If the request was innocent, we leave the request in the ELSP
2788          * and will try to replay it on restarting. The context image may
2789          * have been corrupted by the reset, in which case we may have
2790          * to service a new GPU hang, but more likely we can continue on
2791          * without impact.
2792          *
2793          * If the request was guilty, we presume the context is corrupt
2794          * and have to at least restore the RING register in the context
2795          * image back to the expected values to skip over the guilty request.
2796          */
2797         __i915_request_reset(rq, stalled);
2798         if (!stalled)
2799                 goto out_replay;
2800
2801         /*
2802          * We want a simple context + ring to execute the breadcrumb update.
2803          * We cannot rely on the context being intact across the GPU hang,
2804          * so clear it and rebuild just what we need for the breadcrumb.
2805          * All pending requests for this context will be zapped, and any
2806          * future request will be after userspace has had the opportunity
2807          * to recreate its own state.
2808          */
2809         GEM_BUG_ON(!intel_context_is_pinned(ce));
2810         regs = ce->lrc_reg_state;
2811         if (engine->pinned_default_state) {
2812                 memcpy(regs, /* skip restoring the vanilla PPHWSP */
2813                        engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
2814                        engine->context_size - PAGE_SIZE);
2815         }
2816         execlists_init_reg_state(regs, ce, engine, ce->ring, false);
2817
2818 out_replay:
2819         GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
2820                   engine->name, ce->ring->head, ce->ring->tail);
2821         intel_ring_update_space(ce->ring);
2822         __execlists_reset_reg_state(ce, engine);
2823         __execlists_update_reg_state(ce, engine);
2824         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
2825         __context_pin_release(ce);
2826
2827 unwind:
2828         /* Push back any incomplete requests for replay after the reset. */
2829         cancel_port_requests(execlists);
2830         __unwind_incomplete_requests(engine);
2831 }
2832
2833 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
2834 {
2835         unsigned long flags;
2836
2837         GEM_TRACE("%s\n", engine->name);
2838
2839         spin_lock_irqsave(&engine->active.lock, flags);
2840
2841         __execlists_reset(engine, stalled);
2842
2843         spin_unlock_irqrestore(&engine->active.lock, flags);
2844 }
2845
2846 static void nop_submission_tasklet(unsigned long data)
2847 {
2848         /* The driver is wedged; don't process any more events. */
2849 }
2850
2851 static void execlists_cancel_requests(struct intel_engine_cs *engine)
2852 {
2853         struct intel_engine_execlists * const execlists = &engine->execlists;
2854         struct i915_request *rq, *rn;
2855         struct rb_node *rb;
2856         unsigned long flags;
2857
2858         GEM_TRACE("%s\n", engine->name);
2859
2860         /*
2861          * Before we call engine->cancel_requests(), we should have exclusive
2862          * access to the submission state. This is arranged for us by the
2863          * caller disabling the interrupt generation, the tasklet and other
2864          * threads that may then access the same state, giving us a free hand
2865          * to reset state. However, we still need to let lockdep be aware that
2866          * we know this state may be accessed in hardirq context, so we
2867          * disable the irq around this manipulation and we want to keep
2868          * the spinlock focused on its duties and not accidentally conflate
2869          * coverage to the submission's irq state. (Similarly, although we
2870          * shouldn't need to disable irq around the manipulation of the
2871          * submission's irq state, we also wish to remind ourselves that
2872          * it is irq state.)
2873          */
2874         spin_lock_irqsave(&engine->active.lock, flags);
2875
2876         __execlists_reset(engine, true);
2877
2878         /* Mark all executing requests as skipped. */
2879         list_for_each_entry(rq, &engine->active.requests, sched.link)
2880                 mark_eio(rq);
2881
2882         /* Flush the queued requests to the timeline list (for retiring). */
2883         while ((rb = rb_first_cached(&execlists->queue))) {
2884                 struct i915_priolist *p = to_priolist(rb);
2885                 int i;
2886
2887                 priolist_for_each_request_consume(rq, rn, p, i) {
2888                         mark_eio(rq);
2889                         __i915_request_submit(rq);
2890                 }
2891
2892                 rb_erase_cached(&p->node, &execlists->queue);
2893                 i915_priolist_free(p);
2894         }
2895
2896         /* Cancel all attached virtual engines */
2897         while ((rb = rb_first_cached(&execlists->virtual))) {
2898                 struct virtual_engine *ve =
2899                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2900
2901                 rb_erase_cached(rb, &execlists->virtual);
2902                 RB_CLEAR_NODE(rb);
2903
2904                 spin_lock(&ve->base.active.lock);
2905                 rq = fetch_and_zero(&ve->request);
2906                 if (rq) {
2907                         mark_eio(rq);
2908
2909                         rq->engine = engine;
2910                         __i915_request_submit(rq);
2911                         i915_request_put(rq);
2912
2913                         ve->base.execlists.queue_priority_hint = INT_MIN;
2914                 }
2915                 spin_unlock(&ve->base.active.lock);
2916         }
2917
2918         /* Remaining _unready_ requests will be nop'ed when submitted */
2919
2920         execlists->queue_priority_hint = INT_MIN;
2921         execlists->queue = RB_ROOT_CACHED;
2922
2923         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
2924         execlists->tasklet.func = nop_submission_tasklet;
2925
2926         spin_unlock_irqrestore(&engine->active.lock, flags);
2927 }
2928
2929 static void execlists_reset_finish(struct intel_engine_cs *engine)
2930 {
2931         struct intel_engine_execlists * const execlists = &engine->execlists;
2932
2933         /*
2934          * After a GPU reset, we may have requests to replay. Do so now while
2935          * we still have the forcewake to be sure that the GPU is not allowed
2936          * to sleep before we restart and reload a context.
2937          */
2938         GEM_BUG_ON(!reset_in_progress(execlists));
2939         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
2940                 execlists->tasklet.func(execlists->tasklet.data);
2941
2942         if (__tasklet_enable(&execlists->tasklet))
2943                 /* And kick in case we missed a new request submission. */
2944                 tasklet_hi_schedule(&execlists->tasklet);
2945         GEM_TRACE("%s: depth->%d\n", engine->name,
2946                   atomic_read(&execlists->tasklet.count));
2947 }
2948
2949 static int gen8_emit_bb_start(struct i915_request *rq,
2950                               u64 offset, u32 len,
2951                               const unsigned int flags)
2952 {
2953         u32 *cs;
2954
2955         cs = intel_ring_begin(rq, 4);
2956         if (IS_ERR(cs))
2957                 return PTR_ERR(cs);
2958
2959         /*
2960          * WaDisableCtxRestoreArbitration:bdw,chv
2961          *
2962          * We don't need to perform MI_ARB_ENABLE as often as we do (in
2963          * particular all the gen that do not need the w/a at all!), if we
2964          * took care to make sure that on every switch into this context
2965          * (both ordinary and for preemption) arbitration was enabled,
2966          * we would be fine.  However, for gen8 there is another w/a that
2967          * requires us to not preempt inside GPGPU execution, so we keep
2968          * arbitration disabled for gen8 batches. Arbitration will be
2969          * re-enabled before we close the request
2970          * (engine->emit_fini_breadcrumb).
2971          */
2972         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2973
2974         /* FIXME(BDW+): Address space and security selectors. */
2975         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2976                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2977         *cs++ = lower_32_bits(offset);
2978         *cs++ = upper_32_bits(offset);
2979
2980         intel_ring_advance(rq, cs);
2981
2982         return 0;
2983 }
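/*
 * Editor's sketch (illustrative only, not part of the driver): bit 8 of
 * the MI_BATCH_BUFFER_START dword emitted above is the address-space
 * selector -- set for ordinary user batches so they execute unprivileged
 * from the PPGTT, left clear for secure batches run from the GGTT.
 */
static inline u32 sketch_bb_start_cmd(unsigned int flags)
{
        u32 cmd = MI_BATCH_BUFFER_START_GEN8;

        if (!(flags & I915_DISPATCH_SECURE))
                cmd |= BIT(8);  /* unprivileged, PPGTT-relative batch */

        return cmd;
}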
2984
2985 static int gen9_emit_bb_start(struct i915_request *rq,
2986                               u64 offset, u32 len,
2987                               const unsigned int flags)
2988 {
2989         u32 *cs;
2990
2991         cs = intel_ring_begin(rq, 6);
2992         if (IS_ERR(cs))
2993                 return PTR_ERR(cs);
2994
2995         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2996
2997         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2998                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2999         *cs++ = lower_32_bits(offset);
3000         *cs++ = upper_32_bits(offset);
3001
3002         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3003         *cs++ = MI_NOOP;
3004
3005         intel_ring_advance(rq, cs);
3006
3007         return 0;
3008 }
3009
3010 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3011 {
3012         ENGINE_WRITE(engine, RING_IMR,
3013                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
3014         ENGINE_POSTING_READ(engine, RING_IMR);
3015 }
3016
3017 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3018 {
3019         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3020 }
3021
3022 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3023 {
3024         u32 cmd, *cs;
3025
3026         cs = intel_ring_begin(request, 4);
3027         if (IS_ERR(cs))
3028                 return PTR_ERR(cs);
3029
3030         cmd = MI_FLUSH_DW + 1;
3031
3032         /* We always require a command barrier so that subsequent
3033          * commands, such as breadcrumb interrupts, are strictly ordered
3034          * wrt the contents of the write cache being flushed to memory
3035          * (and thus being coherent from the CPU).
3036          */
3037         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3038
3039         if (mode & EMIT_INVALIDATE) {
3040                 cmd |= MI_INVALIDATE_TLB;
3041                 if (request->engine->class == VIDEO_DECODE_CLASS)
3042                         cmd |= MI_INVALIDATE_BSD;
3043         }
3044
3045         *cs++ = cmd;
3046         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3047         *cs++ = 0; /* upper addr */
3048         *cs++ = 0; /* value */
3049         intel_ring_advance(request, cs);
3050
3051         return 0;
3052 }
3053
3054 static int gen8_emit_flush_render(struct i915_request *request,
3055                                   u32 mode)
3056 {
3057         bool vf_flush_wa = false, dc_flush_wa = false;
3058         u32 *cs, flags = 0;
3059         int len;
3060
3061         flags |= PIPE_CONTROL_CS_STALL;
3062
3063         if (mode & EMIT_FLUSH) {
3064                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3065                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3066                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3067                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3068         }
3069
3070         if (mode & EMIT_INVALIDATE) {
3071                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3072                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3073                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3074                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3075                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3076                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3077                 flags |= PIPE_CONTROL_QW_WRITE;
3078                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3079
3080                 /*
3081                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3082                  * pipe control.
3083                  */
3084                 if (IS_GEN(request->i915, 9))
3085                         vf_flush_wa = true;
3086
3087                 /* WaForGAMHang:kbl */
3088                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3089                         dc_flush_wa = true;
3090         }
3091
3092         len = 6; /* the main PIPE_CONTROL packet is 6 dwords */
3093
3094         if (vf_flush_wa)
3095                 len += 6; /* preceding NULL PIPE_CONTROL */
3096
3097         if (dc_flush_wa)
3098                 len += 12; /* extra DC flush + trailing CS stall, 6 dwords each */
3099
3100         cs = intel_ring_begin(request, len);
3101         if (IS_ERR(cs))
3102                 return PTR_ERR(cs);
3103
3104         if (vf_flush_wa)
3105                 cs = gen8_emit_pipe_control(cs, 0, 0);
3106
3107         if (dc_flush_wa)
3108                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3109                                             0);
3110
3111         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3112
3113         if (dc_flush_wa)
3114                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3115
3116         intel_ring_advance(request, cs);
3117
3118         return 0;
3119 }
3120
3121 static int gen11_emit_flush_render(struct i915_request *request,
3122                                    u32 mode)
3123 {
3124         if (mode & EMIT_FLUSH) {
3125                 u32 *cs;
3126                 u32 flags = 0;
3127
3128                 flags |= PIPE_CONTROL_CS_STALL;
3129
3130                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3131                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3132                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3133                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3134                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3135                 flags |= PIPE_CONTROL_QW_WRITE;
3136                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3137
3138                 cs = intel_ring_begin(request, 6);
3139                 if (IS_ERR(cs))
3140                         return PTR_ERR(cs);
3141
3142                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3143                 intel_ring_advance(request, cs);
3144         }
3145
3146         if (mode & EMIT_INVALIDATE) {
3147                 u32 *cs;
3148                 u32 flags = 0;
3149
3150                 flags |= PIPE_CONTROL_CS_STALL;
3151
3152                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3153                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3154                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3155                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3156                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3157                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3158                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3159                 flags |= PIPE_CONTROL_QW_WRITE;
3160                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3161
3162                 cs = intel_ring_begin(request, 6);
3163                 if (IS_ERR(cs))
3164                         return PTR_ERR(cs);
3165
3166                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3167                 intel_ring_advance(request, cs);
3168         }
3169
3170         return 0;
3171 }
3172
3173 static u32 preparser_disable(bool state)
3174 {
3175         return MI_ARB_CHECK | 1 << 8 | state;
3176 }
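/*
 * Editor's note (best-effort reading, treat as an assumption): on gen12
 * MI_ARB_CHECK doubles as the pre-parser toggle, with bit 8 acting as the
 * write-enable for the disable state in bit 0, i.e.
 *
 *      preparser_disable(true)  == MI_ARB_CHECK | BIT(8) | 1
 *      preparser_disable(false) == MI_ARB_CHECK | BIT(8) | 0
 *
 * gen12_emit_flush_render() below brackets its TLB-invalidating
 * PIPE_CONTROL with the two forms.
 */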
3177
3178 static int gen12_emit_flush_render(struct i915_request *request,
3179                                    u32 mode)
3180 {
3181         if (mode & EMIT_FLUSH) {
3182                 u32 flags = 0;
3183                 u32 *cs;
3184
3185                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3186                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3187                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3188                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3189                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
3190
3191                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3192                 flags |= PIPE_CONTROL_QW_WRITE;
3193
3194                 flags |= PIPE_CONTROL_CS_STALL;
3195
3196                 cs = intel_ring_begin(request, 6);
3197                 if (IS_ERR(cs))
3198                         return PTR_ERR(cs);
3199
3200                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3201                 intel_ring_advance(request, cs);
3202         }
3203
3204         if (mode & EMIT_INVALIDATE) {
3205                 u32 flags = 0;
3206                 u32 *cs;
3207
3208                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3209                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
3210                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3211                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3212                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3213                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3214                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3215
3216                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3217                 flags |= PIPE_CONTROL_QW_WRITE;
3218
3219                 flags |= PIPE_CONTROL_CS_STALL;
3220
3221                 cs = intel_ring_begin(request, 8);
3222                 if (IS_ERR(cs))
3223                         return PTR_ERR(cs);
3224
3225                 /*
3226                  * Prevent the pre-parser from skipping past the TLB
3227                  * invalidate and loading a stale page for the batch
3228                  * buffer / request payload.
3229                  */
3230                 *cs++ = preparser_disable(true);
3231
3232                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3233
3234                 *cs++ = preparser_disable(false);
3235                 intel_ring_advance(request, cs);
3236         }
3237
3238         return 0;
3239 }
3240
3241 /*
3242  * Reserve space for 2 NOOPs at the end of each request to be
3243  * used as a workaround for not being allowed to do lite
3244  * restore with HEAD==TAIL (WaIdleLiteRestore).
3245  */
3246 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
3247 {
3248         /* Ensure there's always at least one preemption point per-request. */
3249         *cs++ = MI_ARB_CHECK;
3250         *cs++ = MI_NOOP;
3251         request->wa_tail = intel_ring_offset(request, cs);
3252
3253         return cs;
3254 }
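/*
 * Editor's note (hedged summary): request->wa_tail recorded above points
 * just past the two reserved dwords. When the same context is resubmitted
 * for a lite restore, the ring tail is advanced to wa_tail so the ELSP
 * write never presents HEAD == TAIL to the hardware, which is exactly the
 * WaIdleLiteRestore condition described in the comment above.
 */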
3255
3256 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
3257 {
3258         *cs++ = MI_SEMAPHORE_WAIT |
3259                 MI_SEMAPHORE_GLOBAL_GTT |
3260                 MI_SEMAPHORE_POLL |
3261                 MI_SEMAPHORE_SAD_EQ_SDD;
3262         *cs++ = 0;
3263         *cs++ = intel_hws_preempt_address(request->engine);
3264         *cs++ = 0;
3265
3266         return cs;
3267 }
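/*
 * Editor's note (hedged): the four dwords above park the CS on the
 * per-engine preemption semaphore in the HWSP. MI_SEMAPHORE_SAD_EQ_SDD
 * with a data dword of 0 means "poll the dword at
 * intel_hws_preempt_address() until it reads back as zero"; normally the
 * slot is zero and the wait falls straight through, but while the driver
 * is injecting a preemption it holds a non-zero value there so the engine
 * busy-spins at this known point (hence "preempt-to-busy").
 */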
3268
3269 static __always_inline u32 *
3270 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
3271                                  u32 *cs)
3272 {
3273         *cs++ = MI_USER_INTERRUPT;
3274
3275         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3276         if (intel_engine_has_semaphores(request->engine))
3277                 cs = emit_preempt_busywait(request, cs);
3278
3279         request->tail = intel_ring_offset(request, cs);
3280         assert_ring_tail_valid(request->ring, request->tail);
3281
3282         return gen8_emit_wa_tail(request, cs);
3283 }
3284
3285 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3286 {
3287         cs = gen8_emit_ggtt_write(cs,
3288                                   request->fence.seqno,
3289                                   i915_request_active_timeline(request)->hwsp_offset,
3290                                   0);
3291
3292         return gen8_emit_fini_breadcrumb_footer(request, cs);
3293 }
3294
3295 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3296 {
3297         cs = gen8_emit_pipe_control(cs,
3298                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3299                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3300                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
3301                                     0);
3302
3303         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
3304         cs = gen8_emit_ggtt_write_rcs(cs,
3305                                       request->fence.seqno,
3306                                       i915_request_active_timeline(request)->hwsp_offset,
3307                                       PIPE_CONTROL_FLUSH_ENABLE |
3308                                       PIPE_CONTROL_CS_STALL);
3309
3310         return gen8_emit_fini_breadcrumb_footer(request, cs);
3311 }
3312
3313 static u32 *
3314 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3315 {
3316         cs = gen8_emit_ggtt_write_rcs(cs,
3317                                       request->fence.seqno,
3318                                       i915_request_active_timeline(request)->hwsp_offset,
3319                                       PIPE_CONTROL_CS_STALL |
3320                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
3321                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3322                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3323                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
3324                                       PIPE_CONTROL_FLUSH_ENABLE);
3325
3326         return gen8_emit_fini_breadcrumb_footer(request, cs);
3327 }
3328
3329 /*
3330  * Note that the CS instruction pre-parser will not stall on the breadcrumb
3331  * flush and will continue pre-fetching the instructions after it before the
3332  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
3333  * BB_START/END instructions, so, even though we might pre-fetch the preamble
3334  * of the next request before the memory has been flushed, we're guaranteed that
3335  * we won't access the batch itself too early.
3336  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
3337  * so, if the current request is modifying an instruction in the next request on
3338  * the same intel_context, we might pre-fetch and then execute the pre-update
3339  * instruction. To avoid this, the users of self-modifying code should either
3340  * disable the parser around the code emitting the memory writes, via a new flag
3341  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
3342  * the in-kernel use-cases we've opted to use a separate context, see
3343  * reloc_gpu() as an example.
3344  * All the above applies only to the instructions themselves. Non-inline data
3345  * used by the instructions is not pre-fetched.
3346  */
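/*
 * Editor's cross-reference: the "new flag added to MI_ARB_CHECK" mentioned
 * above is what preparser_disable() emits, and gen12_emit_flush_render()
 * shows it bracketing a TLB invalidation.
 */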
3347
3348 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
3349 {
3350         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
3351                 MI_SEMAPHORE_GLOBAL_GTT |
3352                 MI_SEMAPHORE_POLL |
3353                 MI_SEMAPHORE_SAD_EQ_SDD;
3354         *cs++ = 0;
3355         *cs++ = intel_hws_preempt_address(request->engine);
3356         *cs++ = 0;
3357         *cs++ = 0;
3358         *cs++ = MI_NOOP;
3359
3360         return cs;
3361 }
3362
3363 static __always_inline u32 *
3364 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
3365 {
3366         *cs++ = MI_USER_INTERRUPT;
3367
3368         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3369         if (intel_engine_has_semaphores(request->engine))
3370                 cs = gen12_emit_preempt_busywait(request, cs);
3371
3372         request->tail = intel_ring_offset(request, cs);
3373         assert_ring_tail_valid(request->ring, request->tail);
3374
3375         return gen8_emit_wa_tail(request, cs);
3376 }
3377
3378 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
3379 {
3380         cs = gen8_emit_ggtt_write(cs,
3381                                   request->fence.seqno,
3382                                   i915_request_active_timeline(request)->hwsp_offset,
3383                                   0);
3384
3385         return gen12_emit_fini_breadcrumb_footer(request, cs);
3386 }
3387
3388 static u32 *
3389 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
3390 {
3391         cs = gen8_emit_ggtt_write_rcs(cs,
3392                                       request->fence.seqno,
3393                                       i915_request_active_timeline(request)->hwsp_offset,
3394                                       PIPE_CONTROL_CS_STALL |
3395                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
3396                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
3397                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
3398                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
3399                                       PIPE_CONTROL_FLUSH_ENABLE);
3400
3401         return gen12_emit_fini_breadcrumb_footer(request, cs);
3402 }
3403
3404 static void execlists_park(struct intel_engine_cs *engine)
3405 {
3406         del_timer(&engine->execlists.timer);
3407 }
3408
3409 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
3410 {
3411         engine->submit_request = execlists_submit_request;
3412         engine->cancel_requests = execlists_cancel_requests;
3413         engine->schedule = i915_schedule;
3414         engine->execlists.tasklet.func = execlists_submission_tasklet;
3415
3416         engine->reset.prepare = execlists_reset_prepare;
3417         engine->reset.reset = execlists_reset;
3418         engine->reset.finish = execlists_reset_finish;
3419
3420         engine->park = execlists_park;
3421         engine->unpark = NULL;
3422
3423         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
3424         if (!intel_vgpu_active(engine->i915)) {
3425                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
3426                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
3427                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3428         }
3429
3430         if (INTEL_GEN(engine->i915) >= 12)
3431                 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
3432 }
3433
3434 static void execlists_destroy(struct intel_engine_cs *engine)
3435 {
3436         intel_engine_cleanup_common(engine);
3437         lrc_destroy_wa_ctx(engine);
3438         kfree(engine);
3439 }
3440
3441 static void
3442 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3443 {
3444         /* Default vfuncs which can be overridden by each engine. */
3445
3446         engine->destroy = execlists_destroy;
3447         engine->resume = execlists_resume;
3448
3449         engine->reset.prepare = execlists_reset_prepare;
3450         engine->reset.reset = execlists_reset;
3451         engine->reset.finish = execlists_reset_finish;
3452
3453         engine->cops = &execlists_context_ops;
3454         engine->request_alloc = execlists_request_alloc;
3455
3456         engine->emit_flush = gen8_emit_flush;
3457         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3458         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3459         if (INTEL_GEN(engine->i915) >= 12)
3460                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
3461
3462         engine->set_default_submission = intel_execlists_set_default_submission;
3463
3464         if (INTEL_GEN(engine->i915) < 11) {
3465                 engine->irq_enable = gen8_logical_ring_enable_irq;
3466                 engine->irq_disable = gen8_logical_ring_disable_irq;
3467         } else {
3468                 /*
3469                  * TODO: On Gen11 interrupt masks need to be clear
3470                  * to allow C6 entry. Keep interrupts enabled at all
3471                  * times and take the hit of generating extra interrupts
3472                  * until a more refined solution exists.
3473                  */
3474         }
3475         if (IS_GEN(engine->i915, 8))
3476                 engine->emit_bb_start = gen8_emit_bb_start;
3477         else
3478                 engine->emit_bb_start = gen9_emit_bb_start;
3479 }
3480
3481 static inline void
3482 logical_ring_default_irqs(struct intel_engine_cs *engine)
3483 {
3484         unsigned int shift = 0;
3485
3486         if (INTEL_GEN(engine->i915) < 11) {
3487                 const u8 irq_shifts[] = {
3488                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
3489                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
3490                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
3491                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
3492                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
3493                 };
3494
3495                 shift = irq_shifts[engine->id];
3496         }
3497
3498         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3499         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3500 }
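/*
 * Editor's example (illustrative): on gen8-10 several engines share each
 * GT interrupt register, so e.g. VCS0 ends up with
 *
 *      irq_enable_mask = GT_RENDER_USER_INTERRUPT << GEN8_VCS0_IRQ_SHIFT;
 *      irq_keep_mask   = GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS0_IRQ_SHIFT;
 *
 * whereas gen11+ uses per-engine interrupt registers and keeps shift at 0.
 */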
3501
3502 static void rcs_submission_override(struct intel_engine_cs *engine)
3503 {
3504         switch (INTEL_GEN(engine->i915)) {
3505         case 12:
3506                 engine->emit_flush = gen12_emit_flush_render;
3507                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3508                 break;
3509         case 11:
3510                 engine->emit_flush = gen11_emit_flush_render;
3511                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3512                 break;
3513         default:
3514                 engine->emit_flush = gen8_emit_flush_render;
3515                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3516                 break;
3517         }
3518 }
3519
3520 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3521 {
3522         tasklet_init(&engine->execlists.tasklet,
3523                      execlists_submission_tasklet, (unsigned long)engine);
3524         timer_setup(&engine->execlists.timer, execlists_submission_timer, 0);
3525
3526         logical_ring_default_vfuncs(engine);
3527         logical_ring_default_irqs(engine);
3528
3529         if (engine->class == RENDER_CLASS)
3530                 rcs_submission_override(engine);
3531
3532         return 0;
3533 }
3534
3535 int intel_execlists_submission_init(struct intel_engine_cs *engine)
3536 {
3537         struct intel_engine_execlists * const execlists = &engine->execlists;
3538         struct drm_i915_private *i915 = engine->i915;
3539         struct intel_uncore *uncore = engine->uncore;
3540         u32 base = engine->mmio_base;
3541         int ret;
3542
3543         ret = intel_engine_init_common(engine);
3544         if (ret)
3545                 return ret;
3546
3547         if (intel_init_workaround_bb(engine))
3548                 /*
3549                  * We continue even if we fail to initialize the WA batch
3550                  * because we only expect rare glitches and nothing
3551                  * critical that would prevent us from using the GPU.
3552                  */
3553                 DRM_ERROR("WA batch buffer initialization failed\n");
3554
3555         if (HAS_LOGICAL_RING_ELSQ(i915)) {
3556                 execlists->submit_reg = uncore->regs +
3557                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3558                 execlists->ctrl_reg = uncore->regs +
3559                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3560         } else {
3561                 execlists->submit_reg = uncore->regs +
3562                         i915_mmio_reg_offset(RING_ELSP(base));
3563         }
3564
3565         execlists->csb_status =
3566                 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3567
3568         execlists->csb_write =
3569                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
3570
3571         if (INTEL_GEN(i915) < 11)
3572                 execlists->csb_size = GEN8_CSB_ENTRIES;
3573         else
3574                 execlists->csb_size = GEN11_CSB_ENTRIES;
3575
3576         reset_csb_pointers(engine);
3577
3578         return 0;
3579 }
3580
3581 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
3582 {
3583         u32 indirect_ctx_offset;
3584
3585         switch (INTEL_GEN(engine->i915)) {
3586         default:
3587                 MISSING_CASE(INTEL_GEN(engine->i915));
3588                 /* fall through */
3589         case 12:
3590                 indirect_ctx_offset =
3591                         GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3592                 break;
3593         case 11:
3594                 indirect_ctx_offset =
3595                         GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3596                 break;
3597         case 10:
3598                 indirect_ctx_offset =
3599                         GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3600                 break;
3601         case 9:
3602                 indirect_ctx_offset =
3603                         GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3604                 break;
3605         case 8:
3606                 indirect_ctx_offset =
3607                         GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3608                 break;
3609         }
3610
3611         return indirect_ctx_offset;
3612 }
3613
3614
3615 static void init_common_reg_state(u32 * const regs,
3616                                   const struct intel_engine_cs *engine,
3617                                   const struct intel_ring *ring)
3618 {
3619         regs[CTX_CONTEXT_CONTROL] =
3620                 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3621                 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
3622         if (INTEL_GEN(engine->i915) < 11)
3623                 regs[CTX_CONTEXT_CONTROL] |=
3624                         _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3625                                             CTX_CTRL_RS_CTX_ENABLE);
3626
3627         regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3628         regs[CTX_BB_STATE] = RING_BB_PPGTT;
3629 }
3630
3631 static void init_wa_bb_reg_state(u32 * const regs,
3632                                  const struct intel_engine_cs *engine,
3633                                  u32 pos_bb_per_ctx)
3634 {
3635         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
3636
3637         if (wa_ctx->per_ctx.size) {
3638                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3639
3640                 regs[pos_bb_per_ctx] =
3641                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3642         }
3643
3644         if (wa_ctx->indirect_ctx.size) {
3645                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3646
3647                 regs[pos_bb_per_ctx + 2] =
3648                         (ggtt_offset + wa_ctx->indirect_ctx.offset) |
3649                         (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3650
3651                 regs[pos_bb_per_ctx + 4] =
3652                         intel_lr_indirect_ctx_offset(engine) << 6;
3653         }
3654 }
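/*
 * Editor's note (hedged): the context-image slots written above follow the
 * hardware layout -- the per-context batch pointer carries its enable flag
 * in bit 0 (the "| 0x01"), while the indirect-context entry packs the
 * buffer size in cachelines into the low bits of the address and stores
 * the CS offset (intel_lr_indirect_ctx_offset() << 6) in the companion
 * register.
 */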
3655
3656 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
3657 {
3658         if (i915_vm_is_4lvl(&ppgtt->vm)) {
3659                 /* 64b PPGTT (48bit canonical)
3660                  * PDP0_DESCRIPTOR contains the base address to PML4 and
3661                  * other PDP Descriptors are ignored.
3662                  */
3663                 ASSIGN_CTX_PML4(ppgtt, regs);
3664         } else {
3665                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
3666                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
3667                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
3668                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
3669         }
3670 }
3671
3672 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
3673 {
3674         if (i915_is_ggtt(vm))
3675                 return i915_vm_to_ggtt(vm)->alias;
3676         else
3677                 return i915_vm_to_ppgtt(vm);
3678 }
3679
3680 static void execlists_init_reg_state(u32 *regs,
3681                                      const struct intel_context *ce,
3682                                      const struct intel_engine_cs *engine,
3683                                      const struct intel_ring *ring,
3684                                      bool close)
3685 {
3686         /*
3687          * A context is actually a big batch buffer with several
3688          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3689          * values we are setting here are only for the first context restore:
3690          * on a subsequent save, the GPU will recreate this batchbuffer with new
3691          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3692          * we are not initializing here).
3693          *
3694          * Must keep consistent with virtual_update_register_offsets().
3695          */
3696         u32 *bbe = set_offsets(regs, reg_offsets(engine), engine);
3697
3698         if (close) { /* Close the batch; used mainly by live_lrc_layout() */
3699                 *bbe = MI_BATCH_BUFFER_END;
3700                 if (INTEL_GEN(engine->i915) >= 10)
3701                         *bbe |= BIT(0);
3702         }
3703
3704         init_common_reg_state(regs, engine, ring);
3705         init_ppgtt_reg_state(regs, vm_alias(ce->vm));
3706
3707         init_wa_bb_reg_state(regs, engine,
3708                              INTEL_GEN(engine->i915) >= 12 ?
3709                              GEN12_CTX_BB_PER_CTX_PTR :
3710                              CTX_BB_PER_CTX_PTR);
3711 }
3712
3713 static int
3714 populate_lr_context(struct intel_context *ce,
3715                     struct drm_i915_gem_object *ctx_obj,
3716                     struct intel_engine_cs *engine,
3717                     struct intel_ring *ring)
3718 {
3719         bool inhibit = true;
3720         void *vaddr;
3721         u32 *regs;
3722         int ret;
3723
3724         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
3725         if (IS_ERR(vaddr)) {
3726                 ret = PTR_ERR(vaddr);
3727                 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
3728                 return ret;
3729         }
3730
3731         set_redzone(vaddr, engine);
3732
3733         if (engine->default_state) {
3734                 /*
3735                  * We only want to copy over the template context state;
3736                  * skipping over the headers reserved for GuC communication,
3737                  * leaving those as zero.
3738                  */
3739                 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
3740                 void *defaults;
3741
3742                 defaults = i915_gem_object_pin_map(engine->default_state,
3743                                                    I915_MAP_WB);
3744                 if (IS_ERR(defaults)) {
3745                         ret = PTR_ERR(defaults);
3746                         goto err_unpin_ctx;
3747                 }
3748
3749                 memcpy(vaddr + start, defaults + start, engine->context_size);
3750                 i915_gem_object_unpin_map(engine->default_state);
3751                 inhibit = false;
3752         }
3753
3754         /* The second page of the context object contains some fields which must
3755          * be set up prior to the first execution. */
3756         regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
3757         execlists_init_reg_state(regs, ce, engine, ring, inhibit);
3758         if (inhibit)
3759                 regs[CTX_CONTEXT_CONTROL] |=
3760                         _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
3761
3762         ret = 0;
3763 err_unpin_ctx:
3764         __i915_gem_object_flush_map(ctx_obj,
3765                                     LRC_HEADER_PAGES * PAGE_SIZE,
3766                                     engine->context_size);
3767         i915_gem_object_unpin_map(ctx_obj);
3768         return ret;
3769 }
3770
3771 static int __execlists_context_alloc(struct intel_context *ce,
3772                                      struct intel_engine_cs *engine)
3773 {
3774         struct drm_i915_gem_object *ctx_obj;
3775         struct intel_ring *ring;
3776         struct i915_vma *vma;
3777         u32 context_size;
3778         int ret;
3779
3780         GEM_BUG_ON(ce->state);
3781         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
3782
3783         /*
3784          * Before the actual start of the context image, we insert a few pages
3785          * for our own use and for sharing with the GuC.
3786          */
3787         context_size += LRC_HEADER_PAGES * PAGE_SIZE;
3788         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3789                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
3790
3791         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
3792         if (IS_ERR(ctx_obj))
3793                 return PTR_ERR(ctx_obj);
3794
3795         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
3796         if (IS_ERR(vma)) {
3797                 ret = PTR_ERR(vma);
3798                 goto error_deref_obj;
3799         }
3800
3801         if (!ce->timeline) {
3802                 struct intel_timeline *tl;
3803
3804                 tl = intel_timeline_create(engine->gt, NULL);
3805                 if (IS_ERR(tl)) {
3806                         ret = PTR_ERR(tl);
3807                         goto error_deref_obj;
3808                 }
3809
3810                 ce->timeline = tl;
3811         }
3812
3813         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
3814         if (IS_ERR(ring)) {
3815                 ret = PTR_ERR(ring);
3816                 goto error_deref_obj;
3817         }
3818
3819         ret = populate_lr_context(ce, ctx_obj, engine, ring);
3820         if (ret) {
3821                 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
3822                 goto error_ring_free;
3823         }
3824
3825         ce->ring = ring;
3826         ce->state = vma;
3827
3828         return 0;
3829
3830 error_ring_free:
3831         intel_ring_put(ring);
3832 error_deref_obj:
3833         i915_gem_object_put(ctx_obj);
3834         return ret;
3835 }
3836
3837 static struct list_head *virtual_queue(struct virtual_engine *ve)
3838 {
3839         return &ve->base.execlists.default_priolist.requests[0];
3840 }
3841
3842 static void virtual_context_destroy(struct kref *kref)
3843 {
3844         struct virtual_engine *ve =
3845                 container_of(kref, typeof(*ve), context.ref);
3846         unsigned int n;
3847
3848         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3849         GEM_BUG_ON(ve->request);
3850         GEM_BUG_ON(ve->context.inflight);
3851
3852         for (n = 0; n < ve->num_siblings; n++) {
3853                 struct intel_engine_cs *sibling = ve->siblings[n];
3854                 struct rb_node *node = &ve->nodes[sibling->id].rb;
3855
3856                 if (RB_EMPTY_NODE(node))
3857                         continue;
3858
3859                 spin_lock_irq(&sibling->active.lock);
3860
3861                 /* Detachment is lazily performed in the execlists tasklet */
3862                 if (!RB_EMPTY_NODE(node))
3863                         rb_erase_cached(node, &sibling->execlists.virtual);
3864
3865                 spin_unlock_irq(&sibling->active.lock);
3866         }
3867         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
3868
3869         if (ve->context.state)
3870                 __execlists_context_fini(&ve->context);
3871         intel_context_fini(&ve->context);
3872
3873         kfree(ve->bonds);
3874         kfree(ve);
3875 }
3876
3877 static void virtual_engine_initial_hint(struct virtual_engine *ve)
3878 {
3879         int swp;
3880
3881         /*
3882          * Pick a random sibling on starting to help spread the load around.
3883          *
3884          * New contexts are typically created with exactly the same order
3885          * of siblings, and often started in batches. Due to the way we iterate
3886          * the array of siblings when submitting requests, sibling[0] is
3887          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
3888          * randomised across the system, we also help spread the load by the
3889          * first engine we inspect being different each time.
3890          *
3891          * NB This does not force us to execute on this engine, it will just
3892          * typically be the first we inspect for submission.
3893          */
3894         swp = prandom_u32_max(ve->num_siblings);
3895         if (!swp)
3896                 return;
3897
3898         swap(ve->siblings[swp], ve->siblings[0]);
3899         if (!intel_engine_has_relative_mmio(ve->siblings[0]))
3900                 virtual_update_register_offsets(ve->context.lrc_reg_state,
3901                                                 ve->siblings[0]);
3902 }
3903
3904 static int virtual_context_pin(struct intel_context *ce)
3905 {
3906         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3907         int err;
3908
3909         /* Note: we must use a real engine class for setting up reg state */
3910         err = __execlists_context_pin(ce, ve->siblings[0]);
3911         if (err)
3912                 return err;
3913
3914         virtual_engine_initial_hint(ve);
3915         return 0;
3916 }
3917
3918 static void virtual_context_enter(struct intel_context *ce)
3919 {
3920         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3921         unsigned int n;
3922
3923         for (n = 0; n < ve->num_siblings; n++)
3924                 intel_engine_pm_get(ve->siblings[n]);
3925
3926         intel_timeline_enter(ce->timeline);
3927 }
3928
3929 static void virtual_context_exit(struct intel_context *ce)
3930 {
3931         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3932         unsigned int n;
3933
3934         intel_timeline_exit(ce->timeline);
3935
3936         for (n = 0; n < ve->num_siblings; n++)
3937                 intel_engine_pm_put(ve->siblings[n]);
3938 }
3939
3940 static const struct intel_context_ops virtual_context_ops = {
3941         .pin = virtual_context_pin,
3942         .unpin = execlists_context_unpin,
3943
3944         .enter = virtual_context_enter,
3945         .exit = virtual_context_exit,
3946
3947         .destroy = virtual_context_destroy,
3948 };
3949
3950 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
3951 {
3952         struct i915_request *rq;
3953         intel_engine_mask_t mask;
3954
3955         rq = READ_ONCE(ve->request);
3956         if (!rq)
3957                 return 0;
3958
3959         /* The rq is ready for submission; rq->execution_mask is now stable. */
3960         mask = rq->execution_mask;
3961         if (unlikely(!mask)) {
3962                 /* Invalid selection, submit to a random engine in error */
3963                 i915_request_skip(rq, -ENODEV);
3964                 mask = ve->siblings[0]->mask;
3965         }
3966
3967         GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
3968                   ve->base.name,
3969                   rq->fence.context, rq->fence.seqno,
3970                   mask, ve->base.execlists.queue_priority_hint);
3971
3972         return mask;
3973 }
3974
3975 static void virtual_submission_tasklet(unsigned long data)
3976 {
3977         struct virtual_engine * const ve = (struct virtual_engine *)data;
3978         const int prio = ve->base.execlists.queue_priority_hint;
3979         intel_engine_mask_t mask;
3980         unsigned int n;
3981
3982         rcu_read_lock();
3983         mask = virtual_submission_mask(ve);
3984         rcu_read_unlock();
3985         if (unlikely(!mask))
3986                 return;
3987
3988         local_irq_disable();
3989         for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
3990                 struct intel_engine_cs *sibling = ve->siblings[n];
3991                 struct ve_node * const node = &ve->nodes[sibling->id];
3992                 struct rb_node **parent, *rb;
3993                 bool first;
3994
3995                 if (unlikely(!(mask & sibling->mask))) {
3996                         if (!RB_EMPTY_NODE(&node->rb)) {
3997                                 spin_lock(&sibling->active.lock);
3998                                 rb_erase_cached(&node->rb,
3999                                                 &sibling->execlists.virtual);
4000                                 RB_CLEAR_NODE(&node->rb);
4001                                 spin_unlock(&sibling->active.lock);
4002                         }
4003                         continue;
4004                 }
4005
4006                 spin_lock(&sibling->active.lock);
4007
4008                 if (!RB_EMPTY_NODE(&node->rb)) {
4009                         /*
4010                          * Cheat and avoid rebalancing the tree if we can
4011                          * reuse this node in situ.
4012                          */
4013                         first = rb_first_cached(&sibling->execlists.virtual) ==
4014                                 &node->rb;
4015                         if (prio == node->prio || (prio > node->prio && first))
4016                                 goto submit_engine;
4017
4018                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4019                 }
4020
4021                 rb = NULL;
4022                 first = true;
4023                 parent = &sibling->execlists.virtual.rb_root.rb_node;
4024                 while (*parent) {
4025                         struct ve_node *other;
4026
4027                         rb = *parent;
4028                         other = rb_entry(rb, typeof(*other), rb);
4029                         if (prio > other->prio) {
4030                                 parent = &rb->rb_left;
4031                         } else {
4032                                 parent = &rb->rb_right;
4033                                 first = false;
4034                         }
4035                 }
4036
4037                 rb_link_node(&node->rb, rb, parent);
4038                 rb_insert_color_cached(&node->rb,
4039                                        &sibling->execlists.virtual,
4040                                        first);
4041
4042 submit_engine:
4043                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4044                 node->prio = prio;
4045                 if (first && prio > sibling->execlists.queue_priority_hint) {
4046                         sibling->execlists.queue_priority_hint = prio;
4047                         tasklet_hi_schedule(&sibling->execlists.tasklet);
4048                 }
4049
4050                 spin_unlock(&sibling->active.lock);
4051         }
4052         local_irq_enable();
4053 }
4054
4055 static void virtual_submit_request(struct i915_request *rq)
4056 {
4057         struct virtual_engine *ve = to_virtual_engine(rq->engine);
4058         struct i915_request *old;
4059         unsigned long flags;
4060
4061         GEM_TRACE("%s: rq=%llx:%lld\n",
4062                   ve->base.name,
4063                   rq->fence.context,
4064                   rq->fence.seqno);
4065
4066         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4067
4068         spin_lock_irqsave(&ve->base.active.lock, flags);
4069
4070         old = ve->request;
4071         if (old) { /* background completion event from preempt-to-busy */
4072                 GEM_BUG_ON(!i915_request_completed(old));
4073                 __i915_request_submit(old);
4074                 i915_request_put(old);
4075         }
4076
4077         if (i915_request_completed(rq)) {
4078                 __i915_request_submit(rq);
4079
4080                 ve->base.execlists.queue_priority_hint = INT_MIN;
4081                 ve->request = NULL;
4082         } else {
4083                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
4084                 ve->request = i915_request_get(rq);
4085
4086                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4087                 list_move_tail(&rq->sched.link, virtual_queue(ve));
4088
4089                 tasklet_schedule(&ve->base.execlists.tasklet);
4090         }
4091
4092         spin_unlock_irqrestore(&ve->base.active.lock, flags);
4093 }
4094
4095 static struct ve_bond *
4096 virtual_find_bond(struct virtual_engine *ve,
4097                   const struct intel_engine_cs *master)
4098 {
4099         int i;
4100
4101         for (i = 0; i < ve->num_bonds; i++) {
4102                 if (ve->bonds[i].master == master)
4103                         return &ve->bonds[i];
4104         }
4105
4106         return NULL;
4107 }
4108
4109 static void
4110 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4111 {
4112         struct virtual_engine *ve = to_virtual_engine(rq->engine);
4113         intel_engine_mask_t allowed, exec;
4114         struct ve_bond *bond;
4115
4116         allowed = ~to_request(signal)->engine->mask;
4117
4118         bond = virtual_find_bond(ve, to_request(signal)->engine);
4119         if (bond)
4120                 allowed &= bond->sibling_mask;
4121
4122         /* Restrict the bonded request to run on only the available engines */
4123         exec = READ_ONCE(rq->execution_mask);
4124         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4125                 ;
4126
4127         /* Prevent the master from being re-run on the bonded engines */
4128         to_request(signal)->execution_mask &= ~allowed;
4129 }
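/*
 * Editor's note: the try_cmpxchg() loop above is an atomic
 *
 *      rq->execution_mask &= allowed;
 *
 * retried until no concurrent update races with it, so the bonded request
 * may only run on siblings permitted by the bond while the master is
 * excluded from those same engines.
 */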
4130
4131 struct intel_context *
4132 intel_execlists_create_virtual(struct i915_gem_context *ctx,
4133                                struct intel_engine_cs **siblings,
4134                                unsigned int count)
4135 {
4136         struct virtual_engine *ve;
4137         unsigned int n;
4138         int err;
4139
4140         if (count == 0)
4141                 return ERR_PTR(-EINVAL);
4142
4143         if (count == 1)
4144                 return intel_context_create(ctx, siblings[0]);
4145
4146         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4147         if (!ve)
4148                 return ERR_PTR(-ENOMEM);
4149
4150         ve->base.i915 = ctx->i915;
4151         ve->base.gt = siblings[0]->gt;
4152         ve->base.uncore = siblings[0]->uncore;
4153         ve->base.id = -1;
4154         ve->base.class = OTHER_CLASS;
4155         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4156         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4157
4158         /*
4159          * The decision on whether to submit a request using semaphores
4160          * depends on the saturated state of the engine. We only compute
4161          * this during HW submission of the request, and we need for this
4162          * state to be globally applied to all requests being submitted
4163          * to this engine. Virtual engines encompass more than one physical
4164          * engine and so we cannot accurately tell in advance if one of those
4165          * engines is already saturated and so cannot afford to use a semaphore
4166          * and be pessimized in priority for doing so -- if we are the only
4167          * context using semaphores after all other clients have stopped, we
4168          * will be starved on the saturated system. Such a global switch for
4169          * semaphores is less than ideal, but alas is the current compromise.
4170          */
4171         ve->base.saturated = ALL_ENGINES;
4172
4173         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4174
4175         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4176         intel_engine_init_breadcrumbs(&ve->base);
4177
4178         intel_engine_init_execlists(&ve->base);
4179
4180         ve->base.cops = &virtual_context_ops;
4181         ve->base.request_alloc = execlists_request_alloc;
4182
4183         ve->base.schedule = i915_schedule;
4184         ve->base.submit_request = virtual_submit_request;
4185         ve->base.bond_execute = virtual_bond_execute;
4186
4187         INIT_LIST_HEAD(virtual_queue(ve));
4188         ve->base.execlists.queue_priority_hint = INT_MIN;
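             /*
              * Requests are never executed on the virtual engine itself:
              * virtual_submission_tasklet() offers each queued request to the
              * sibling engines, and whichever sibling dequeues it first runs it.
              */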
4189         tasklet_init(&ve->base.execlists.tasklet,
4190                      virtual_submission_tasklet,
4191                      (unsigned long)ve);
4192
4193         intel_context_init(&ve->context, ctx, &ve->base);
4194
4195         for (n = 0; n < count; n++) {
4196                 struct intel_engine_cs *sibling = siblings[n];
4197
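                     /*
                      * Every physical engine owns exactly one bit in its mask,
                      * so an overlap with ve->base.mask means the same engine
                      * was listed twice.
                      */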
4198                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
4199                 if (sibling->mask & ve->base.mask) {
4200                         DRM_DEBUG("duplicate %s entry in load balancer\n",
4201                                   sibling->name);
4202                         err = -EINVAL;
4203                         goto err_put;
4204                 }
4205
4206                 /*
4207                  * The virtual engine implementation is tightly coupled to
4208          * the execlists backend -- we push requests directly
4209                  * into a tree inside each physical engine. We could support
4210                  * layering if we handle cloning of the requests and
4211                  * submitting a copy into each backend.
4212                  */
4213                 if (sibling->execlists.tasklet.func !=
4214                     execlists_submission_tasklet) {
4215                         err = -ENODEV;
4216                         goto err_put;
4217                 }
4218
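                     /*
                      * Start with the per-sibling rb node cleared: an empty
                      * node tells the submission tasklet that this virtual
                      * engine is not yet queued in the sibling's tree.
                      */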
4219                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4220                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4221
4222                 ve->siblings[ve->num_siblings++] = sibling;
4223                 ve->base.mask |= sibling->mask;
4224
4225                 /*
4226                  * All physical engines must be compatible for their emission
4227                  * functions (as we build the instructions during request
4228                  * construction and do not alter them before submission
4229                  * on the physical engine). We use the engine class as a guide
4230                  * here, although that could be refined.
4231                  */
4232                 if (ve->base.class != OTHER_CLASS) {
4233                         if (ve->base.class != sibling->class) {
4234                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
4235                                           sibling->class, ve->base.class);
4236                                 err = -EINVAL;
4237                                 goto err_put;
4238                         }
4239                         continue;
4240                 }
4241
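                     /*
                      * First sibling: adopt its class, uABI class and emission
                      * vfuncs; the check above guarantees every later sibling
                      * is of the same class.
                      */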
4242                 ve->base.class = sibling->class;
4243                 ve->base.uabi_class = sibling->uabi_class;
4244                 snprintf(ve->base.name, sizeof(ve->base.name),
4245                          "v%dx%d", ve->base.class, count);
4246                 ve->base.context_size = sibling->context_size;
4247
4248                 ve->base.emit_bb_start = sibling->emit_bb_start;
4249                 ve->base.emit_flush = sibling->emit_flush;
4250                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
4251                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
4252                 ve->base.emit_fini_breadcrumb_dw =
4253                         sibling->emit_fini_breadcrumb_dw;
4254
4255                 ve->base.flags = sibling->flags;
4256         }
4257
4258         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
4259
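             /*
              * Allocate the logical ring context up front using the first
              * sibling as template (all siblings share the same class), and
              * mark it allocated so pinning does not try to allocate again.
              */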
4260         err = __execlists_context_alloc(&ve->context, siblings[0]);
4261         if (err)
4262                 goto err_put;
4263
4264         __set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);
4265
4266         return &ve->context;
4267
4268 err_put:
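             /*
              * The context owns everything assembled so far; dropping the last
              * reference lands in virtual_context_destroy() (earlier in this
              * file), which tears down the partially constructed engine.
              */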
4269         intel_context_put(&ve->context);
4270         return ERR_PTR(err);
4271 }
4272
4273 struct intel_context *
4274 intel_execlists_clone_virtual(struct i915_gem_context *ctx,
4275                               struct intel_engine_cs *src)
4276 {
4277         struct virtual_engine *se = to_virtual_engine(src);
4278         struct intel_context *dst;
4279
4280         dst = intel_execlists_create_virtual(ctx,
4281                                              se->siblings,
4282                                              se->num_siblings);
4283         if (IS_ERR(dst))
4284                 return dst;
4285
4286         if (se->num_bonds) {
4287                 struct virtual_engine *de = to_virtual_engine(dst->engine);
4288
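                     /*
                      * Bond entries only hold engine pointers and masks, so a
                      * flat copy of the array is sufficient.
                      */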
4289                 de->bonds = kmemdup(se->bonds,
4290                                     sizeof(*se->bonds) * se->num_bonds,
4291                                     GFP_KERNEL);
4292                 if (!de->bonds) {
4293                         intel_context_put(dst);
4294                         return ERR_PTR(-ENOMEM);
4295                 }
4296
4297                 de->num_bonds = se->num_bonds;
4298         }
4299
4300         return dst;
4301 }
4302
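     /*
      * Record that a request bonded to @master may execute on @sibling when
      * submitted to this virtual engine; repeated calls for the same master
      * accumulate into a single sibling mask. Typically reached via the
      * I915_CONTEXT_ENGINES_EXT_BOND uAPI extension.
      */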
4303 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
4304                                      const struct intel_engine_cs *master,
4305                                      const struct intel_engine_cs *sibling)
4306 {
4307         struct virtual_engine *ve = to_virtual_engine(engine);
4308         struct ve_bond *bond;
4309         int n;
4310
4311         /* Sanity check that the sibling is part of the virtual engine */
4312         for (n = 0; n < ve->num_siblings; n++)
4313                 if (sibling == ve->siblings[n])
4314                         break;
4315         if (n == ve->num_siblings)
4316                 return -EINVAL;
4317
4318         bond = virtual_find_bond(ve, master);
4319         if (bond) {
4320                 bond->sibling_mask |= sibling->mask;
4321                 return 0;
4322         }
4323
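             /* No bond for this master yet: grow the array by one entry. */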
4324         bond = krealloc(ve->bonds,
4325                         sizeof(*bond) * (ve->num_bonds + 1),
4326                         GFP_KERNEL);
4327         if (!bond)
4328                 return -ENOMEM;
4329
4330         bond[ve->num_bonds].master = master;
4331         bond[ve->num_bonds].sibling_mask = sibling->mask;
4332
4333         ve->bonds = bond;
4334         ve->num_bonds++;
4335
4336         return 0;
4337 }
4338
4339 struct intel_engine_cs *
4340 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
4341                                  unsigned int sibling)
4342 {
4343         struct virtual_engine *ve = to_virtual_engine(engine);
4344
4345         if (sibling >= ve->num_siblings)
4346                 return NULL;
4347
4348         return ve->siblings[sibling];
4349 }
4350
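     /*
      * Debug dump of at most @max requests per list: "E" marks requests
      * already passed to the HW (engine->active), "Q" marks requests waiting
      * in the priority queue, and "V" marks requests parked on virtual
      * engines awaiting this physical engine. Longer lists are elided with a
      * "...skipping" line.
      */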
4351 void intel_execlists_show_requests(struct intel_engine_cs *engine,
4352                                    struct drm_printer *m,
4353                                    void (*show_request)(struct drm_printer *m,
4354                                                         struct i915_request *rq,
4355                                                         const char *prefix),
4356                                    unsigned int max)
4357 {
4358         const struct intel_engine_execlists *execlists = &engine->execlists;
4359         struct i915_request *rq, *last;
4360         unsigned long flags;
4361         unsigned int count;
4362         struct rb_node *rb;
4363
4364         spin_lock_irqsave(&engine->active.lock, flags);
4365
4366         last = NULL;
4367         count = 0;
4368         list_for_each_entry(rq, &engine->active.requests, sched.link) {
4369                 if (count++ < max - 1)
4370                         show_request(m, rq, "\t\tE ");
4371                 else
4372                         last = rq;
4373         }
4374         if (last) {
4375                 if (count > max) {
4376                         drm_printf(m,
4377                                    "\t\t...skipping %d executing requests...\n",
4378                                    count - max);
4379                 }
4380                 show_request(m, last, "\t\tE ");
4381         }
4382
4383         last = NULL;
4384         count = 0;
4385         if (execlists->queue_priority_hint != INT_MIN)
4386                 drm_printf(m, "\t\tQueue priority hint: %d\n",
4387                            execlists->queue_priority_hint);
4388         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
4389                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
4390                 int i;
4391
4392                 priolist_for_each_request(rq, p, i) {
4393                         if (count++ < max - 1)
4394                                 show_request(m, rq, "\t\tQ ");
4395                         else
4396                                 last = rq;
4397                 }
4398         }
4399         if (last) {
4400                 if (count > max) {
4401                         drm_printf(m,
4402                                    "\t\t...skipping %d queued requests...\n",
4403                                    count - max);
4404                 }
4405                 show_request(m, last, "\t\tQ ");
4406         }
4407
4408         last = NULL;
4409         count = 0;
4410         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
4411                 struct virtual_engine *ve =
4412                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4413                 struct i915_request *rq = READ_ONCE(ve->request);
4414
4415                 if (rq) {
4416                         if (count++ < max - 1)
4417                                 show_request(m, rq, "\t\tV ");
4418                         else
4419                                 last = rq;
4420                 }
4421         }
4422         if (last) {
4423                 if (count > max) {
4424                         drm_printf(m,
4425                                    "\t\t...skipping %d virtual requests...\n",
4426                                    count - max);
4427                 }
4428                 show_request(m, last, "\t\tV ");
4429         }
4430
4431         spin_unlock_irqrestore(&engine->active.lock, flags);
4432 }
4433
4434 void intel_lr_context_reset(struct intel_engine_cs *engine,
4435                             struct intel_context *ce,
4436                             u32 head,
4437                             bool scrub)
4438 {
4439         GEM_BUG_ON(!intel_context_is_pinned(ce));
4440         __context_pin_acquire(ce);
4441
4442         /*
4443          * We want a simple context + ring to execute the breadcrumb update.
4444          * We cannot rely on the context being intact across the GPU hang,
4445          * so clear it and rebuild just what we need for the breadcrumb.
4446          * All pending requests for this context will be zapped, and any
4447          * future request will be after userspace has had the opportunity
4448          * to recreate its own state.
4449          */
4450         if (scrub) {
4451                 u32 *regs = ce->lrc_reg_state;
4452
4453                 if (engine->pinned_default_state) {
4454                         memcpy(regs, /* skip restoring the vanilla PPHWSP */
4455                                engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
4456                                engine->context_size - PAGE_SIZE);
4457                 }
4458                 execlists_init_reg_state(regs, ce, engine, ce->ring, false);
4459         }
4460
4461         /* Rerun the request; its payload has been neutered (if guilty). */
4462         ce->ring->head = head;
4463         intel_ring_update_space(ce->ring);
4464
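             /*
              * Refresh the ring registers stored in the context image so the
              * next resubmission resumes from the adjusted head.
              */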
4465         __execlists_update_reg_state(ce, engine);
4466         __context_pin_release(ce);
4467 }
4468
4469 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4470 #include "selftest_lrc.c"
4471 #endif