drivers/gpu/drm/i915/selftests/intel_hangcheck.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/kthread.h>
26
27 #include "../i915_selftest.h"
28
29 #include "mock_context.h"
30 #include "mock_drm.h"
31
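/*
 * Scaffolding shared by the reset/hangcheck selftests: a kernel context,
 * a batch buffer ("obj"/"batch") containing a self-looping batch, and a
 * scratch page ("hws"/"seqno") used as a makeshift hardware status page
 * which the batch writes its seqno into, so the CPU can tell when the
 * hang has actually started executing on the GPU.
 */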
32 struct hang {
33         struct drm_i915_private *i915;
34         struct drm_i915_gem_object *hws;
35         struct drm_i915_gem_object *obj;
36         struct i915_gem_context *ctx;
37         u32 *seqno;
38         u32 *batch;
39 };
40
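/*
 * Allocate the context and both internal objects, and pin long-lived CPU
 * maps for them: the status page is marked LLC-cached and mapped
 * write-back, while the batch is mapped write-back only on LLC platforms
 * and write-combining otherwise, so the GPU observes the CPU's writes.
 */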
41 static int hang_init(struct hang *h, struct drm_i915_private *i915)
42 {
43         void *vaddr;
44         int err;
45
46         memset(h, 0, sizeof(*h));
47         h->i915 = i915;
48
49         h->ctx = kernel_context(i915);
50         if (IS_ERR(h->ctx))
51                 return PTR_ERR(h->ctx);
52
53         h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
54         if (IS_ERR(h->hws)) {
55                 err = PTR_ERR(h->hws);
56                 goto err_ctx;
57         }
58
59         h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
60         if (IS_ERR(h->obj)) {
61                 err = PTR_ERR(h->obj);
62                 goto err_hws;
63         }
64
65         i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
66         vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
67         if (IS_ERR(vaddr)) {
68                 err = PTR_ERR(vaddr);
69                 goto err_obj;
70         }
71         h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
72
73         vaddr = i915_gem_object_pin_map(h->obj,
74                                         HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
75         if (IS_ERR(vaddr)) {
76                 err = PTR_ERR(vaddr);
77                 goto err_unpin_hws;
78         }
79         h->batch = vaddr;
80
81         return 0;
82
83 err_unpin_hws:
84         i915_gem_object_unpin_map(h->hws);
85 err_obj:
86         i915_gem_object_put(h->obj);
87 err_hws:
88         i915_gem_object_put(h->hws);
89 err_ctx:
90         kernel_context_close(h->ctx);
91         return err;
92 }
93
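/*
 * Each fence context gets its own dword slot within the status page; the
 * batch stores the request's seqno there and hws_seqno() below reads it
 * back to detect that the batch has started running.
 */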
94 static u64 hws_address(const struct i915_vma *hws,
95                        const struct i915_request *rq)
96 {
97         return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
98 }
99
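/*
 * Build the hanging batch: store the request's seqno into its status-page
 * slot, then spin forever by branching back to the start of the batch with
 * MI_BATCH_BUFFER_START (hence "recurse"). The MI_ARB_CHECK instructions
 * keep arbitration points inside the loop. The only ways out are a reset,
 * or the CPU overwriting the loop with MI_BATCH_BUFFER_END as hang_fini()
 * and the sanitycheck do. The store/branch encodings differ per GEN, hence
 * the cascade below.
 */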
100 static int emit_recurse_batch(struct hang *h,
101                               struct i915_request *rq)
102 {
103         struct drm_i915_private *i915 = h->i915;
104         struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
105         struct i915_vma *hws, *vma;
106         unsigned int flags;
107         u32 *batch;
108         int err;
109
110         vma = i915_vma_instance(h->obj, vm, NULL);
111         if (IS_ERR(vma))
112                 return PTR_ERR(vma);
113
114         hws = i915_vma_instance(h->hws, vm, NULL);
115         if (IS_ERR(hws))
116                 return PTR_ERR(hws);
117
118         err = i915_vma_pin(vma, 0, 0, PIN_USER);
119         if (err)
120                 return err;
121
122         err = i915_vma_pin(hws, 0, 0, PIN_USER);
123         if (err)
124                 goto unpin_vma;
125
126         i915_vma_move_to_active(vma, rq, 0);
127         if (!i915_gem_object_has_active_reference(vma->obj)) {
128                 i915_gem_object_get(vma->obj);
129                 i915_gem_object_set_active_reference(vma->obj);
130         }
131
132         i915_vma_move_to_active(hws, rq, 0);
133         if (!i915_gem_object_has_active_reference(hws->obj)) {
134                 i915_gem_object_get(hws->obj);
135                 i915_gem_object_set_active_reference(hws->obj);
136         }
137
138         batch = h->batch;
139         if (INTEL_GEN(i915) >= 8) {
140                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
141                 *batch++ = lower_32_bits(hws_address(hws, rq));
142                 *batch++ = upper_32_bits(hws_address(hws, rq));
143                 *batch++ = rq->fence.seqno;
144                 *batch++ = MI_ARB_CHECK;
145
146                 memset(batch, 0, 1024);
147                 batch += 1024 / sizeof(*batch);
148
149                 *batch++ = MI_ARB_CHECK;
150                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
151                 *batch++ = lower_32_bits(vma->node.start);
152                 *batch++ = upper_32_bits(vma->node.start);
153         } else if (INTEL_GEN(i915) >= 6) {
154                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
155                 *batch++ = 0;
156                 *batch++ = lower_32_bits(hws_address(hws, rq));
157                 *batch++ = rq->fence.seqno;
158                 *batch++ = MI_ARB_CHECK;
159
160                 memset(batch, 0, 1024);
161                 batch += 1024 / sizeof(*batch);
162
163                 *batch++ = MI_ARB_CHECK;
164                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
165                 *batch++ = lower_32_bits(vma->node.start);
166         } else if (INTEL_GEN(i915) >= 4) {
167                 *batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
168                 *batch++ = 0;
169                 *batch++ = lower_32_bits(hws_address(hws, rq));
170                 *batch++ = rq->fence.seqno;
171                 *batch++ = MI_ARB_CHECK;
172
173                 memset(batch, 0, 1024);
174                 batch += 1024 / sizeof(*batch);
175
176                 *batch++ = MI_ARB_CHECK;
177                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
178                 *batch++ = lower_32_bits(vma->node.start);
179         } else {
180                 *batch++ = MI_STORE_DWORD_IMM;
181                 *batch++ = lower_32_bits(hws_address(hws, rq));
182                 *batch++ = rq->fence.seqno;
183                 *batch++ = MI_ARB_CHECK;
184
185                 memset(batch, 0, 1024);
186                 batch += 1024 / sizeof(*batch);
187
188                 *batch++ = MI_ARB_CHECK;
189                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
190                 *batch++ = lower_32_bits(vma->node.start);
191         }
192         *batch++ = MI_BATCH_BUFFER_END; /* not reached */
193         i915_gem_chipset_flush(h->i915);
194
195         flags = 0;
196         if (INTEL_GEN(vm->i915) <= 5)
197                 flags |= I915_DISPATCH_SECURE;
198
199         err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
200
201         i915_vma_unpin(hws);
202 unpin_vma:
203         i915_vma_unpin(vma);
204         return err;
205 }
206
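/*
 * Allocate a request on the chosen engine carrying the hanging batch. If
 * the current batch object is still active (a previous hang has not been
 * cleaned up yet), a fresh batch is allocated first rather than rewriting
 * one the GPU may still be spinning on.
 */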
207 static struct i915_request *
208 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
209 {
210         struct i915_request *rq;
211         int err;
212
213         if (i915_gem_object_is_active(h->obj)) {
214                 struct drm_i915_gem_object *obj;
215                 void *vaddr;
216
217                 obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
218                 if (IS_ERR(obj))
219                         return ERR_CAST(obj);
220
221                 vaddr = i915_gem_object_pin_map(obj,
222                                                 HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
223                 if (IS_ERR(vaddr)) {
224                         i915_gem_object_put(obj);
225                         return ERR_CAST(vaddr);
226                 }
227
228                 i915_gem_object_unpin_map(h->obj);
229                 i915_gem_object_put(h->obj);
230
231                 h->obj = obj;
232                 h->batch = vaddr;
233         }
234
235         rq = i915_request_alloc(engine, h->ctx);
236         if (IS_ERR(rq))
237                 return rq;
238
239         err = emit_recurse_batch(h, rq);
240         if (err) {
241                 __i915_request_add(rq, false);
242                 return ERR_PTR(err);
243         }
244
245         return rq;
246 }
247
248 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
249 {
250         return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
251 }
252
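/*
 * wedge_me is a watchdog for waits issued by the selftests themselves: if
 * a guarded wait does not complete within the timeout, the delayed work
 * fires and declares the GPU terminally wedged so that the test run fails
 * fast instead of hanging the machine.
 */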
253 struct wedge_me {
254         struct delayed_work work;
255         struct drm_i915_private *i915;
256         const void *symbol;
257 };
258
259 static void wedge_me(struct work_struct *work)
260 {
261         struct wedge_me *w = container_of(work, typeof(*w), work.work);
262
263         pr_err("%pS timed out, cancelling all further testing.\n",
264                w->symbol);
265         i915_gem_set_wedged(w->i915);
266 }
267
268 static void __init_wedge(struct wedge_me *w,
269                          struct drm_i915_private *i915,
270                          long timeout,
271                          const void *symbol)
272 {
273         w->i915 = i915;
274         w->symbol = symbol;
275
276         INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
277         schedule_delayed_work(&w->work, timeout);
278 }
279
280 static void __fini_wedge(struct wedge_me *w)
281 {
282         cancel_delayed_work_sync(&w->work);
283         destroy_delayed_work_on_stack(&w->work);
284         w->i915 = NULL;
285 }
286
287 #define wedge_on_timeout(W, DEV, TIMEOUT)                               \
288         for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
289              (W)->i915;                                                 \
290              __fini_wedge((W)))
291
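/*
 * Drain outstanding work between test phases, guarded by the watchdog
 * above; report -EIO if the device has become terminally wedged.
 */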
292 static noinline int
293 flush_test(struct drm_i915_private *i915, unsigned int flags)
294 {
295         struct wedge_me w;
296
297         cond_resched();
298
299         wedge_on_timeout(&w, i915, HZ)
300                 i915_gem_wait_for_idle(i915, flags);
301
302         return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
303 }
304
305 static void hang_fini(struct hang *h)
306 {
307         *h->batch = MI_BATCH_BUFFER_END;
308         i915_gem_chipset_flush(h->i915);
309
310         i915_gem_object_unpin_map(h->obj);
311         i915_gem_object_put(h->obj);
312
313         i915_gem_object_unpin_map(h->hws);
314         i915_gem_object_put(h->hws);
315
316         kernel_context_close(h->ctx);
317
318         flush_test(h->i915, I915_WAIT_LOCKED);
319 }
320
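/*
 * Wait for the hanging batch to start executing: busy-wait for up to 10us,
 * then sleep-wait for up to a second, for the seqno to appear in the
 * status page. Returns true once the batch is confirmed running.
 */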
321 static bool wait_for_hang(struct hang *h, struct i915_request *rq)
322 {
323         return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
324                                                rq->fence.seqno),
325                              10) &&
326                  wait_for(i915_seqno_passed(hws_seqno(h, rq),
327                                             rq->fence.seqno),
328                           1000));
329 }
330
331 static int igt_hang_sanitycheck(void *arg)
332 {
333         struct drm_i915_private *i915 = arg;
334         struct i915_request *rq;
335         struct intel_engine_cs *engine;
336         enum intel_engine_id id;
337         struct hang h;
338         int err;
339
340         /* Basic check that we can execute our hanging batch */
341
342         mutex_lock(&i915->drm.struct_mutex);
343         err = hang_init(&h, i915);
344         if (err)
345                 goto unlock;
346
347         for_each_engine(engine, i915, id) {
348                 long timeout;
349
350                 if (!intel_engine_can_store_dword(engine))
351                         continue;
352
353                 rq = hang_create_request(&h, engine);
354                 if (IS_ERR(rq)) {
355                         err = PTR_ERR(rq);
356                         pr_err("Failed to create request for %s, err=%d\n",
357                                engine->name, err);
358                         goto fini;
359                 }
360
361                 i915_request_get(rq);
362
363                 *h.batch = MI_BATCH_BUFFER_END;
364                 i915_gem_chipset_flush(i915);
365
366                 __i915_request_add(rq, true);
367
368                 timeout = i915_request_wait(rq,
369                                             I915_WAIT_LOCKED,
370                                             MAX_SCHEDULE_TIMEOUT);
371                 i915_request_put(rq);
372
373                 if (timeout < 0) {
374                         err = timeout;
375                         pr_err("Wait for request failed on %s, err=%d\n",
376                                engine->name, err);
377                         goto fini;
378                 }
379         }
380
381 fini:
382         hang_fini(&h);
383 unlock:
384         mutex_unlock(&i915->drm.struct_mutex);
385         return err;
386 }
387
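/*
 * Serialise with the driver's own reset handling by claiming the global
 * I915_RESET_BACKOFF bit and every per-engine reset bit, much as the
 * driver's error handler does before performing a reset.
 */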
388 static void global_reset_lock(struct drm_i915_private *i915)
389 {
390         struct intel_engine_cs *engine;
391         enum intel_engine_id id;
392
393         pr_debug("%s: current gpu_error=%08lx\n",
394                  __func__, i915->gpu_error.flags);
395
396         while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
397                 wait_event(i915->gpu_error.reset_queue,
398                            !test_bit(I915_RESET_BACKOFF,
399                                      &i915->gpu_error.flags));
400
401         for_each_engine(engine, i915, id) {
402                 while (test_and_set_bit(I915_RESET_ENGINE + id,
403                                         &i915->gpu_error.flags))
404                         wait_on_bit(&i915->gpu_error.flags,
405                                     I915_RESET_ENGINE + id,
406                                     TASK_UNINTERRUPTIBLE);
407         }
408 }
409
410 static void global_reset_unlock(struct drm_i915_private *i915)
411 {
412         struct intel_engine_cs *engine;
413         enum intel_engine_id id;
414
415         for_each_engine(engine, i915, id)
416                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
417
418         clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
419         wake_up_all(&i915->gpu_error.reset_queue);
420 }
421
422 static int igt_global_reset(void *arg)
423 {
424         struct drm_i915_private *i915 = arg;
425         unsigned int reset_count;
426         int err = 0;
427
428         /* Check that we can issue a global GPU reset */
429
430         global_reset_lock(i915);
431         set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
432
433         mutex_lock(&i915->drm.struct_mutex);
434         reset_count = i915_reset_count(&i915->gpu_error);
435
436         i915_reset(i915, I915_RESET_QUIET);
437
438         if (i915_reset_count(&i915->gpu_error) == reset_count) {
439                 pr_err("No GPU reset recorded!\n");
440                 err = -EINVAL;
441         }
442         mutex_unlock(&i915->drm.struct_mutex);
443
444         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
445         global_reset_unlock(i915);
446
447         if (i915_terminally_wedged(&i915->gpu_error))
448                 err = -EIO;
449
450         return err;
451 }
452
453 static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
454 {
455         struct intel_engine_cs *engine;
456         enum intel_engine_id id;
457         struct hang h;
458         int err = 0;
459
460         /* Check that we can reset an engine while it is idle (a no-op) or while it is running our hanging batch */
461
462         if (!intel_has_reset_engine(i915))
463                 return 0;
464
465         if (active) {
466                 mutex_lock(&i915->drm.struct_mutex);
467                 err = hang_init(&h, i915);
468                 mutex_unlock(&i915->drm.struct_mutex);
469                 if (err)
470                         return err;
471         }
472
473         for_each_engine(engine, i915, id) {
474                 unsigned int reset_count, reset_engine_count;
475                 IGT_TIMEOUT(end_time);
476
477                 if (active && !intel_engine_can_store_dword(engine))
478                         continue;
479
480                 reset_count = i915_reset_count(&i915->gpu_error);
481                 reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
482                                                              engine);
483
484                 set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
485                 do {
486                         if (active) {
487                                 struct i915_request *rq;
488
489                                 mutex_lock(&i915->drm.struct_mutex);
490                                 rq = hang_create_request(&h, engine);
491                                 if (IS_ERR(rq)) {
492                                         err = PTR_ERR(rq);
493                                         mutex_unlock(&i915->drm.struct_mutex);
494                                         break;
495                                 }
496
497                                 i915_request_get(rq);
498                                 __i915_request_add(rq, true);
499                                 mutex_unlock(&i915->drm.struct_mutex);
500
501                                 if (!wait_for_hang(&h, rq)) {
502                                         struct drm_printer p = drm_info_printer(i915->drm.dev);
503
504                                         pr_err("%s: Failed to start request %x, at %x\n",
505                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
506                                         intel_engine_dump(engine, &p,
507                                                           "%s\n", engine->name);
508
509                                         i915_request_put(rq);
510                                         err = -EIO;
511                                         break;
512                                 }
513
514                                 i915_request_put(rq);
515                         }
516
517                         engine->hangcheck.stalled = true;
518                         engine->hangcheck.seqno =
519                                 intel_engine_get_seqno(engine);
520
521                         err = i915_reset_engine(engine, I915_RESET_QUIET);
522                         if (err) {
523                                 pr_err("i915_reset_engine failed\n");
524                                 break;
525                         }
526
527                         if (i915_reset_count(&i915->gpu_error) != reset_count) {
528                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
529                                 err = -EINVAL;
530                                 break;
531                         }
532
533                         reset_engine_count += active;
534                         if (i915_reset_engine_count(&i915->gpu_error, engine) !=
535                             reset_engine_count) {
536                                 pr_err("%s engine reset %srecorded!\n",
537                                        engine->name, active ? "not " : "");
538                                 err = -EINVAL;
539                                 break;
540                         }
541
542                         engine->hangcheck.stalled = false;
543                 } while (time_before(jiffies, end_time));
544                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
545
546                 if (err)
547                         break;
548
549                 err = flush_test(i915, 0);
550                 if (err)
551                         break;
552         }
553
554         if (i915_terminally_wedged(&i915->gpu_error))
555                 err = -EIO;
556
557         if (active) {
558                 mutex_lock(&i915->drm.struct_mutex);
559                 hang_fini(&h);
560                 mutex_unlock(&i915->drm.struct_mutex);
561         }
562
563         return err;
564 }
565
566 static int igt_reset_idle_engine(void *arg)
567 {
568         return __igt_reset_engine(arg, false);
569 }
570
571 static int igt_reset_active_engine(void *arg)
572 {
573         return __igt_reset_engine(arg, true);
574 }
575
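/*
 * Kthread body used by the *_others tests: keep an "innocent" engine busy
 * by continuously submitting, and then waiting upon, requests from two
 * alternating contexts while a different engine is being reset, so that we
 * can check the reset did not leak across engines.
 */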
576 static int active_engine(void *data)
577 {
578         struct intel_engine_cs *engine = data;
579         struct i915_request *rq[2] = {};
580         struct i915_gem_context *ctx[2];
581         struct drm_file *file;
582         unsigned long count = 0;
583         int err = 0;
584
585         file = mock_file(engine->i915);
586         if (IS_ERR(file))
587                 return PTR_ERR(file);
588
589         mutex_lock(&engine->i915->drm.struct_mutex);
590         ctx[0] = live_context(engine->i915, file);
591         mutex_unlock(&engine->i915->drm.struct_mutex);
592         if (IS_ERR(ctx[0])) {
593                 err = PTR_ERR(ctx[0]);
594                 goto err_file;
595         }
596
597         mutex_lock(&engine->i915->drm.struct_mutex);
598         ctx[1] = live_context(engine->i915, file);
599         mutex_unlock(&engine->i915->drm.struct_mutex);
600         if (IS_ERR(ctx[1])) {
601                 err = PTR_ERR(ctx[1]);
602                 i915_gem_context_put(ctx[0]);
603                 goto err_file;
604         }
605
606         while (!kthread_should_stop()) {
607                 unsigned int idx = count++ & 1;
608                 struct i915_request *old = rq[idx];
609                 struct i915_request *new;
610
611                 mutex_lock(&engine->i915->drm.struct_mutex);
612                 new = i915_request_alloc(engine, ctx[idx]);
613                 if (IS_ERR(new)) {
614                         mutex_unlock(&engine->i915->drm.struct_mutex);
615                         err = PTR_ERR(new);
616                         break;
617                 }
618
619                 rq[idx] = i915_request_get(new);
620                 i915_request_add(new);
621                 mutex_unlock(&engine->i915->drm.struct_mutex);
622
623                 if (old) {
624                         i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
625                         i915_request_put(old);
626                 }
627         }
628
629         for (count = 0; count < ARRAY_SIZE(rq); count++)
630                 i915_request_put(rq[count]);
631
632 err_file:
633         mock_file_free(engine->i915, file);
634         return err;
635 }
636
637 static int __igt_reset_engine_others(struct drm_i915_private *i915,
638                                      bool active)
639 {
640         struct intel_engine_cs *engine, *other;
641         enum intel_engine_id id, tmp;
642         struct hang h;
643         int err = 0;
644
645         /* Check that issuing a reset on one engine does not interfere
646          * with any other engine.
647          */
648
649         if (!intel_has_reset_engine(i915))
650                 return 0;
651
652         if (active) {
653                 mutex_lock(&i915->drm.struct_mutex);
654                 err = hang_init(&h, i915);
655                 mutex_unlock(&i915->drm.struct_mutex);
656                 if (err)
657                         return err;
658         }
659
660         for_each_engine(engine, i915, id) {
661                 struct task_struct *threads[I915_NUM_ENGINES] = {};
662                 unsigned long resets[I915_NUM_ENGINES];
663                 unsigned long global = i915_reset_count(&i915->gpu_error);
664                 unsigned long count = 0;
665                 IGT_TIMEOUT(end_time);
666
667                 if (active && !intel_engine_can_store_dword(engine))
668                         continue;
669
670                 memset(threads, 0, sizeof(threads));
671                 for_each_engine(other, i915, tmp) {
672                         struct task_struct *tsk;
673
674                         resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
675                                                               other);
676
677                         if (other == engine)
678                                 continue;
679
680                         tsk = kthread_run(active_engine, other,
681                                           "igt/%s", other->name);
682                         if (IS_ERR(tsk)) {
683                                 err = PTR_ERR(tsk);
684                                 goto unwind;
685                         }
686
687                         threads[tmp] = tsk;
688                         get_task_struct(tsk);
689                 }
690
691                 set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
692                 do {
693                         if (active) {
694                                 struct i915_request *rq;
695
696                                 mutex_lock(&i915->drm.struct_mutex);
697                                 rq = hang_create_request(&h, engine);
698                                 if (IS_ERR(rq)) {
699                                         err = PTR_ERR(rq);
700                                         mutex_unlock(&i915->drm.struct_mutex);
701                                         break;
702                                 }
703
704                                 i915_request_get(rq);
705                                 __i915_request_add(rq, true);
706                                 mutex_unlock(&i915->drm.struct_mutex);
707
708                                 if (!wait_for_hang(&h, rq)) {
709                                         struct drm_printer p = drm_info_printer(i915->drm.dev);
710
711                                         pr_err("%s: Failed to start request %x, at %x\n",
712                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
713                                         intel_engine_dump(engine, &p,
714                                                           "%s\n", engine->name);
715
716                                         i915_request_put(rq);
717                                         err = -EIO;
718                                         break;
719                                 }
720
721                                 i915_request_put(rq);
722                         }
723
724                         engine->hangcheck.stalled = true;
725                         engine->hangcheck.seqno =
726                                 intel_engine_get_seqno(engine);
727
728                         err = i915_reset_engine(engine, I915_RESET_QUIET);
729                         if (err) {
730                                 pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
731                                        engine->name, active ? "active" : "idle", err);
732                                 break;
733                         }
734
735                         engine->hangcheck.stalled = false;
736                         count++;
737                 } while (time_before(jiffies, end_time));
738                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
739                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
740                         engine->name, active ? "active" : "idle", count);
741
742                 if (i915_reset_engine_count(&i915->gpu_error, engine) -
743                     resets[engine->id] != (active ? count : 0)) {
744                         pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
745                                engine->name, active ? "active" : "idle", count,
746                                i915_reset_engine_count(&i915->gpu_error,
747                                                        engine) - resets[engine->id]);
748                         if (!err)
749                                 err = -EINVAL;
750                 }
751
752 unwind:
753                 for_each_engine(other, i915, tmp) {
754                         int ret;
755
756                         if (!threads[tmp])
757                                 continue;
758
759                         ret = kthread_stop(threads[tmp]);
760                         if (ret) {
761                                 pr_err("kthread for other engine %s failed, err=%d\n",
762                                        other->name, ret);
763                                 if (!err)
764                                         err = ret;
765                         }
766                         put_task_struct(threads[tmp]);
767
768                         if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
769                                                                    other)) {
770                                 pr_err("Innocent engine %s was reset (count=%ld)\n",
771                                        other->name,
772                                        i915_reset_engine_count(&i915->gpu_error,
773                                                                other) - resets[tmp]);
774                                 if (!err)
775                                         err = -EINVAL;
776                         }
777                 }
778
779                 if (global != i915_reset_count(&i915->gpu_error)) {
780                         pr_err("Global reset (count=%ld)!\n",
781                                i915_reset_count(&i915->gpu_error) - global);
782                         if (!err)
783                                 err = -EINVAL;
784                 }
785
786                 if (err)
787                         break;
788
789                 err = flush_test(i915, 0);
790                 if (err)
791                         break;
792         }
793
794         if (i915_terminally_wedged(&i915->gpu_error))
795                 err = -EIO;
796
797         if (active) {
798                 mutex_lock(&i915->drm.struct_mutex);
799                 hang_fini(&h);
800                 mutex_unlock(&i915->drm.struct_mutex);
801         }
802
803         return err;
804 }
805
806 static int igt_reset_idle_engine_others(void *arg)
807 {
808         return __igt_reset_engine_others(arg, false);
809 }
810
811 static int igt_reset_active_engine_others(void *arg)
812 {
813         return __igt_reset_engine_others(arg, true);
814 }
815
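/*
 * Pretend that hangcheck has already declared the engine stuck (stalled on
 * its current seqno) and raise I915_RESET_HANDOFF so that a waiter blocked
 * on the request takes over and performs the reset. Returns the global
 * reset count sampled beforehand for the caller to compare against.
 */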
816 static u32 fake_hangcheck(struct i915_request *rq)
817 {
818         u32 reset_count;
819
820         rq->engine->hangcheck.stalled = true;
821         rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);
822
823         reset_count = i915_reset_count(&rq->i915->gpu_error);
824
825         set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
826         wake_up_all(&rq->i915->gpu_error.wait_queue);
827
828         return reset_count;
829 }
830
831 static int igt_wait_reset(void *arg)
832 {
833         struct drm_i915_private *i915 = arg;
834         struct i915_request *rq;
835         unsigned int reset_count;
836         struct hang h;
837         long timeout;
838         int err;
839
840         if (!intel_engine_can_store_dword(i915->engine[RCS]))
841                 return 0;
842
843         /* Check that we detect a stuck waiter and issue a reset */
844
845         global_reset_lock(i915);
846
847         mutex_lock(&i915->drm.struct_mutex);
848         err = hang_init(&h, i915);
849         if (err)
850                 goto unlock;
851
852         rq = hang_create_request(&h, i915->engine[RCS]);
853         if (IS_ERR(rq)) {
854                 err = PTR_ERR(rq);
855                 goto fini;
856         }
857
858         i915_request_get(rq);
859         __i915_request_add(rq, true);
860
861         if (!wait_for_hang(&h, rq)) {
862                 struct drm_printer p = drm_info_printer(i915->drm.dev);
863
864                 pr_err("%s: Failed to start request %x, at %x\n",
865                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
866                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
867
868                 i915_reset(i915, 0);
869                 i915_gem_set_wedged(i915);
870
871                 err = -EIO;
872                 goto out_rq;
873         }
874
875         reset_count = fake_hangcheck(rq);
876
877         timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
878         if (timeout < 0) {
879                 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
880                        timeout);
881                 err = timeout;
882                 goto out_rq;
883         }
884
885         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
886         if (i915_reset_count(&i915->gpu_error) == reset_count) {
887                 pr_err("No GPU reset recorded!\n");
888                 err = -EINVAL;
889                 goto out_rq;
890         }
891
892 out_rq:
893         i915_request_put(rq);
894 fini:
895         hang_fini(&h);
896 unlock:
897         mutex_unlock(&i915->drm.struct_mutex);
898         global_reset_unlock(i915);
899
900         if (i915_terminally_wedged(&i915->gpu_error))
901                 return -EIO;
902
903         return err;
904 }
905
906 static int igt_reset_queue(void *arg)
907 {
908         struct drm_i915_private *i915 = arg;
909         struct intel_engine_cs *engine;
910         enum intel_engine_id id;
911         struct hang h;
912         int err;
913
914         /* Check that we replay pending requests following a hang */
915
916         global_reset_lock(i915);
917
918         mutex_lock(&i915->drm.struct_mutex);
919         err = hang_init(&h, i915);
920         if (err)
921                 goto unlock;
922
923         for_each_engine(engine, i915, id) {
924                 struct i915_request *prev;
925                 IGT_TIMEOUT(end_time);
926                 unsigned int count;
927
928                 if (!intel_engine_can_store_dword(engine))
929                         continue;
930
931                 prev = hang_create_request(&h, engine);
932                 if (IS_ERR(prev)) {
933                         err = PTR_ERR(prev);
934                         goto fini;
935                 }
936
937                 i915_request_get(prev);
938                 __i915_request_add(prev, true);
939
940                 count = 0;
941                 do {
942                         struct i915_request *rq;
943                         unsigned int reset_count;
944
945                         rq = hang_create_request(&h, engine);
946                         if (IS_ERR(rq)) {
947                                 err = PTR_ERR(rq);
948                                 goto fini;
949                         }
950
951                         i915_request_get(rq);
952                         __i915_request_add(rq, true);
953
954                         if (!wait_for_hang(&h, prev)) {
955                                 struct drm_printer p = drm_info_printer(i915->drm.dev);
956
957                                 pr_err("%s: Failed to start request %x, at %x\n",
958                                        __func__, prev->fence.seqno, hws_seqno(&h, prev));
959                                 intel_engine_dump(prev->engine, &p,
960                                                   "%s\n", prev->engine->name);
961
962                                 i915_request_put(rq);
963                                 i915_request_put(prev);
964
965                                 i915_reset(i915, 0);
966                                 i915_gem_set_wedged(i915);
967
968                                 err = -EIO;
969                                 goto fini;
970                         }
971
972                         reset_count = fake_hangcheck(prev);
973
974                         i915_reset(i915, I915_RESET_QUIET);
975
976                         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
977                                             &i915->gpu_error.flags));
978
979                         if (prev->fence.error != -EIO) {
980                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
981                                        prev->fence.error);
982                                 i915_request_put(rq);
983                                 i915_request_put(prev);
984                                 err = -EINVAL;
985                                 goto fini;
986                         }
987
988                         if (rq->fence.error) {
989                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
990                                        rq->fence.error);
991                                 i915_request_put(rq);
992                                 i915_request_put(prev);
993                                 err = -EINVAL;
994                                 goto fini;
995                         }
996
997                         if (i915_reset_count(&i915->gpu_error) == reset_count) {
998                                 pr_err("No GPU reset recorded!\n");
999                                 i915_request_put(rq);
1000                                 i915_request_put(prev);
1001                                 err = -EINVAL;
1002                                 goto fini;
1003                         }
1004
1005                         i915_request_put(prev);
1006                         prev = rq;
1007                         count++;
1008                 } while (time_before(jiffies, end_time));
1009                 pr_info("%s: Completed %d resets\n", engine->name, count);
1010
1011                 *h.batch = MI_BATCH_BUFFER_END;
1012                 i915_gem_chipset_flush(i915);
1013
1014                 i915_request_put(prev);
1015
1016                 err = flush_test(i915, I915_WAIT_LOCKED);
1017                 if (err)
1018                         break;
1019         }
1020
1021 fini:
1022         hang_fini(&h);
1023 unlock:
1024         mutex_unlock(&i915->drm.struct_mutex);
1025         global_reset_unlock(i915);
1026
1027         if (i915_terminally_wedged(&i915->gpu_error))
1028                 return -EIO;
1029
1030         return err;
1031 }
1032
1033 static int igt_handle_error(void *arg)
1034 {
1035         struct drm_i915_private *i915 = arg;
1036         struct intel_engine_cs *engine = i915->engine[RCS];
1037         struct hang h;
1038         struct i915_request *rq;
1039         struct i915_gpu_state *error;
1040         int err;
1041
1042         /* Check that we can issue a global GPU and engine reset */
1043
1044         if (!intel_has_reset_engine(i915))
1045                 return 0;
1046
1047         if (!intel_engine_can_store_dword(i915->engine[RCS]))
1048                 return 0;
1049
1050         mutex_lock(&i915->drm.struct_mutex);
1051
1052         err = hang_init(&h, i915);
1053         if (err)
1054                 goto err_unlock;
1055
1056         rq = hang_create_request(&h, engine);
1057         if (IS_ERR(rq)) {
1058                 err = PTR_ERR(rq);
1059                 goto err_fini;
1060         }
1061
1062         i915_request_get(rq);
1063         __i915_request_add(rq, true);
1064
1065         if (!wait_for_hang(&h, rq)) {
1066                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1067
1068                 pr_err("%s: Failed to start request %x, at %x\n",
1069                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1070                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1071
1072                 i915_reset(i915, 0);
1073                 i915_gem_set_wedged(i915);
1074
1075                 err = -EIO;
1076                 goto err_request;
1077         }
1078
1079         mutex_unlock(&i915->drm.struct_mutex);
1080
1081         /* Temporarily disable error capture */
1082         error = xchg(&i915->gpu_error.first_error, (void *)-1);
1083
1084         engine->hangcheck.stalled = true;
1085         engine->hangcheck.seqno = intel_engine_get_seqno(engine);
1086
1087         i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
1088
1089         xchg(&i915->gpu_error.first_error, error);
1090
1091         mutex_lock(&i915->drm.struct_mutex);
1092
1093         if (rq->fence.error != -EIO) {
1094                 pr_err("Guilty request not identified!\n");
1095                 err = -EINVAL;
1096                 goto err_request;
1097         }
1098
1099 err_request:
1100         i915_request_put(rq);
1101 err_fini:
1102         hang_fini(&h);
1103 err_unlock:
1104         mutex_unlock(&i915->drm.struct_mutex);
1105         return err;
1106 }
1107
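/*
 * Entry point for the live hangcheck selftests (typically run through the
 * i915.live_selftests module parameter, e.g. by igt). Periodic hangcheck
 * is disabled for the duration of the run so that background hang
 * detection does not race with the deliberate hangs injected above.
 */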
1108 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1109 {
1110         static const struct i915_subtest tests[] = {
1111                 SUBTEST(igt_global_reset), /* attempt to recover GPU first */
1112                 SUBTEST(igt_hang_sanitycheck),
1113                 SUBTEST(igt_reset_idle_engine),
1114                 SUBTEST(igt_reset_active_engine),
1115                 SUBTEST(igt_reset_idle_engine_others),
1116                 SUBTEST(igt_reset_active_engine_others),
1117                 SUBTEST(igt_wait_reset),
1118                 SUBTEST(igt_reset_queue),
1119                 SUBTEST(igt_handle_error),
1120         };
1121         bool saved_hangcheck;
1122         int err;
1123
1124         if (!intel_has_gpu_reset(i915))
1125                 return 0;
1126
1127         intel_runtime_pm_get(i915);
1128         saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1129
1130         err = i915_subtests(tests, i915);
1131
1132         i915_modparams.enable_hangcheck = saved_hangcheck;
1133         intel_runtime_pm_put(i915);
1134
1135         return err;
1136 }