drivers/gpu/drm/i915/selftests/intel_hangcheck.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/kthread.h>
26
27 #include "../i915_selftest.h"
28
29 #include "mock_context.h"
30 #include "mock_drm.h"
31
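/*
 * Scaffolding shared by the reset/hangcheck selftests: a kernel context,
 * a batch buffer ("obj"/"batch") containing a self-looping batch, and a
 * scratch page ("hws"/"seqno") used as a makeshift hardware status page
 * which the batch writes its seqno into, so the CPU can tell when the
 * hang has actually started executing on the GPU.
 */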
32 struct hang {
33         struct drm_i915_private *i915;
34         struct drm_i915_gem_object *hws;
35         struct drm_i915_gem_object *obj;
36         struct i915_gem_context *ctx;
37         u32 *seqno;
38         u32 *batch;
39 };
40
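/*
 * Allocate the context and both internal objects, and pin long-lived CPU
 * maps for them: the status page is marked LLC-cached and mapped
 * write-back, while the batch is mapped write-back only on LLC platforms
 * and write-combining otherwise, so the GPU observes the CPU's writes.
 */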
41 static int hang_init(struct hang *h, struct drm_i915_private *i915)
42 {
43         void *vaddr;
44         int err;
45
46         memset(h, 0, sizeof(*h));
47         h->i915 = i915;
48
49         h->ctx = kernel_context(i915);
50         if (IS_ERR(h->ctx))
51                 return PTR_ERR(h->ctx);
52
53         h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
54         if (IS_ERR(h->hws)) {
55                 err = PTR_ERR(h->hws);
56                 goto err_ctx;
57         }
58
59         h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
60         if (IS_ERR(h->obj)) {
61                 err = PTR_ERR(h->obj);
62                 goto err_hws;
63         }
64
65         i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
66         vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
67         if (IS_ERR(vaddr)) {
68                 err = PTR_ERR(vaddr);
69                 goto err_obj;
70         }
71         h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
72
73         vaddr = i915_gem_object_pin_map(h->obj,
74                                         HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
75         if (IS_ERR(vaddr)) {
76                 err = PTR_ERR(vaddr);
77                 goto err_unpin_hws;
78         }
79         h->batch = vaddr;
80
81         return 0;
82
83 err_unpin_hws:
84         i915_gem_object_unpin_map(h->hws);
85 err_obj:
86         i915_gem_object_put(h->obj);
87 err_hws:
88         i915_gem_object_put(h->hws);
89 err_ctx:
90         kernel_context_close(h->ctx);
91         return err;
92 }
93
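/*
 * Each fence context gets its own dword slot within the status page; the
 * batch stores the request's seqno there and hws_seqno() below reads it
 * back to detect that the batch has started running.
 */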
94 static u64 hws_address(const struct i915_vma *hws,
95                        const struct i915_request *rq)
96 {
97         return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
98 }
99
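/*
 * Build the hanging batch: store the request's seqno into its status-page
 * slot, then spin forever by branching back to the start of the batch with
 * MI_BATCH_BUFFER_START (hence "recurse"). The MI_ARB_CHECK instructions
 * keep arbitration points inside the loop. The only ways out are a reset,
 * or the CPU overwriting the loop with MI_BATCH_BUFFER_END as hang_fini()
 * and the sanitycheck do. The store/branch encodings differ per GEN, hence
 * the cascade below.
 */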
100 static int emit_recurse_batch(struct hang *h,
101                               struct i915_request *rq)
102 {
103         struct drm_i915_private *i915 = h->i915;
104         struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
105         struct i915_vma *hws, *vma;
106         unsigned int flags;
107         u32 *batch;
108         int err;
109
110         vma = i915_vma_instance(h->obj, vm, NULL);
111         if (IS_ERR(vma))
112                 return PTR_ERR(vma);
113
114         hws = i915_vma_instance(h->hws, vm, NULL);
115         if (IS_ERR(hws))
116                 return PTR_ERR(hws);
117
118         err = i915_vma_pin(vma, 0, 0, PIN_USER);
119         if (err)
120                 return err;
121
122         err = i915_vma_pin(hws, 0, 0, PIN_USER);
123         if (err)
124                 goto unpin_vma;
125
126         i915_vma_move_to_active(vma, rq, 0);
127         if (!i915_gem_object_has_active_reference(vma->obj)) {
128                 i915_gem_object_get(vma->obj);
129                 i915_gem_object_set_active_reference(vma->obj);
130         }
131
132         i915_vma_move_to_active(hws, rq, 0);
133         if (!i915_gem_object_has_active_reference(hws->obj)) {
134                 i915_gem_object_get(hws->obj);
135                 i915_gem_object_set_active_reference(hws->obj);
136         }
137
138         batch = h->batch;
139         if (INTEL_GEN(i915) >= 8) {
140                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
141                 *batch++ = lower_32_bits(hws_address(hws, rq));
142                 *batch++ = upper_32_bits(hws_address(hws, rq));
143                 *batch++ = rq->fence.seqno;
144                 *batch++ = MI_ARB_CHECK;
145
146                 memset(batch, 0, 1024);
147                 batch += 1024 / sizeof(*batch);
148
149                 *batch++ = MI_ARB_CHECK;
150                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
151                 *batch++ = lower_32_bits(vma->node.start);
152                 *batch++ = upper_32_bits(vma->node.start);
153         } else if (INTEL_GEN(i915) >= 6) {
154                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
155                 *batch++ = 0;
156                 *batch++ = lower_32_bits(hws_address(hws, rq));
157                 *batch++ = rq->fence.seqno;
158                 *batch++ = MI_ARB_CHECK;
159
160                 memset(batch, 0, 1024);
161                 batch += 1024 / sizeof(*batch);
162
163                 *batch++ = MI_ARB_CHECK;
164                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
165                 *batch++ = lower_32_bits(vma->node.start);
166         } else if (INTEL_GEN(i915) >= 4) {
167                 *batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
168                 *batch++ = 0;
169                 *batch++ = lower_32_bits(hws_address(hws, rq));
170                 *batch++ = rq->fence.seqno;
171                 *batch++ = MI_ARB_CHECK;
172
173                 memset(batch, 0, 1024);
174                 batch += 1024 / sizeof(*batch);
175
176                 *batch++ = MI_ARB_CHECK;
177                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
178                 *batch++ = lower_32_bits(vma->node.start);
179         } else {
180                 *batch++ = MI_STORE_DWORD_IMM;
181                 *batch++ = lower_32_bits(hws_address(hws, rq));
182                 *batch++ = rq->fence.seqno;
183                 *batch++ = MI_ARB_CHECK;
184
185                 memset(batch, 0, 1024);
186                 batch += 1024 / sizeof(*batch);
187
188                 *batch++ = MI_ARB_CHECK;
189                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
190                 *batch++ = lower_32_bits(vma->node.start);
191         }
192         *batch++ = MI_BATCH_BUFFER_END; /* not reached */
193         i915_gem_chipset_flush(h->i915);
194
195         flags = 0;
196         if (INTEL_GEN(vm->i915) <= 5)
197                 flags |= I915_DISPATCH_SECURE;
198
199         err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
200
201         i915_vma_unpin(hws);
202 unpin_vma:
203         i915_vma_unpin(vma);
204         return err;
205 }
206
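/*
 * Allocate a request on the chosen engine carrying the hanging batch. If
 * the current batch object is still active (a previous hang has not been
 * cleaned up yet), a fresh batch is allocated first rather than rewriting
 * one the GPU may still be spinning on.
 */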
207 static struct i915_request *
208 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
209 {
210         struct i915_request *rq;
211         int err;
212
213         if (i915_gem_object_is_active(h->obj)) {
214                 struct drm_i915_gem_object *obj;
215                 void *vaddr;
216
217                 obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
218                 if (IS_ERR(obj))
219                         return ERR_CAST(obj);
220
221                 vaddr = i915_gem_object_pin_map(obj,
222                                                 HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
223                 if (IS_ERR(vaddr)) {
224                         i915_gem_object_put(obj);
225                         return ERR_CAST(vaddr);
226                 }
227
228                 i915_gem_object_unpin_map(h->obj);
229                 i915_gem_object_put(h->obj);
230
231                 h->obj = obj;
232                 h->batch = vaddr;
233         }
234
235         rq = i915_request_alloc(engine, h->ctx);
236         if (IS_ERR(rq))
237                 return rq;
238
239         err = emit_recurse_batch(h, rq);
240         if (err) {
241                 __i915_request_add(rq, false);
242                 return ERR_PTR(err);
243         }
244
245         return rq;
246 }
247
248 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
249 {
250         return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
251 }
252
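/*
 * wedge_me is a watchdog for waits issued by the selftests themselves: if
 * a guarded wait does not complete within the timeout, the delayed work
 * fires and declares the GPU terminally wedged so that the test run fails
 * fast instead of hanging the machine.
 */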
253 struct wedge_me {
254         struct delayed_work work;
255         struct drm_i915_private *i915;
256         const void *symbol;
257 };
258
259 static void wedge_me(struct work_struct *work)
260 {
261         struct wedge_me *w = container_of(work, typeof(*w), work.work);
262
263         pr_err("%pS timed out, cancelling all further testing.\n",
264                w->symbol);
265         i915_gem_set_wedged(w->i915);
266 }
267
268 static void __init_wedge(struct wedge_me *w,
269                          struct drm_i915_private *i915,
270                          long timeout,
271                          const void *symbol)
272 {
273         w->i915 = i915;
274         w->symbol = symbol;
275
276         INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
277         schedule_delayed_work(&w->work, timeout);
278 }
279
280 static void __fini_wedge(struct wedge_me *w)
281 {
282         cancel_delayed_work_sync(&w->work);
283         destroy_delayed_work_on_stack(&w->work);
284         w->i915 = NULL;
285 }
286
287 #define wedge_on_timeout(W, DEV, TIMEOUT)                               \
288         for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
289              (W)->i915;                                                 \
290              __fini_wedge((W)))
291
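/*
 * Drain outstanding work between test phases, guarded by the watchdog
 * above; report -EIO if the device has become terminally wedged.
 */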
292 static noinline int
293 flush_test(struct drm_i915_private *i915, unsigned int flags)
294 {
295         struct wedge_me w;
296
297         cond_resched();
298
299         wedge_on_timeout(&w, i915, HZ)
300                 i915_gem_wait_for_idle(i915, flags);
301
302         return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
303 }
304
305 static void hang_fini(struct hang *h)
306 {
307         *h->batch = MI_BATCH_BUFFER_END;
308         i915_gem_chipset_flush(h->i915);
309
310         i915_gem_object_unpin_map(h->obj);
311         i915_gem_object_put(h->obj);
312
313         i915_gem_object_unpin_map(h->hws);
314         i915_gem_object_put(h->hws);
315
316         kernel_context_close(h->ctx);
317
318         flush_test(h->i915, I915_WAIT_LOCKED);
319 }
320
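/*
 * Wait for the hanging batch to start executing: busy-wait for up to 10us,
 * then sleep-wait for up to a second, for the seqno to appear in the
 * status page. Returns true once the batch is confirmed running.
 */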
321 static bool wait_for_hang(struct hang *h, struct i915_request *rq)
322 {
323         return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
324                                                rq->fence.seqno),
325                              10) &&
326                  wait_for(i915_seqno_passed(hws_seqno(h, rq),
327                                             rq->fence.seqno),
328                           1000));
329 }
330
331 static int igt_hang_sanitycheck(void *arg)
332 {
333         struct drm_i915_private *i915 = arg;
334         struct i915_request *rq;
335         struct intel_engine_cs *engine;
336         enum intel_engine_id id;
337         struct hang h;
338         int err;
339
340         /* Basic check that we can execute our hanging batch */
341
342         mutex_lock(&i915->drm.struct_mutex);
343         err = hang_init(&h, i915);
344         if (err)
345                 goto unlock;
346
347         for_each_engine(engine, i915, id) {
348                 long timeout;
349
350                 if (!intel_engine_can_store_dword(engine))
351                         continue;
352
353                 rq = hang_create_request(&h, engine);
354                 if (IS_ERR(rq)) {
355                         err = PTR_ERR(rq);
356                         pr_err("Failed to create request for %s, err=%d\n",
357                                engine->name, err);
358                         goto fini;
359                 }
360
361                 i915_request_get(rq);
362
363                 *h.batch = MI_BATCH_BUFFER_END;
364                 i915_gem_chipset_flush(i915);
365
366                 __i915_request_add(rq, true);
367
368                 timeout = i915_request_wait(rq,
369                                             I915_WAIT_LOCKED,
370                                             MAX_SCHEDULE_TIMEOUT);
371                 i915_request_put(rq);
372
373                 if (timeout < 0) {
374                         err = timeout;
375                         pr_err("Wait for request failed on %s, err=%d\n",
376                                engine->name, err);
377                         goto fini;
378                 }
379         }
380
381 fini:
382         hang_fini(&h);
383 unlock:
384         mutex_unlock(&i915->drm.struct_mutex);
385         return err;
386 }
387
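/*
 * Serialise with the driver's own reset handling by claiming the global
 * I915_RESET_BACKOFF bit and every per-engine reset bit, much as the
 * driver's error handler does before performing a reset.
 */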
388 static void global_reset_lock(struct drm_i915_private *i915)
389 {
390         struct intel_engine_cs *engine;
391         enum intel_engine_id id;
392
393         pr_debug("%s: current gpu_error=%08lx\n",
394                  __func__, i915->gpu_error.flags);
395
396         while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
397                 wait_event(i915->gpu_error.reset_queue,
398                            !test_bit(I915_RESET_BACKOFF,
399                                      &i915->gpu_error.flags));
400
401         for_each_engine(engine, i915, id) {
402                 while (test_and_set_bit(I915_RESET_ENGINE + id,
403                                         &i915->gpu_error.flags))
404                         wait_on_bit(&i915->gpu_error.flags,
405                                     I915_RESET_ENGINE + id,
406                                     TASK_UNINTERRUPTIBLE);
407         }
408 }
409
410 static void global_reset_unlock(struct drm_i915_private *i915)
411 {
412         struct intel_engine_cs *engine;
413         enum intel_engine_id id;
414
415         for_each_engine(engine, i915, id)
416                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
417
418         clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
419         wake_up_all(&i915->gpu_error.reset_queue);
420 }
421
422 static int igt_global_reset(void *arg)
423 {
424         struct drm_i915_private *i915 = arg;
425         unsigned int reset_count;
426         int err = 0;
427
428         /* Check that we can issue a global GPU reset */
429
430         global_reset_lock(i915);
431         set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
432
433         mutex_lock(&i915->drm.struct_mutex);
434         reset_count = i915_reset_count(&i915->gpu_error);
435
436         i915_reset(i915, I915_RESET_QUIET);
437
438         if (i915_reset_count(&i915->gpu_error) == reset_count) {
439                 pr_err("No GPU reset recorded!\n");
440                 err = -EINVAL;
441         }
442         mutex_unlock(&i915->drm.struct_mutex);
443
444         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
445         global_reset_unlock(i915);
446
447         if (i915_terminally_wedged(&i915->gpu_error))
448                 err = -EIO;
449
450         return err;
451 }
452
453 static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
454 {
455         struct intel_engine_cs *engine;
456         enum intel_engine_id id;
457         struct hang h;
458         int err = 0;
459
460         /* Check that we can reset an engine while it is idle (a no-op) or while it is running our hanging batch */
461
462         if (!intel_has_reset_engine(i915))
463                 return 0;
464
465         if (active) {
466                 mutex_lock(&i915->drm.struct_mutex);
467                 err = hang_init(&h, i915);
468                 mutex_unlock(&i915->drm.struct_mutex);
469                 if (err)
470                         return err;
471         }
472
473         for_each_engine(engine, i915, id) {
474                 unsigned int reset_count, reset_engine_count;
475                 IGT_TIMEOUT(end_time);
476
477                 if (active && !intel_engine_can_store_dword(engine))
478                         continue;
479
480                 reset_count = i915_reset_count(&i915->gpu_error);
481                 reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
482                                                              engine);
483
484                 set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
485                 do {
486                         if (active) {
487                                 struct i915_request *rq;
488
489                                 mutex_lock(&i915->drm.struct_mutex);
490                                 rq = hang_create_request(&h, engine);
491                                 if (IS_ERR(rq)) {
492                                         err = PTR_ERR(rq);
493                                         mutex_unlock(&i915->drm.struct_mutex);
494                                         break;
495                                 }
496
497                                 i915_request_get(rq);
498                                 __i915_request_add(rq, true);
499                                 mutex_unlock(&i915->drm.struct_mutex);
500
501                                 if (!wait_for_hang(&h, rq)) {
502                                         struct drm_printer p = drm_info_printer(i915->drm.dev);
503
504                                         pr_err("%s: Failed to start request %x, at %x\n",
505                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
506                                         intel_engine_dump(engine, &p,
507                                                           "%s\n", engine->name);
508
509                                         i915_request_put(rq);
510                                         err = -EIO;
511                                         break;
512                                 }
513
514                                 i915_request_put(rq);
515                         }
516
517                         engine->hangcheck.stalled = true;
518                         engine->hangcheck.seqno =
519                                 intel_engine_get_seqno(engine);
520
521                         err = i915_reset_engine(engine, I915_RESET_QUIET);
522                         if (err) {
523                                 pr_err("i915_reset_engine failed\n");
524                                 break;
525                         }
526
527                         if (i915_reset_count(&i915->gpu_error) != reset_count) {
528                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
529                                 err = -EINVAL;
530                                 break;
531                         }
532
533                         reset_engine_count += active;
534                         if (i915_reset_engine_count(&i915->gpu_error, engine) !=
535                             reset_engine_count) {
536                                 pr_err("%s engine reset %srecorded!\n",
537                                        engine->name, active ? "not " : "");
538                                 err = -EINVAL;
539                                 break;
540                         }
541
542                         engine->hangcheck.stalled = false;
543                 } while (time_before(jiffies, end_time));
544                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
545
546                 if (err)
547                         break;
548
549                 err = flush_test(i915, 0);
550                 if (err)
551                         break;
552         }
553
554         if (i915_terminally_wedged(&i915->gpu_error))
555                 err = -EIO;
556
557         if (active) {
558                 mutex_lock(&i915->drm.struct_mutex);
559                 hang_fini(&h);
560                 mutex_unlock(&i915->drm.struct_mutex);
561         }
562
563         return err;
564 }
565
566 static int igt_reset_idle_engine(void *arg)
567 {
568         return __igt_reset_engine(arg, false);
569 }
570
571 static int igt_reset_active_engine(void *arg)
572 {
573         return __igt_reset_engine(arg, true);
574 }
575
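/*
 * Kthread body used by the *_others tests: keep an "innocent" engine busy
 * by continuously submitting, and then waiting upon, requests from two
 * alternating contexts while a different engine is being reset, so that we
 * can check the reset did not leak across engines.
 */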
576 static int active_engine(void *data)
577 {
578         struct intel_engine_cs *engine = data;
579         struct i915_request *rq[2] = {};
580         struct i915_gem_context *ctx[2];
581         struct drm_file *file;
582         unsigned long count = 0;
583         int err = 0;
584
585         file = mock_file(engine->i915);
586         if (IS_ERR(file))
587                 return PTR_ERR(file);
588
589         mutex_lock(&engine->i915->drm.struct_mutex);
590         ctx[0] = live_context(engine->i915, file);
591         mutex_unlock(&engine->i915->drm.struct_mutex);
592         if (IS_ERR(ctx[0])) {
593                 err = PTR_ERR(ctx[0]);
594                 goto err_file;
595         }
596
597         mutex_lock(&engine->i915->drm.struct_mutex);
598         ctx[1] = live_context(engine->i915, file);
599         mutex_unlock(&engine->i915->drm.struct_mutex);
600         if (IS_ERR(ctx[1])) {
601                 err = PTR_ERR(ctx[1]);
602                 i915_gem_context_put(ctx[0]);
603                 goto err_file;
604         }
605
606         while (!kthread_should_stop()) {
607                 unsigned int idx = count++ & 1;
608                 struct i915_request *old = rq[idx];
609                 struct i915_request *new;
610
611                 mutex_lock(&engine->i915->drm.struct_mutex);
612                 new = i915_request_alloc(engine, ctx[idx]);
613                 if (IS_ERR(new)) {
614                         mutex_unlock(&engine->i915->drm.struct_mutex);
615                         err = PTR_ERR(new);
616                         break;
617                 }
618
619                 rq[idx] = i915_request_get(new);
620                 i915_request_add(new);
621                 mutex_unlock(&engine->i915->drm.struct_mutex);
622
623                 if (old) {
624                         i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
625                         i915_request_put(old);
626                 }
627         }
628
629         for (count = 0; count < ARRAY_SIZE(rq); count++)
630                 i915_request_put(rq[count]);
631
632 err_file:
633         mock_file_free(engine->i915, file);
634         return err;
635 }
636
637 static int __igt_reset_engine_others(struct drm_i915_private *i915,
638                                      bool active)
639 {
640         struct intel_engine_cs *engine, *other;
641         enum intel_engine_id id, tmp;
642         struct hang h;
643         int err = 0;
644
645         /* Check that issuing a reset on one engine does not interfere
646          * with any other engine.
647          */
648
649         if (!intel_has_reset_engine(i915))
650                 return 0;
651
652         if (active) {
653                 mutex_lock(&i915->drm.struct_mutex);
654                 err = hang_init(&h, i915);
655                 mutex_unlock(&i915->drm.struct_mutex);
656                 if (err)
657                         return err;
658         }
659
660         for_each_engine(engine, i915, id) {
661                 struct task_struct *threads[I915_NUM_ENGINES] = {};
662                 unsigned long resets[I915_NUM_ENGINES];
663                 unsigned long global = i915_reset_count(&i915->gpu_error);
664                 unsigned long count = 0;
665                 IGT_TIMEOUT(end_time);
666
667                 if (active && !intel_engine_can_store_dword(engine))
668                         continue;
669
670                 memset(threads, 0, sizeof(threads));
671                 for_each_engine(other, i915, tmp) {
672                         struct task_struct *tsk;
673
674                         resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
675                                                               other);
676
677                         if (other == engine)
678                                 continue;
679
680                         tsk = kthread_run(active_engine, other,
681                                           "igt/%s", other->name);
682                         if (IS_ERR(tsk)) {
683                                 err = PTR_ERR(tsk);
684                                 goto unwind;
685                         }
686
687                         threads[tmp] = tsk;
688                         get_task_struct(tsk);
689                 }
690
691                 set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
692                 do {
693                         if (active) {
694                                 struct i915_request *rq;
695
696                                 mutex_lock(&i915->drm.struct_mutex);
697                                 rq = hang_create_request(&h, engine);
698                                 if (IS_ERR(rq)) {
699                                         err = PTR_ERR(rq);
700                                         mutex_unlock(&i915->drm.struct_mutex);
701                                         break;
702                                 }
703
704                                 i915_request_get(rq);
705                                 __i915_request_add(rq, true);
706                                 mutex_unlock(&i915->drm.struct_mutex);
707
708                                 if (!wait_for_hang(&h, rq)) {
709                                         struct drm_printer p = drm_info_printer(i915->drm.dev);
710
711                                         pr_err("%s: Failed to start request %x, at %x\n",
712                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
713                                         intel_engine_dump(engine, &p,
714                                                           "%s\n", engine->name);
715
716                                         i915_request_put(rq);
717                                         err = -EIO;
718                                         break;
719                                 }
720
721                                 i915_request_put(rq);
722                         }
723
724                         engine->hangcheck.stalled = true;
725                         engine->hangcheck.seqno =
726                                 intel_engine_get_seqno(engine);
727
728                         err = i915_reset_engine(engine, I915_RESET_QUIET);
729                         if (err) {
730                                 pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
731                                        engine->name, active ? "active" : "idle", err);
732                                 break;
733                         }
734
735                         engine->hangcheck.stalled = false;
736                         count++;
737                 } while (time_before(jiffies, end_time));
738                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
739                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
740                         engine->name, active ? "active" : "idle", count);
741
742                 if (i915_reset_engine_count(&i915->gpu_error, engine) -
743                     resets[engine->id] != (active ? count : 0)) {
744                         pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
745                                engine->name, active ? "active" : "idle", count,
746                                i915_reset_engine_count(&i915->gpu_error,
747                                                        engine) - resets[engine->id]);
748                         if (!err)
749                                 err = -EINVAL;
750                 }
751
752 unwind:
753                 for_each_engine(other, i915, tmp) {
754                         int ret;
755
756                         if (!threads[tmp])
757                                 continue;
758
759                         ret = kthread_stop(threads[tmp]);
760                         if (ret) {
761                                 pr_err("kthread for other engine %s failed, err=%d\n",
762                                        other->name, ret);
763                                 if (!err)
764                                         err = ret;
765                         }
766                         put_task_struct(threads[tmp]);
767
768                         if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
769                                                                    other)) {
770                                 pr_err("Innocent engine %s was reset (count=%ld)\n",
771                                        other->name,
772                                        i915_reset_engine_count(&i915->gpu_error,
773                                                                other) - resets[tmp]);
774                                 if (!err)
775                                         err = -EINVAL;
776                         }
777                 }
778
779                 if (global != i915_reset_count(&i915->gpu_error)) {
780                         pr_err("Global reset (count=%ld)!\n",
781                                i915_reset_count(&i915->gpu_error) - global);
782                         if (!err)
783                                 err = -EINVAL;
784                 }
785
786                 if (err)
787                         break;
788
789                 err = flush_test(i915, 0);
790                 if (err)
791                         break;
792         }
793
794         if (i915_terminally_wedged(&i915->gpu_error))
795                 err = -EIO;
796
797         if (active) {
798                 mutex_lock(&i915->drm.struct_mutex);
799                 hang_fini(&h);
800                 mutex_unlock(&i915->drm.struct_mutex);
801         }
802
803         return err;
804 }
805
806 static int igt_reset_idle_engine_others(void *arg)
807 {
808         return __igt_reset_engine_others(arg, false);
809 }
810
811 static int igt_reset_active_engine_others(void *arg)
812 {
813         return __igt_reset_engine_others(arg, true);
814 }
815
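/*
 * Pretend that hangcheck has already declared the engine stuck (stalled on
 * its current seqno) and raise I915_RESET_HANDOFF so that a waiter blocked
 * on the request takes over and performs the reset. Returns the global
 * reset count sampled beforehand for the caller to compare against.
 */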
816 static u32 fake_hangcheck(struct i915_request *rq)
817 {
818         u32 reset_count;
819
820         rq->engine->hangcheck.stalled = true;
821         rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);
822
823         reset_count = i915_reset_count(&rq->i915->gpu_error);
824
825         set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
826         wake_up_all(&rq->i915->gpu_error.wait_queue);
827
828         return reset_count;
829 }
830
831 static int igt_wait_reset(void *arg)
832 {
833         struct drm_i915_private *i915 = arg;
834         struct i915_request *rq;
835         unsigned int reset_count;
836         struct hang h;
837         long timeout;
838         int err;
839
840         if (!intel_engine_can_store_dword(i915->engine[RCS]))
841                 return 0;
842
843         /* Check that we detect a stuck waiter and issue a reset */
844
845         global_reset_lock(i915);
846
847         mutex_lock(&i915->drm.struct_mutex);
848         err = hang_init(&h, i915);
849         if (err)
850                 goto unlock;
851
852         rq = hang_create_request(&h, i915->engine[RCS]);
853         if (IS_ERR(rq)) {
854                 err = PTR_ERR(rq);
855                 goto fini;
856         }
857
858         i915_request_get(rq);
859         __i915_request_add(rq, true);
860
861         if (!wait_for_hang(&h, rq)) {
862                 struct drm_printer p = drm_info_printer(i915->drm.dev);
863
864                 pr_err("%s: Failed to start request %x, at %x\n",
865                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
866                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
867
868                 i915_reset(i915, 0);
869                 i915_gem_set_wedged(i915);
870
871                 err = -EIO;
872                 goto out_rq;
873         }
874
875         reset_count = fake_hangcheck(rq);
876
877         timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
878         if (timeout < 0) {
879                 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
880                        timeout);
881                 err = timeout;
882                 goto out_rq;
883         }
884
885         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
886         if (i915_reset_count(&i915->gpu_error) == reset_count) {
887                 pr_err("No GPU reset recorded!\n");
888                 err = -EINVAL;
889                 goto out_rq;
890         }
891
892 out_rq:
893         i915_request_put(rq);
894 fini:
895         hang_fini(&h);
896 unlock:
897         mutex_unlock(&i915->drm.struct_mutex);
898         global_reset_unlock(i915);
899
900         if (i915_terminally_wedged(&i915->gpu_error))
901                 return -EIO;
902
903         return err;
904 }
905
906 static int igt_reset_queue(void *arg)
907 {
908         struct drm_i915_private *i915 = arg;
909         struct intel_engine_cs *engine;
910         enum intel_engine_id id;
911         struct hang h;
912         int err;
913
914         /* Check that we replay pending requests following a hang */
915
916         global_reset_lock(i915);
917
918         mutex_lock(&i915->drm.struct_mutex);
919         err = hang_init(&h, i915);
920         if (err)
921                 goto unlock;
922
923         for_each_engine(engine, i915, id) {
924                 struct i915_request *prev;
925                 IGT_TIMEOUT(end_time);
926                 unsigned int count;
927
928                 if (!intel_engine_can_store_dword(engine))
929                         continue;
930
931                 prev = hang_create_request(&h, engine);
932                 if (IS_ERR(prev)) {
933                         err = PTR_ERR(prev);
934                         goto fini;
935                 }
936
937                 i915_request_get(prev);
938                 __i915_request_add(prev, true);
939
940                 count = 0;
941                 do {
942                         struct i915_request *rq;
943                         unsigned int reset_count;
944
945                         rq = hang_create_request(&h, engine);
946                         if (IS_ERR(rq)) {
947                                 err = PTR_ERR(rq);
948                                 goto fini;
949                         }
950
951                         i915_request_get(rq);
952                         __i915_request_add(rq, true);
953
954                         if (!wait_for_hang(&h, prev)) {
955                                 struct drm_printer p = drm_info_printer(i915->drm.dev);
956
957                                 pr_err("%s: Failed to start request %x, at %x\n",
958                                        __func__, prev->fence.seqno, hws_seqno(&h, prev));
959                                 intel_engine_dump(prev->engine, &p,
960                                                   "%s\n", prev->engine->name);
961
962                                 i915_request_put(rq);
963                                 i915_request_put(prev);
964
965                                 i915_reset(i915, 0);
966                                 i915_gem_set_wedged(i915);
967
968                                 err = -EIO;
969                                 goto fini;
970                         }
971
972                         reset_count = fake_hangcheck(prev);
973
974                         i915_reset(i915, I915_RESET_QUIET);
975
976                         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
977                                             &i915->gpu_error.flags));
978
979                         if (prev->fence.error != -EIO) {
980                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
981                                        prev->fence.error);
982                                 i915_request_put(rq);
983                                 i915_request_put(prev);
984                                 err = -EINVAL;
985                                 goto fini;
986                         }
987
988                         if (rq->fence.error) {
989                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
990                                        rq->fence.error);
991                                 i915_request_put(rq);
992                                 i915_request_put(prev);
993                                 err = -EINVAL;
994                                 goto fini;
995                         }
996
997                         if (i915_reset_count(&i915->gpu_error) == reset_count) {
998                                 pr_err("No GPU reset recorded!\n");
999                                 i915_request_put(rq);
1000                                 i915_request_put(prev);
1001                                 err = -EINVAL;
1002                                 goto fini;
1003                         }
1004
1005                         i915_request_put(prev);
1006                         prev = rq;
1007                         count++;
1008                 } while (time_before(jiffies, end_time));
1009                 pr_info("%s: Completed %d resets\n", engine->name, count);
1010
1011                 *h.batch = MI_BATCH_BUFFER_END;
1012                 i915_gem_chipset_flush(i915);
1013
1014                 i915_request_put(prev);
1015
1016                 err = flush_test(i915, I915_WAIT_LOCKED);
1017                 if (err)
1018                         break;
1019         }
1020
1021 fini:
1022         hang_fini(&h);
1023 unlock:
1024         mutex_unlock(&i915->drm.struct_mutex);
1025         global_reset_unlock(i915);
1026
1027         if (i915_terminally_wedged(&i915->gpu_error))
1028                 return -EIO;
1029
1030         return err;
1031 }
1032
1033 static int igt_handle_error(void *arg)
1034 {
1035         struct drm_i915_private *i915 = arg;
1036         struct intel_engine_cs *engine = i915->engine[RCS];
1037         struct hang h;
1038         struct i915_request *rq;
1039         struct i915_gpu_state *error;
1040         int err;
1041
1042         /* Check that we can issue a global GPU and engine reset */
1043
1044         if (!intel_has_reset_engine(i915))
1045                 return 0;
1046
1047         if (!intel_engine_can_store_dword(i915->engine[RCS]))
1048                 return 0;
1049
1050         mutex_lock(&i915->drm.struct_mutex);
1051
1052         err = hang_init(&h, i915);
1053         if (err)
1054                 goto err_unlock;
1055
1056         rq = hang_create_request(&h, engine);
1057         if (IS_ERR(rq)) {
1058                 err = PTR_ERR(rq);
1059                 goto err_fini;
1060         }
1061
1062         i915_request_get(rq);
1063         __i915_request_add(rq, true);
1064
1065         if (!wait_for_hang(&h, rq)) {
1066                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1067
1068                 pr_err("%s: Failed to start request %x, at %x\n",
1069                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1070                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1071
1072                 i915_reset(i915, 0);
1073                 i915_gem_set_wedged(i915);
1074
1075                 err = -EIO;
1076                 goto err_request;
1077         }
1078
1079         mutex_unlock(&i915->drm.struct_mutex);
1080
1081         /* Temporarily disable error capture */
1082         error = xchg(&i915->gpu_error.first_error, (void *)-1);
1083
1084         engine->hangcheck.stalled = true;
1085         engine->hangcheck.seqno = intel_engine_get_seqno(engine);
1086
1087         i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
1088
1089         xchg(&i915->gpu_error.first_error, error);
1090
1091         mutex_lock(&i915->drm.struct_mutex);
1092
1093         if (rq->fence.error != -EIO) {
1094                 pr_err("Guilty request not identified!\n");
1095                 err = -EINVAL;
1096                 goto err_request;
1097         }
1098
1099 err_request:
1100         i915_request_put(rq);
1101 err_fini:
1102         hang_fini(&h);
1103 err_unlock:
1104         mutex_unlock(&i915->drm.struct_mutex);
1105         return err;
1106 }
1107
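/*
 * Entry point for the live hangcheck selftests (typically run through the
 * i915.live_selftests module parameter, e.g. by igt). Periodic hangcheck
 * is disabled for the duration of the run so that background hang
 * detection does not race with the deliberate hangs injected above.
 */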
1108 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1109 {
1110         static const struct i915_subtest tests[] = {
1111                 SUBTEST(igt_global_reset), /* attempt to recover GPU first */
1112                 SUBTEST(igt_hang_sanitycheck),
1113                 SUBTEST(igt_reset_idle_engine),
1114                 SUBTEST(igt_reset_active_engine),
1115                 SUBTEST(igt_reset_idle_engine_others),
1116                 SUBTEST(igt_reset_active_engine_others),
1117                 SUBTEST(igt_wait_reset),
1118                 SUBTEST(igt_reset_queue),
1119                 SUBTEST(igt_handle_error),
1120         };
1121         bool saved_hangcheck;
1122         int err;
1123
1124         if (!intel_has_gpu_reset(i915))
1125                 return 0;
1126
1127         intel_runtime_pm_get(i915);
1128         saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1129
1130         err = i915_subtests(tests, i915);
1131
1132         i915_modparams.enable_hangcheck = saved_hangcheck;
1133         intel_runtime_pm_put(i915);
1134
1135         return err;
1136 }