1 /* $NetBSD: selftest_hangcheck.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $ */ 2 3 /* 4 * Copyright 2016 Intel Corporation 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice (including the next 14 * paragraph) shall be included in all copies or substantial portions of the 15 * Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 23 * IN THE SOFTWARE. 24 * 25 */ 26 27 #include <sys/cdefs.h> 28 __KERNEL_RCSID(0, "$NetBSD: selftest_hangcheck.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $"); 29 30 #include <linux/kthread.h> 31 32 #include "gem/i915_gem_context.h" 33 34 #include "intel_gt.h" 35 #include "intel_engine_heartbeat.h" 36 #include "intel_engine_pm.h" 37 38 #include "i915_selftest.h" 39 #include "selftests/i915_random.h" 40 #include "selftests/igt_flush_test.h" 41 #include "selftests/igt_reset.h" 42 #include "selftests/igt_atomic.h" 43 44 #include "selftests/mock_drm.h" 45 46 #include "gem/selftests/mock_context.h" 47 #include "gem/selftests/igt_gem_utils.h" 48 49 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ 50 51 struct hang { 52 struct intel_gt *gt; 53 struct drm_i915_gem_object *hws; 54 struct drm_i915_gem_object *obj; 55 struct i915_gem_context *ctx; 56 u32 *seqno; 57 u32 *batch; 58 }; 59 60 static int hang_init(struct hang *h, struct intel_gt *gt) 61 { 62 void *vaddr; 63 int err; 64 65 memset(h, 0, sizeof(*h)); 66 h->gt = gt; 67 68 h->ctx = kernel_context(gt->i915); 69 if (IS_ERR(h->ctx)) 70 return PTR_ERR(h->ctx); 71 72 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); 73 74 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 75 if (IS_ERR(h->hws)) { 76 err = PTR_ERR(h->hws); 77 goto err_ctx; 78 } 79 80 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 81 if (IS_ERR(h->obj)) { 82 err = PTR_ERR(h->obj); 83 goto err_hws; 84 } 85 86 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); 87 vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB); 88 if (IS_ERR(vaddr)) { 89 err = PTR_ERR(vaddr); 90 goto err_obj; 91 } 92 h->seqno = memset(vaddr, 0xff, PAGE_SIZE); 93 94 vaddr = i915_gem_object_pin_map(h->obj, 95 i915_coherent_map_type(gt->i915)); 96 if (IS_ERR(vaddr)) { 97 err = PTR_ERR(vaddr); 98 goto err_unpin_hws; 99 } 100 h->batch = vaddr; 101 102 return 0; 103 104 err_unpin_hws: 105 i915_gem_object_unpin_map(h->hws); 106 err_obj: 107 i915_gem_object_put(h->obj); 108 err_hws: 109 i915_gem_object_put(h->hws); 110 err_ctx: 111 kernel_context_close(h->ctx); 112 return err; 113 } 114 115 static u64 hws_address(const struct i915_vma *hws, 116 const struct i915_request *rq) 117 { 118 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); 119 } 120 121 static int move_to_active(struct i915_vma *vma, 122 struct i915_request *rq, 123 unsigned int flags) 124 { 125 int err; 126 127 i915_vma_lock(vma); 128 err = i915_request_await_object(rq, vma->obj, 129 flags & EXEC_OBJECT_WRITE); 130 if (err == 0) 131 err = i915_vma_move_to_active(vma, rq, flags); 132 i915_vma_unlock(vma); 133 134 return err; 135 } 136 137 static struct i915_request * 138 hang_create_request(struct hang *h, struct intel_engine_cs *engine) 139 { 140 struct intel_gt *gt = h->gt; 141 struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx); 142 struct drm_i915_gem_object *obj; 143 struct i915_request *rq = NULL; 144 struct i915_vma *hws, *vma; 145 unsigned int flags; 146 void *vaddr; 147 u32 *batch; 148 int err; 149 150 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 151 if (IS_ERR(obj)) { 152 i915_vm_put(vm); 153 return ERR_CAST(obj); 154 } 155 156 vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915)); 157 if (IS_ERR(vaddr)) { 158 i915_gem_object_put(obj); 159 i915_vm_put(vm); 160 return ERR_CAST(vaddr); 161 } 162 163 i915_gem_object_unpin_map(h->obj); 164 i915_gem_object_put(h->obj); 165 166 h->obj = obj; 167 h->batch = vaddr; 168 169 vma = i915_vma_instance(h->obj, vm, NULL); 170 if (IS_ERR(vma)) { 171 i915_vm_put(vm); 172 return ERR_CAST(vma); 173 } 174 175 hws = i915_vma_instance(h->hws, vm, NULL); 176 if (IS_ERR(hws)) { 177 i915_vm_put(vm); 178 return ERR_CAST(hws); 179 } 180 181 err = i915_vma_pin(vma, 0, 0, PIN_USER); 182 if (err) { 183 i915_vm_put(vm); 184 return ERR_PTR(err); 185 } 186 187 err = i915_vma_pin(hws, 0, 0, PIN_USER); 188 if (err) 189 goto unpin_vma; 190 191 rq = igt_request_alloc(h->ctx, engine); 192 if (IS_ERR(rq)) { 193 err = PTR_ERR(rq); 194 goto unpin_hws; 195 } 196 197 err = move_to_active(vma, rq, 0); 198 if (err) 199 goto cancel_rq; 200 201 err = move_to_active(hws, rq, 0); 202 if (err) 203 goto cancel_rq; 204 205 batch = h->batch; 206 if (INTEL_GEN(gt->i915) >= 8) { 207 *batch++ = MI_STORE_DWORD_IMM_GEN4; 208 *batch++ = lower_32_bits(hws_address(hws, rq)); 209 *batch++ = upper_32_bits(hws_address(hws, rq)); 210 *batch++ = rq->fence.seqno; 211 *batch++ = MI_ARB_CHECK; 212 213 memset(batch, 0, 1024); 214 batch += 1024 / sizeof(*batch); 215 216 *batch++ = MI_ARB_CHECK; 217 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 218 *batch++ = lower_32_bits(vma->node.start); 219 *batch++ = upper_32_bits(vma->node.start); 220 } else if (INTEL_GEN(gt->i915) >= 6) { 221 *batch++ = MI_STORE_DWORD_IMM_GEN4; 222 *batch++ = 0; 223 *batch++ = lower_32_bits(hws_address(hws, rq)); 224 *batch++ = rq->fence.seqno; 225 *batch++ = MI_ARB_CHECK; 226 227 memset(batch, 0, 1024); 228 batch += 1024 / sizeof(*batch); 229 230 *batch++ = MI_ARB_CHECK; 231 *batch++ = MI_BATCH_BUFFER_START | 1 << 8; 232 *batch++ = lower_32_bits(vma->node.start); 233 } else if (INTEL_GEN(gt->i915) >= 4) { 234 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 235 *batch++ = 0; 236 *batch++ = lower_32_bits(hws_address(hws, rq)); 237 *batch++ = rq->fence.seqno; 238 *batch++ = MI_ARB_CHECK; 239 240 memset(batch, 0, 1024); 241 batch += 1024 / sizeof(*batch); 242 243 *batch++ = MI_ARB_CHECK; 244 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 245 *batch++ = lower_32_bits(vma->node.start); 246 } else { 247 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 248 *batch++ = lower_32_bits(hws_address(hws, rq)); 249 *batch++ = rq->fence.seqno; 250 *batch++ = MI_ARB_CHECK; 251 252 memset(batch, 0, 1024); 253 batch += 1024 / sizeof(*batch); 254 255 *batch++ = MI_ARB_CHECK; 256 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 257 *batch++ = lower_32_bits(vma->node.start); 258 } 259 *batch++ = MI_BATCH_BUFFER_END; /* not reached */ 260 intel_gt_chipset_flush(engine->gt); 261 262 if (rq->engine->emit_init_breadcrumb) { 263 err = rq->engine->emit_init_breadcrumb(rq); 264 if (err) 265 goto cancel_rq; 266 } 267 268 flags = 0; 269 if (INTEL_GEN(gt->i915) <= 5) 270 flags |= I915_DISPATCH_SECURE; 271 272 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); 273 274 cancel_rq: 275 if (err) { 276 i915_request_skip(rq, err); 277 i915_request_add(rq); 278 } 279 unpin_hws: 280 i915_vma_unpin(hws); 281 unpin_vma: 282 i915_vma_unpin(vma); 283 i915_vm_put(vm); 284 return err ? ERR_PTR(err) : rq; 285 } 286 287 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) 288 { 289 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); 290 } 291 292 static void hang_fini(struct hang *h) 293 { 294 *h->batch = MI_BATCH_BUFFER_END; 295 intel_gt_chipset_flush(h->gt); 296 297 i915_gem_object_unpin_map(h->obj); 298 i915_gem_object_put(h->obj); 299 300 i915_gem_object_unpin_map(h->hws); 301 i915_gem_object_put(h->hws); 302 303 kernel_context_close(h->ctx); 304 305 igt_flush_test(h->gt->i915); 306 } 307 308 static bool wait_until_running(struct hang *h, struct i915_request *rq) 309 { 310 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), 311 rq->fence.seqno), 312 10) && 313 wait_for(i915_seqno_passed(hws_seqno(h, rq), 314 rq->fence.seqno), 315 1000)); 316 } 317 318 static void engine_heartbeat_disable(struct intel_engine_cs *engine, 319 unsigned long *saved) 320 { 321 *saved = engine->props.heartbeat_interval_ms; 322 engine->props.heartbeat_interval_ms = 0; 323 324 intel_engine_pm_get(engine); 325 intel_engine_park_heartbeat(engine); 326 } 327 328 static void engine_heartbeat_enable(struct intel_engine_cs *engine, 329 unsigned long saved) 330 { 331 intel_engine_pm_put(engine); 332 333 engine->props.heartbeat_interval_ms = saved; 334 } 335 336 static int igt_hang_sanitycheck(void *arg) 337 { 338 struct intel_gt *gt = arg; 339 struct i915_request *rq; 340 struct intel_engine_cs *engine; 341 enum intel_engine_id id; 342 struct hang h; 343 int err; 344 345 /* Basic check that we can execute our hanging batch */ 346 347 err = hang_init(&h, gt); 348 if (err) 349 return err; 350 351 for_each_engine(engine, gt, id) { 352 struct intel_wedge_me w; 353 long timeout; 354 355 if (!intel_engine_can_store_dword(engine)) 356 continue; 357 358 rq = hang_create_request(&h, engine); 359 if (IS_ERR(rq)) { 360 err = PTR_ERR(rq); 361 pr_err("Failed to create request for %s, err=%d\n", 362 engine->name, err); 363 goto fini; 364 } 365 366 i915_request_get(rq); 367 368 *h.batch = MI_BATCH_BUFFER_END; 369 intel_gt_chipset_flush(engine->gt); 370 371 i915_request_add(rq); 372 373 timeout = 0; 374 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 375 timeout = i915_request_wait(rq, 0, 376 MAX_SCHEDULE_TIMEOUT); 377 if (intel_gt_is_wedged(gt)) 378 timeout = -EIO; 379 380 i915_request_put(rq); 381 382 if (timeout < 0) { 383 err = timeout; 384 pr_err("Wait for request failed on %s, err=%d\n", 385 engine->name, err); 386 goto fini; 387 } 388 } 389 390 fini: 391 hang_fini(&h); 392 return err; 393 } 394 395 static bool wait_for_idle(struct intel_engine_cs *engine) 396 { 397 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; 398 } 399 400 static int igt_reset_nop(void *arg) 401 { 402 struct intel_gt *gt = arg; 403 struct i915_gpu_error *global = >->i915->gpu_error; 404 struct intel_engine_cs *engine; 405 unsigned int reset_count, count; 406 enum intel_engine_id id; 407 IGT_TIMEOUT(end_time); 408 int err = 0; 409 410 /* Check that we can reset during non-user portions of requests */ 411 412 reset_count = i915_reset_count(global); 413 count = 0; 414 do { 415 for_each_engine(engine, gt, id) { 416 struct intel_context *ce; 417 int i; 418 419 ce = intel_context_create(engine); 420 if (IS_ERR(ce)) { 421 err = PTR_ERR(ce); 422 break; 423 } 424 425 for (i = 0; i < 16; i++) { 426 struct i915_request *rq; 427 428 rq = intel_context_create_request(ce); 429 if (IS_ERR(rq)) { 430 err = PTR_ERR(rq); 431 break; 432 } 433 434 i915_request_add(rq); 435 } 436 437 intel_context_put(ce); 438 } 439 440 igt_global_reset_lock(gt); 441 intel_gt_reset(gt, ALL_ENGINES, NULL); 442 igt_global_reset_unlock(gt); 443 444 if (intel_gt_is_wedged(gt)) { 445 err = -EIO; 446 break; 447 } 448 449 if (i915_reset_count(global) != reset_count + ++count) { 450 pr_err("Full GPU reset not recorded!\n"); 451 err = -EINVAL; 452 break; 453 } 454 455 err = igt_flush_test(gt->i915); 456 if (err) 457 break; 458 } while (time_before(jiffies, end_time)); 459 pr_info("%s: %d resets\n", __func__, count); 460 461 if (igt_flush_test(gt->i915)) 462 err = -EIO; 463 return err; 464 } 465 466 static int igt_reset_nop_engine(void *arg) 467 { 468 struct intel_gt *gt = arg; 469 struct i915_gpu_error *global = >->i915->gpu_error; 470 struct intel_engine_cs *engine; 471 enum intel_engine_id id; 472 473 /* Check that we can engine-reset during non-user portions */ 474 475 if (!intel_has_reset_engine(gt)) 476 return 0; 477 478 for_each_engine(engine, gt, id) { 479 unsigned int reset_count, reset_engine_count, count; 480 struct intel_context *ce; 481 unsigned long heartbeat; 482 IGT_TIMEOUT(end_time); 483 int err; 484 485 ce = intel_context_create(engine); 486 if (IS_ERR(ce)) 487 return PTR_ERR(ce); 488 489 reset_count = i915_reset_count(global); 490 reset_engine_count = i915_reset_engine_count(global, engine); 491 count = 0; 492 493 engine_heartbeat_disable(engine, &heartbeat); 494 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 495 do { 496 int i; 497 498 if (!wait_for_idle(engine)) { 499 pr_err("%s failed to idle before reset\n", 500 engine->name); 501 err = -EIO; 502 break; 503 } 504 505 for (i = 0; i < 16; i++) { 506 struct i915_request *rq; 507 508 rq = intel_context_create_request(ce); 509 if (IS_ERR(rq)) { 510 err = PTR_ERR(rq); 511 break; 512 } 513 514 i915_request_add(rq); 515 } 516 err = intel_engine_reset(engine, NULL); 517 if (err) { 518 pr_err("i915_reset_engine failed\n"); 519 break; 520 } 521 522 if (i915_reset_count(global) != reset_count) { 523 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 524 err = -EINVAL; 525 break; 526 } 527 528 if (i915_reset_engine_count(global, engine) != 529 reset_engine_count + ++count) { 530 pr_err("%s engine reset not recorded!\n", 531 engine->name); 532 err = -EINVAL; 533 break; 534 } 535 } while (time_before(jiffies, end_time)); 536 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 537 engine_heartbeat_enable(engine, heartbeat); 538 539 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 540 541 intel_context_put(ce); 542 if (igt_flush_test(gt->i915)) 543 err = -EIO; 544 if (err) 545 return err; 546 } 547 548 return 0; 549 } 550 551 static int __igt_reset_engine(struct intel_gt *gt, bool active) 552 { 553 struct i915_gpu_error *global = >->i915->gpu_error; 554 struct intel_engine_cs *engine; 555 enum intel_engine_id id; 556 struct hang h; 557 int err = 0; 558 559 /* Check that we can issue an engine reset on an idle engine (no-op) */ 560 561 if (!intel_has_reset_engine(gt)) 562 return 0; 563 564 if (active) { 565 err = hang_init(&h, gt); 566 if (err) 567 return err; 568 } 569 570 for_each_engine(engine, gt, id) { 571 unsigned int reset_count, reset_engine_count; 572 unsigned long heartbeat; 573 IGT_TIMEOUT(end_time); 574 575 if (active && !intel_engine_can_store_dword(engine)) 576 continue; 577 578 if (!wait_for_idle(engine)) { 579 pr_err("%s failed to idle before reset\n", 580 engine->name); 581 err = -EIO; 582 break; 583 } 584 585 reset_count = i915_reset_count(global); 586 reset_engine_count = i915_reset_engine_count(global, engine); 587 588 engine_heartbeat_disable(engine, &heartbeat); 589 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 590 do { 591 if (active) { 592 struct i915_request *rq; 593 594 rq = hang_create_request(&h, engine); 595 if (IS_ERR(rq)) { 596 err = PTR_ERR(rq); 597 break; 598 } 599 600 i915_request_get(rq); 601 i915_request_add(rq); 602 603 if (!wait_until_running(&h, rq)) { 604 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 605 606 pr_err("%s: Failed to start request %llx, at %x\n", 607 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 608 intel_engine_dump(engine, &p, 609 "%s\n", engine->name); 610 611 i915_request_put(rq); 612 err = -EIO; 613 break; 614 } 615 616 i915_request_put(rq); 617 } 618 619 err = intel_engine_reset(engine, NULL); 620 if (err) { 621 pr_err("i915_reset_engine failed\n"); 622 break; 623 } 624 625 if (i915_reset_count(global) != reset_count) { 626 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 627 err = -EINVAL; 628 break; 629 } 630 631 if (i915_reset_engine_count(global, engine) != 632 ++reset_engine_count) { 633 pr_err("%s engine reset not recorded!\n", 634 engine->name); 635 err = -EINVAL; 636 break; 637 } 638 } while (time_before(jiffies, end_time)); 639 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 640 engine_heartbeat_enable(engine, heartbeat); 641 642 if (err) 643 break; 644 645 err = igt_flush_test(gt->i915); 646 if (err) 647 break; 648 } 649 650 if (intel_gt_is_wedged(gt)) 651 err = -EIO; 652 653 if (active) 654 hang_fini(&h); 655 656 return err; 657 } 658 659 static int igt_reset_idle_engine(void *arg) 660 { 661 return __igt_reset_engine(arg, false); 662 } 663 664 static int igt_reset_active_engine(void *arg) 665 { 666 return __igt_reset_engine(arg, true); 667 } 668 669 struct active_engine { 670 struct task_struct *task; 671 struct intel_engine_cs *engine; 672 unsigned long resets; 673 unsigned int flags; 674 }; 675 676 #define TEST_ACTIVE BIT(0) 677 #define TEST_OTHERS BIT(1) 678 #define TEST_SELF BIT(2) 679 #define TEST_PRIORITY BIT(3) 680 681 static int active_request_put(struct i915_request *rq) 682 { 683 int err = 0; 684 685 if (!rq) 686 return 0; 687 688 if (i915_request_wait(rq, 0, 5 * HZ) < 0) { 689 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 690 rq->engine->name, 691 rq->fence.context, 692 rq->fence.seqno); 693 GEM_TRACE_DUMP(); 694 695 intel_gt_set_wedged(rq->engine->gt); 696 err = -EIO; 697 } 698 699 i915_request_put(rq); 700 701 return err; 702 } 703 704 static int active_engine(void *data) 705 { 706 I915_RND_STATE(prng); 707 struct active_engine *arg = data; 708 struct intel_engine_cs *engine = arg->engine; 709 struct i915_request *rq[8] = {}; 710 struct intel_context *ce[ARRAY_SIZE(rq)]; 711 unsigned long count; 712 int err = 0; 713 714 for (count = 0; count < ARRAY_SIZE(ce); count++) { 715 ce[count] = intel_context_create(engine); 716 if (IS_ERR(ce[count])) { 717 err = PTR_ERR(ce[count]); 718 while (--count) 719 intel_context_put(ce[count]); 720 return err; 721 } 722 } 723 724 count = 0; 725 while (!kthread_should_stop()) { 726 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); 727 struct i915_request *old = rq[idx]; 728 struct i915_request *new; 729 730 new = intel_context_create_request(ce[idx]); 731 if (IS_ERR(new)) { 732 err = PTR_ERR(new); 733 break; 734 } 735 736 rq[idx] = i915_request_get(new); 737 i915_request_add(new); 738 739 if (engine->schedule && arg->flags & TEST_PRIORITY) { 740 struct i915_sched_attr attr = { 741 .priority = 742 i915_prandom_u32_max_state(512, &prng), 743 }; 744 engine->schedule(rq[idx], &attr); 745 } 746 747 err = active_request_put(old); 748 if (err) 749 break; 750 751 cond_resched(); 752 } 753 754 for (count = 0; count < ARRAY_SIZE(rq); count++) { 755 int err__ = active_request_put(rq[count]); 756 757 /* Keep the first error */ 758 if (!err) 759 err = err__; 760 761 intel_context_put(ce[count]); 762 } 763 764 return err; 765 } 766 767 static int __igt_reset_engines(struct intel_gt *gt, 768 const char *test_name, 769 unsigned int flags) 770 { 771 struct i915_gpu_error *global = >->i915->gpu_error; 772 struct intel_engine_cs *engine, *other; 773 enum intel_engine_id id, tmp; 774 struct hang h; 775 int err = 0; 776 777 /* Check that issuing a reset on one engine does not interfere 778 * with any other engine. 779 */ 780 781 if (!intel_has_reset_engine(gt)) 782 return 0; 783 784 if (flags & TEST_ACTIVE) { 785 err = hang_init(&h, gt); 786 if (err) 787 return err; 788 789 if (flags & TEST_PRIORITY) 790 h.ctx->sched.priority = 1024; 791 } 792 793 for_each_engine(engine, gt, id) { 794 struct active_engine threads[I915_NUM_ENGINES] = {}; 795 unsigned long device = i915_reset_count(global); 796 unsigned long count = 0, reported; 797 unsigned long heartbeat; 798 IGT_TIMEOUT(end_time); 799 800 if (flags & TEST_ACTIVE && 801 !intel_engine_can_store_dword(engine)) 802 continue; 803 804 if (!wait_for_idle(engine)) { 805 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", 806 engine->name, test_name); 807 err = -EIO; 808 break; 809 } 810 811 memset(threads, 0, sizeof(threads)); 812 for_each_engine(other, gt, tmp) { 813 struct task_struct *tsk; 814 815 threads[tmp].resets = 816 i915_reset_engine_count(global, other); 817 818 if (!(flags & TEST_OTHERS)) 819 continue; 820 821 if (other == engine && !(flags & TEST_SELF)) 822 continue; 823 824 threads[tmp].engine = other; 825 threads[tmp].flags = flags; 826 827 tsk = kthread_run(active_engine, &threads[tmp], 828 "igt/%s", other->name); 829 if (IS_ERR(tsk)) { 830 err = PTR_ERR(tsk); 831 goto unwind; 832 } 833 834 threads[tmp].task = tsk; 835 get_task_struct(tsk); 836 } 837 838 yield(); /* start all threads before we begin */ 839 840 engine_heartbeat_disable(engine, &heartbeat); 841 set_bit(I915_RESET_ENGINE + id, >->reset.flags); 842 do { 843 struct i915_request *rq = NULL; 844 845 if (flags & TEST_ACTIVE) { 846 rq = hang_create_request(&h, engine); 847 if (IS_ERR(rq)) { 848 err = PTR_ERR(rq); 849 break; 850 } 851 852 i915_request_get(rq); 853 i915_request_add(rq); 854 855 if (!wait_until_running(&h, rq)) { 856 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 857 858 pr_err("%s: Failed to start request %llx, at %x\n", 859 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 860 intel_engine_dump(engine, &p, 861 "%s\n", engine->name); 862 863 i915_request_put(rq); 864 err = -EIO; 865 break; 866 } 867 } 868 869 err = intel_engine_reset(engine, NULL); 870 if (err) { 871 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", 872 engine->name, test_name, err); 873 break; 874 } 875 876 count++; 877 878 if (rq) { 879 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 880 struct drm_printer p = 881 drm_info_printer(gt->i915->drm.dev); 882 883 pr_err("i915_reset_engine(%s:%s):" 884 " failed to complete request after reset\n", 885 engine->name, test_name); 886 intel_engine_dump(engine, &p, 887 "%s\n", engine->name); 888 i915_request_put(rq); 889 890 GEM_TRACE_DUMP(); 891 intel_gt_set_wedged(gt); 892 err = -EIO; 893 break; 894 } 895 896 i915_request_put(rq); 897 } 898 899 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { 900 struct drm_printer p = 901 drm_info_printer(gt->i915->drm.dev); 902 903 pr_err("i915_reset_engine(%s:%s):" 904 " failed to idle after reset\n", 905 engine->name, test_name); 906 intel_engine_dump(engine, &p, 907 "%s\n", engine->name); 908 909 err = -EIO; 910 break; 911 } 912 } while (time_before(jiffies, end_time)); 913 clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 914 engine_heartbeat_enable(engine, heartbeat); 915 916 pr_info("i915_reset_engine(%s:%s): %lu resets\n", 917 engine->name, test_name, count); 918 919 reported = i915_reset_engine_count(global, engine); 920 reported -= threads[engine->id].resets; 921 if (reported != count) { 922 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", 923 engine->name, test_name, count, reported); 924 if (!err) 925 err = -EINVAL; 926 } 927 928 unwind: 929 for_each_engine(other, gt, tmp) { 930 int ret; 931 932 if (!threads[tmp].task) 933 continue; 934 935 ret = kthread_stop(threads[tmp].task); 936 if (ret) { 937 pr_err("kthread for other engine %s failed, err=%d\n", 938 other->name, ret); 939 if (!err) 940 err = ret; 941 } 942 put_task_struct(threads[tmp].task); 943 944 if (other->uabi_class != engine->uabi_class && 945 threads[tmp].resets != 946 i915_reset_engine_count(global, other)) { 947 pr_err("Innocent engine %s was reset (count=%ld)\n", 948 other->name, 949 i915_reset_engine_count(global, other) - 950 threads[tmp].resets); 951 if (!err) 952 err = -EINVAL; 953 } 954 } 955 956 if (device != i915_reset_count(global)) { 957 pr_err("Global reset (count=%ld)!\n", 958 i915_reset_count(global) - device); 959 if (!err) 960 err = -EINVAL; 961 } 962 963 if (err) 964 break; 965 966 err = igt_flush_test(gt->i915); 967 if (err) 968 break; 969 } 970 971 if (intel_gt_is_wedged(gt)) 972 err = -EIO; 973 974 if (flags & TEST_ACTIVE) 975 hang_fini(&h); 976 977 return err; 978 } 979 980 static int igt_reset_engines(void *arg) 981 { 982 static const struct { 983 const char *name; 984 unsigned int flags; 985 } phases[] = { 986 { "idle", 0 }, 987 { "active", TEST_ACTIVE }, 988 { "others-idle", TEST_OTHERS }, 989 { "others-active", TEST_OTHERS | TEST_ACTIVE }, 990 { 991 "others-priority", 992 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY 993 }, 994 { 995 "self-priority", 996 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, 997 }, 998 { } 999 }; 1000 struct intel_gt *gt = arg; 1001 typeof(*phases) *p; 1002 int err; 1003 1004 for (p = phases; p->name; p++) { 1005 if (p->flags & TEST_PRIORITY) { 1006 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) 1007 continue; 1008 } 1009 1010 err = __igt_reset_engines(arg, p->name, p->flags); 1011 if (err) 1012 return err; 1013 } 1014 1015 return 0; 1016 } 1017 1018 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask) 1019 { 1020 u32 count = i915_reset_count(>->i915->gpu_error); 1021 1022 intel_gt_reset(gt, mask, NULL); 1023 1024 return count; 1025 } 1026 1027 static int igt_reset_wait(void *arg) 1028 { 1029 struct intel_gt *gt = arg; 1030 struct i915_gpu_error *global = >->i915->gpu_error; 1031 struct intel_engine_cs *engine = gt->engine[RCS0]; 1032 struct i915_request *rq; 1033 unsigned int reset_count; 1034 struct hang h; 1035 long timeout; 1036 int err; 1037 1038 if (!engine || !intel_engine_can_store_dword(engine)) 1039 return 0; 1040 1041 /* Check that we detect a stuck waiter and issue a reset */ 1042 1043 igt_global_reset_lock(gt); 1044 1045 err = hang_init(&h, gt); 1046 if (err) 1047 goto unlock; 1048 1049 rq = hang_create_request(&h, engine); 1050 if (IS_ERR(rq)) { 1051 err = PTR_ERR(rq); 1052 goto fini; 1053 } 1054 1055 i915_request_get(rq); 1056 i915_request_add(rq); 1057 1058 if (!wait_until_running(&h, rq)) { 1059 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1060 1061 pr_err("%s: Failed to start request %llx, at %x\n", 1062 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1063 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1064 1065 intel_gt_set_wedged(gt); 1066 1067 err = -EIO; 1068 goto out_rq; 1069 } 1070 1071 reset_count = fake_hangcheck(gt, ALL_ENGINES); 1072 1073 timeout = i915_request_wait(rq, 0, 10); 1074 if (timeout < 0) { 1075 pr_err("i915_request_wait failed on a stuck request: err=%ld\n", 1076 timeout); 1077 err = timeout; 1078 goto out_rq; 1079 } 1080 1081 if (i915_reset_count(global) == reset_count) { 1082 pr_err("No GPU reset recorded!\n"); 1083 err = -EINVAL; 1084 goto out_rq; 1085 } 1086 1087 out_rq: 1088 i915_request_put(rq); 1089 fini: 1090 hang_fini(&h); 1091 unlock: 1092 igt_global_reset_unlock(gt); 1093 1094 if (intel_gt_is_wedged(gt)) 1095 return -EIO; 1096 1097 return err; 1098 } 1099 1100 struct evict_vma { 1101 struct completion completion; 1102 struct i915_vma *vma; 1103 }; 1104 1105 static int evict_vma(void *data) 1106 { 1107 struct evict_vma *arg = data; 1108 struct i915_address_space *vm = arg->vma->vm; 1109 struct drm_mm_node evict = arg->vma->node; 1110 int err; 1111 1112 complete(&arg->completion); 1113 1114 mutex_lock(&vm->mutex); 1115 err = i915_gem_evict_for_node(vm, &evict, 0); 1116 mutex_unlock(&vm->mutex); 1117 1118 return err; 1119 } 1120 1121 static int evict_fence(void *data) 1122 { 1123 struct evict_vma *arg = data; 1124 int err; 1125 1126 complete(&arg->completion); 1127 1128 /* Mark the fence register as dirty to force the mmio update. */ 1129 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1130 if (err) { 1131 pr_err("Invalid Y-tiling settings; err:%d\n", err); 1132 return err; 1133 } 1134 1135 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); 1136 if (err) { 1137 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); 1138 return err; 1139 } 1140 1141 err = i915_vma_pin_fence(arg->vma); 1142 i915_vma_unpin(arg->vma); 1143 if (err) { 1144 pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1145 return err; 1146 } 1147 1148 i915_vma_unpin_fence(arg->vma); 1149 1150 return 0; 1151 } 1152 1153 static int __igt_reset_evict_vma(struct intel_gt *gt, 1154 struct i915_address_space *vm, 1155 int (*fn)(void *), 1156 unsigned int flags) 1157 { 1158 struct intel_engine_cs *engine = gt->engine[RCS0]; 1159 struct drm_i915_gem_object *obj; 1160 struct task_struct *tsk = NULL; 1161 struct i915_request *rq; 1162 struct evict_vma arg; 1163 struct hang h; 1164 unsigned int pin_flags; 1165 int err; 1166 1167 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) 1168 return 0; 1169 1170 if (!engine || !intel_engine_can_store_dword(engine)) 1171 return 0; 1172 1173 /* Check that we can recover an unbind stuck on a hanging request */ 1174 1175 err = hang_init(&h, gt); 1176 if (err) 1177 return err; 1178 1179 obj = i915_gem_object_create_internal(gt->i915, SZ_1M); 1180 if (IS_ERR(obj)) { 1181 err = PTR_ERR(obj); 1182 goto fini; 1183 } 1184 1185 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1186 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1187 if (err) { 1188 pr_err("Invalid X-tiling settings; err:%d\n", err); 1189 goto out_obj; 1190 } 1191 } 1192 1193 arg.vma = i915_vma_instance(obj, vm, NULL); 1194 if (IS_ERR(arg.vma)) { 1195 err = PTR_ERR(arg.vma); 1196 goto out_obj; 1197 } 1198 1199 rq = hang_create_request(&h, engine); 1200 if (IS_ERR(rq)) { 1201 err = PTR_ERR(rq); 1202 goto out_obj; 1203 } 1204 1205 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER; 1206 1207 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1208 pin_flags |= PIN_MAPPABLE; 1209 1210 err = i915_vma_pin(arg.vma, 0, 0, pin_flags); 1211 if (err) { 1212 i915_request_add(rq); 1213 goto out_obj; 1214 } 1215 1216 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1217 err = i915_vma_pin_fence(arg.vma); 1218 if (err) { 1219 pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1220 i915_vma_unpin(arg.vma); 1221 i915_request_add(rq); 1222 goto out_obj; 1223 } 1224 } 1225 1226 i915_vma_lock(arg.vma); 1227 err = i915_request_await_object(rq, arg.vma->obj, 1228 flags & EXEC_OBJECT_WRITE); 1229 if (err == 0) 1230 err = i915_vma_move_to_active(arg.vma, rq, flags); 1231 i915_vma_unlock(arg.vma); 1232 1233 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1234 i915_vma_unpin_fence(arg.vma); 1235 i915_vma_unpin(arg.vma); 1236 1237 i915_request_get(rq); 1238 i915_request_add(rq); 1239 if (err) 1240 goto out_rq; 1241 1242 if (!wait_until_running(&h, rq)) { 1243 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1244 1245 pr_err("%s: Failed to start request %llx, at %x\n", 1246 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1247 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1248 1249 intel_gt_set_wedged(gt); 1250 goto out_reset; 1251 } 1252 1253 init_completion(&arg.completion); 1254 1255 tsk = kthread_run(fn, &arg, "igt/evict_vma"); 1256 if (IS_ERR(tsk)) { 1257 err = PTR_ERR(tsk); 1258 tsk = NULL; 1259 goto out_reset; 1260 } 1261 get_task_struct(tsk); 1262 1263 wait_for_completion(&arg.completion); 1264 1265 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1266 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1267 1268 pr_err("igt/evict_vma kthread did not wait\n"); 1269 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1270 1271 intel_gt_set_wedged(gt); 1272 goto out_reset; 1273 } 1274 1275 out_reset: 1276 igt_global_reset_lock(gt); 1277 fake_hangcheck(gt, rq->engine->mask); 1278 igt_global_reset_unlock(gt); 1279 1280 if (tsk) { 1281 struct intel_wedge_me w; 1282 1283 /* The reset, even indirectly, should take less than 10ms. */ 1284 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 1285 err = kthread_stop(tsk); 1286 1287 put_task_struct(tsk); 1288 } 1289 1290 out_rq: 1291 i915_request_put(rq); 1292 out_obj: 1293 i915_gem_object_put(obj); 1294 fini: 1295 hang_fini(&h); 1296 if (intel_gt_is_wedged(gt)) 1297 return -EIO; 1298 1299 return err; 1300 } 1301 1302 static int igt_reset_evict_ggtt(void *arg) 1303 { 1304 struct intel_gt *gt = arg; 1305 1306 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1307 evict_vma, EXEC_OBJECT_WRITE); 1308 } 1309 1310 static int igt_reset_evict_ppgtt(void *arg) 1311 { 1312 struct intel_gt *gt = arg; 1313 struct i915_ppgtt *ppgtt; 1314 int err; 1315 1316 /* aliasing == global gtt locking, covered above */ 1317 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) 1318 return 0; 1319 1320 ppgtt = i915_ppgtt_create(gt); 1321 if (IS_ERR(ppgtt)) 1322 return PTR_ERR(ppgtt); 1323 1324 err = __igt_reset_evict_vma(gt, &ppgtt->vm, 1325 evict_vma, EXEC_OBJECT_WRITE); 1326 i915_vm_put(&ppgtt->vm); 1327 1328 return err; 1329 } 1330 1331 static int igt_reset_evict_fence(void *arg) 1332 { 1333 struct intel_gt *gt = arg; 1334 1335 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1336 evict_fence, EXEC_OBJECT_NEEDS_FENCE); 1337 } 1338 1339 static int wait_for_others(struct intel_gt *gt, 1340 struct intel_engine_cs *exclude) 1341 { 1342 struct intel_engine_cs *engine; 1343 enum intel_engine_id id; 1344 1345 for_each_engine(engine, gt, id) { 1346 if (engine == exclude) 1347 continue; 1348 1349 if (!wait_for_idle(engine)) 1350 return -EIO; 1351 } 1352 1353 return 0; 1354 } 1355 1356 static int igt_reset_queue(void *arg) 1357 { 1358 struct intel_gt *gt = arg; 1359 struct i915_gpu_error *global = >->i915->gpu_error; 1360 struct intel_engine_cs *engine; 1361 enum intel_engine_id id; 1362 struct hang h; 1363 int err; 1364 1365 /* Check that we replay pending requests following a hang */ 1366 1367 igt_global_reset_lock(gt); 1368 1369 err = hang_init(&h, gt); 1370 if (err) 1371 goto unlock; 1372 1373 for_each_engine(engine, gt, id) { 1374 struct i915_request *prev; 1375 IGT_TIMEOUT(end_time); 1376 unsigned int count; 1377 1378 if (!intel_engine_can_store_dword(engine)) 1379 continue; 1380 1381 prev = hang_create_request(&h, engine); 1382 if (IS_ERR(prev)) { 1383 err = PTR_ERR(prev); 1384 goto fini; 1385 } 1386 1387 i915_request_get(prev); 1388 i915_request_add(prev); 1389 1390 count = 0; 1391 do { 1392 struct i915_request *rq; 1393 unsigned int reset_count; 1394 1395 rq = hang_create_request(&h, engine); 1396 if (IS_ERR(rq)) { 1397 err = PTR_ERR(rq); 1398 goto fini; 1399 } 1400 1401 i915_request_get(rq); 1402 i915_request_add(rq); 1403 1404 /* 1405 * XXX We don't handle resetting the kernel context 1406 * very well. If we trigger a device reset twice in 1407 * quick succession while the kernel context is 1408 * executing, we may end up skipping the breadcrumb. 1409 * This is really only a problem for the selftest as 1410 * normally there is a large interlude between resets 1411 * (hangcheck), or we focus on resetting just one 1412 * engine and so avoid repeatedly resetting innocents. 1413 */ 1414 err = wait_for_others(gt, engine); 1415 if (err) { 1416 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 1417 __func__, engine->name); 1418 i915_request_put(rq); 1419 i915_request_put(prev); 1420 1421 GEM_TRACE_DUMP(); 1422 intel_gt_set_wedged(gt); 1423 goto fini; 1424 } 1425 1426 if (!wait_until_running(&h, prev)) { 1427 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1428 1429 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1430 __func__, engine->name, 1431 prev->fence.seqno, hws_seqno(&h, prev)); 1432 intel_engine_dump(engine, &p, 1433 "%s\n", engine->name); 1434 1435 i915_request_put(rq); 1436 i915_request_put(prev); 1437 1438 intel_gt_set_wedged(gt); 1439 1440 err = -EIO; 1441 goto fini; 1442 } 1443 1444 reset_count = fake_hangcheck(gt, BIT(id)); 1445 1446 if (prev->fence.error != -EIO) { 1447 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1448 prev->fence.error); 1449 i915_request_put(rq); 1450 i915_request_put(prev); 1451 err = -EINVAL; 1452 goto fini; 1453 } 1454 1455 if (rq->fence.error) { 1456 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1457 rq->fence.error); 1458 i915_request_put(rq); 1459 i915_request_put(prev); 1460 err = -EINVAL; 1461 goto fini; 1462 } 1463 1464 if (i915_reset_count(global) == reset_count) { 1465 pr_err("No GPU reset recorded!\n"); 1466 i915_request_put(rq); 1467 i915_request_put(prev); 1468 err = -EINVAL; 1469 goto fini; 1470 } 1471 1472 i915_request_put(prev); 1473 prev = rq; 1474 count++; 1475 } while (time_before(jiffies, end_time)); 1476 pr_info("%s: Completed %d resets\n", engine->name, count); 1477 1478 *h.batch = MI_BATCH_BUFFER_END; 1479 intel_gt_chipset_flush(engine->gt); 1480 1481 i915_request_put(prev); 1482 1483 err = igt_flush_test(gt->i915); 1484 if (err) 1485 break; 1486 } 1487 1488 fini: 1489 hang_fini(&h); 1490 unlock: 1491 igt_global_reset_unlock(gt); 1492 1493 if (intel_gt_is_wedged(gt)) 1494 return -EIO; 1495 1496 return err; 1497 } 1498 1499 static int igt_handle_error(void *arg) 1500 { 1501 struct intel_gt *gt = arg; 1502 struct i915_gpu_error *global = >->i915->gpu_error; 1503 struct intel_engine_cs *engine = gt->engine[RCS0]; 1504 struct hang h; 1505 struct i915_request *rq; 1506 struct i915_gpu_coredump *error; 1507 int err; 1508 1509 /* Check that we can issue a global GPU and engine reset */ 1510 1511 if (!intel_has_reset_engine(gt)) 1512 return 0; 1513 1514 if (!engine || !intel_engine_can_store_dword(engine)) 1515 return 0; 1516 1517 err = hang_init(&h, gt); 1518 if (err) 1519 return err; 1520 1521 rq = hang_create_request(&h, engine); 1522 if (IS_ERR(rq)) { 1523 err = PTR_ERR(rq); 1524 goto err_fini; 1525 } 1526 1527 i915_request_get(rq); 1528 i915_request_add(rq); 1529 1530 if (!wait_until_running(&h, rq)) { 1531 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1532 1533 pr_err("%s: Failed to start request %llx, at %x\n", 1534 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1535 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1536 1537 intel_gt_set_wedged(gt); 1538 1539 err = -EIO; 1540 goto err_request; 1541 } 1542 1543 /* Temporarily disable error capture */ 1544 error = xchg(&global->first_error, (void *)-1); 1545 1546 intel_gt_handle_error(gt, engine->mask, 0, NULL); 1547 1548 xchg(&global->first_error, error); 1549 1550 if (rq->fence.error != -EIO) { 1551 pr_err("Guilty request not identified!\n"); 1552 err = -EINVAL; 1553 goto err_request; 1554 } 1555 1556 err_request: 1557 i915_request_put(rq); 1558 err_fini: 1559 hang_fini(&h); 1560 return err; 1561 } 1562 1563 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, 1564 const struct igt_atomic_section *p, 1565 const char *mode) 1566 { 1567 struct tasklet_struct * const t = &engine->execlists.tasklet; 1568 int err; 1569 1570 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", 1571 engine->name, mode, p->name); 1572 1573 tasklet_disable(t); 1574 p->critical_section_begin(); 1575 1576 err = intel_engine_reset(engine, NULL); 1577 1578 p->critical_section_end(); 1579 tasklet_enable(t); 1580 1581 if (err) 1582 pr_err("i915_reset_engine(%s:%s) failed under %s\n", 1583 engine->name, mode, p->name); 1584 1585 return err; 1586 } 1587 1588 static int igt_atomic_reset_engine(struct intel_engine_cs *engine, 1589 const struct igt_atomic_section *p) 1590 { 1591 struct i915_request *rq; 1592 struct hang h; 1593 int err; 1594 1595 err = __igt_atomic_reset_engine(engine, p, "idle"); 1596 if (err) 1597 return err; 1598 1599 err = hang_init(&h, engine->gt); 1600 if (err) 1601 return err; 1602 1603 rq = hang_create_request(&h, engine); 1604 if (IS_ERR(rq)) { 1605 err = PTR_ERR(rq); 1606 goto out; 1607 } 1608 1609 i915_request_get(rq); 1610 i915_request_add(rq); 1611 1612 if (wait_until_running(&h, rq)) { 1613 err = __igt_atomic_reset_engine(engine, p, "active"); 1614 } else { 1615 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1616 __func__, engine->name, 1617 rq->fence.seqno, hws_seqno(&h, rq)); 1618 intel_gt_set_wedged(engine->gt); 1619 err = -EIO; 1620 } 1621 1622 if (err == 0) { 1623 struct intel_wedge_me w; 1624 1625 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) 1626 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); 1627 if (intel_gt_is_wedged(engine->gt)) 1628 err = -EIO; 1629 } 1630 1631 i915_request_put(rq); 1632 out: 1633 hang_fini(&h); 1634 return err; 1635 } 1636 1637 static int igt_reset_engines_atomic(void *arg) 1638 { 1639 struct intel_gt *gt = arg; 1640 const typeof(*igt_atomic_phases) *p; 1641 int err = 0; 1642 1643 /* Check that the engines resets are usable from atomic context */ 1644 1645 if (!intel_has_reset_engine(gt)) 1646 return 0; 1647 1648 if (USES_GUC_SUBMISSION(gt->i915)) 1649 return 0; 1650 1651 igt_global_reset_lock(gt); 1652 1653 /* Flush any requests before we get started and check basics */ 1654 if (!igt_force_reset(gt)) 1655 goto unlock; 1656 1657 for (p = igt_atomic_phases; p->name; p++) { 1658 struct intel_engine_cs *engine; 1659 enum intel_engine_id id; 1660 1661 for_each_engine(engine, gt, id) { 1662 err = igt_atomic_reset_engine(engine, p); 1663 if (err) 1664 goto out; 1665 } 1666 } 1667 1668 out: 1669 /* As we poke around the guts, do a full reset before continuing. */ 1670 igt_force_reset(gt); 1671 unlock: 1672 igt_global_reset_unlock(gt); 1673 1674 return err; 1675 } 1676 1677 int intel_hangcheck_live_selftests(struct drm_i915_private *i915) 1678 { 1679 static const struct i915_subtest tests[] = { 1680 SUBTEST(igt_hang_sanitycheck), 1681 SUBTEST(igt_reset_nop), 1682 SUBTEST(igt_reset_nop_engine), 1683 SUBTEST(igt_reset_idle_engine), 1684 SUBTEST(igt_reset_active_engine), 1685 SUBTEST(igt_reset_engines), 1686 SUBTEST(igt_reset_engines_atomic), 1687 SUBTEST(igt_reset_queue), 1688 SUBTEST(igt_reset_wait), 1689 SUBTEST(igt_reset_evict_ggtt), 1690 SUBTEST(igt_reset_evict_ppgtt), 1691 SUBTEST(igt_reset_evict_fence), 1692 SUBTEST(igt_handle_error), 1693 }; 1694 struct intel_gt *gt = &i915->gt; 1695 intel_wakeref_t wakeref; 1696 int err; 1697 1698 if (!intel_has_gpu_reset(gt)) 1699 return 0; 1700 1701 if (intel_gt_is_wedged(gt)) 1702 return -EIO; /* we're long past hope of a successful reset */ 1703 1704 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 1705 1706 err = intel_gt_live_subtests(tests, gt); 1707 1708 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 1709 1710 return err; 1711 } 1712