/*	$NetBSD: selftest_hangcheck.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $	*/

/*
 * Copyright 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
24 1.1 riastrad * 25 1.1 riastrad */ 26 1.1 riastrad 27 1.1 riastrad #include <sys/cdefs.h> 28 1.1 riastrad __KERNEL_RCSID(0, "$NetBSD: selftest_hangcheck.c,v 1.2 2021/12/18 23:45:30 riastradh Exp $"); 29 1.1 riastrad 30 1.1 riastrad #include <linux/kthread.h> 31 1.1 riastrad 32 1.1 riastrad #include "gem/i915_gem_context.h" 33 1.1 riastrad 34 1.1 riastrad #include "intel_gt.h" 35 1.1 riastrad #include "intel_engine_heartbeat.h" 36 1.1 riastrad #include "intel_engine_pm.h" 37 1.1 riastrad 38 1.1 riastrad #include "i915_selftest.h" 39 1.1 riastrad #include "selftests/i915_random.h" 40 1.1 riastrad #include "selftests/igt_flush_test.h" 41 1.1 riastrad #include "selftests/igt_reset.h" 42 1.1 riastrad #include "selftests/igt_atomic.h" 43 1.1 riastrad 44 1.1 riastrad #include "selftests/mock_drm.h" 45 1.1 riastrad 46 1.1 riastrad #include "gem/selftests/mock_context.h" 47 1.1 riastrad #include "gem/selftests/igt_gem_utils.h" 48 1.1 riastrad 49 1.1 riastrad #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ 50 1.1 riastrad 51 1.1 riastrad struct hang { 52 1.1 riastrad struct intel_gt *gt; 53 1.1 riastrad struct drm_i915_gem_object *hws; 54 1.1 riastrad struct drm_i915_gem_object *obj; 55 1.1 riastrad struct i915_gem_context *ctx; 56 1.1 riastrad u32 *seqno; 57 1.1 riastrad u32 *batch; 58 1.1 riastrad }; 59 1.1 riastrad 60 1.1 riastrad static int hang_init(struct hang *h, struct intel_gt *gt) 61 1.1 riastrad { 62 1.1 riastrad void *vaddr; 63 1.1 riastrad int err; 64 1.1 riastrad 65 1.1 riastrad memset(h, 0, sizeof(*h)); 66 1.1 riastrad h->gt = gt; 67 1.1 riastrad 68 1.1 riastrad h->ctx = kernel_context(gt->i915); 69 1.1 riastrad if (IS_ERR(h->ctx)) 70 1.1 riastrad return PTR_ERR(h->ctx); 71 1.1 riastrad 72 1.1 riastrad GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); 73 1.1 riastrad 74 1.1 riastrad h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 75 1.1 riastrad if (IS_ERR(h->hws)) { 76 1.1 riastrad err = PTR_ERR(h->hws); 77 1.1 
riastrad goto err_ctx; 78 1.1 riastrad } 79 1.1 riastrad 80 1.1 riastrad h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 81 1.1 riastrad if (IS_ERR(h->obj)) { 82 1.1 riastrad err = PTR_ERR(h->obj); 83 1.1 riastrad goto err_hws; 84 1.1 riastrad } 85 1.1 riastrad 86 1.1 riastrad i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); 87 1.1 riastrad vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB); 88 1.1 riastrad if (IS_ERR(vaddr)) { 89 1.1 riastrad err = PTR_ERR(vaddr); 90 1.1 riastrad goto err_obj; 91 1.1 riastrad } 92 1.1 riastrad h->seqno = memset(vaddr, 0xff, PAGE_SIZE); 93 1.1 riastrad 94 1.1 riastrad vaddr = i915_gem_object_pin_map(h->obj, 95 1.1 riastrad i915_coherent_map_type(gt->i915)); 96 1.1 riastrad if (IS_ERR(vaddr)) { 97 1.1 riastrad err = PTR_ERR(vaddr); 98 1.1 riastrad goto err_unpin_hws; 99 1.1 riastrad } 100 1.1 riastrad h->batch = vaddr; 101 1.1 riastrad 102 1.1 riastrad return 0; 103 1.1 riastrad 104 1.1 riastrad err_unpin_hws: 105 1.1 riastrad i915_gem_object_unpin_map(h->hws); 106 1.1 riastrad err_obj: 107 1.1 riastrad i915_gem_object_put(h->obj); 108 1.1 riastrad err_hws: 109 1.1 riastrad i915_gem_object_put(h->hws); 110 1.1 riastrad err_ctx: 111 1.1 riastrad kernel_context_close(h->ctx); 112 1.1 riastrad return err; 113 1.1 riastrad } 114 1.1 riastrad 115 1.1 riastrad static u64 hws_address(const struct i915_vma *hws, 116 1.1 riastrad const struct i915_request *rq) 117 1.1 riastrad { 118 1.1 riastrad return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); 119 1.1 riastrad } 120 1.1 riastrad 121 1.1 riastrad static int move_to_active(struct i915_vma *vma, 122 1.1 riastrad struct i915_request *rq, 123 1.1 riastrad unsigned int flags) 124 1.1 riastrad { 125 1.1 riastrad int err; 126 1.1 riastrad 127 1.1 riastrad i915_vma_lock(vma); 128 1.1 riastrad err = i915_request_await_object(rq, vma->obj, 129 1.1 riastrad flags & EXEC_OBJECT_WRITE); 130 1.1 riastrad if (err == 0) 131 1.1 riastrad err = 
i915_vma_move_to_active(vma, rq, flags); 132 1.1 riastrad i915_vma_unlock(vma); 133 1.1 riastrad 134 1.1 riastrad return err; 135 1.1 riastrad } 136 1.1 riastrad 137 1.1 riastrad static struct i915_request * 138 1.1 riastrad hang_create_request(struct hang *h, struct intel_engine_cs *engine) 139 1.1 riastrad { 140 1.1 riastrad struct intel_gt *gt = h->gt; 141 1.1 riastrad struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx); 142 1.1 riastrad struct drm_i915_gem_object *obj; 143 1.1 riastrad struct i915_request *rq = NULL; 144 1.1 riastrad struct i915_vma *hws, *vma; 145 1.1 riastrad unsigned int flags; 146 1.1 riastrad void *vaddr; 147 1.1 riastrad u32 *batch; 148 1.1 riastrad int err; 149 1.1 riastrad 150 1.1 riastrad obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 151 1.1 riastrad if (IS_ERR(obj)) { 152 1.1 riastrad i915_vm_put(vm); 153 1.1 riastrad return ERR_CAST(obj); 154 1.1 riastrad } 155 1.1 riastrad 156 1.1 riastrad vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915)); 157 1.1 riastrad if (IS_ERR(vaddr)) { 158 1.1 riastrad i915_gem_object_put(obj); 159 1.1 riastrad i915_vm_put(vm); 160 1.1 riastrad return ERR_CAST(vaddr); 161 1.1 riastrad } 162 1.1 riastrad 163 1.1 riastrad i915_gem_object_unpin_map(h->obj); 164 1.1 riastrad i915_gem_object_put(h->obj); 165 1.1 riastrad 166 1.1 riastrad h->obj = obj; 167 1.1 riastrad h->batch = vaddr; 168 1.1 riastrad 169 1.1 riastrad vma = i915_vma_instance(h->obj, vm, NULL); 170 1.1 riastrad if (IS_ERR(vma)) { 171 1.1 riastrad i915_vm_put(vm); 172 1.1 riastrad return ERR_CAST(vma); 173 1.1 riastrad } 174 1.1 riastrad 175 1.1 riastrad hws = i915_vma_instance(h->hws, vm, NULL); 176 1.1 riastrad if (IS_ERR(hws)) { 177 1.1 riastrad i915_vm_put(vm); 178 1.1 riastrad return ERR_CAST(hws); 179 1.1 riastrad } 180 1.1 riastrad 181 1.1 riastrad err = i915_vma_pin(vma, 0, 0, PIN_USER); 182 1.1 riastrad if (err) { 183 1.1 riastrad i915_vm_put(vm); 184 1.1 riastrad return ERR_PTR(err); 185 
1.1 riastrad } 186 1.1 riastrad 187 1.1 riastrad err = i915_vma_pin(hws, 0, 0, PIN_USER); 188 1.1 riastrad if (err) 189 1.1 riastrad goto unpin_vma; 190 1.1 riastrad 191 1.1 riastrad rq = igt_request_alloc(h->ctx, engine); 192 1.1 riastrad if (IS_ERR(rq)) { 193 1.1 riastrad err = PTR_ERR(rq); 194 1.1 riastrad goto unpin_hws; 195 1.1 riastrad } 196 1.1 riastrad 197 1.1 riastrad err = move_to_active(vma, rq, 0); 198 1.1 riastrad if (err) 199 1.1 riastrad goto cancel_rq; 200 1.1 riastrad 201 1.1 riastrad err = move_to_active(hws, rq, 0); 202 1.1 riastrad if (err) 203 1.1 riastrad goto cancel_rq; 204 1.1 riastrad 205 1.1 riastrad batch = h->batch; 206 1.1 riastrad if (INTEL_GEN(gt->i915) >= 8) { 207 1.1 riastrad *batch++ = MI_STORE_DWORD_IMM_GEN4; 208 1.1 riastrad *batch++ = lower_32_bits(hws_address(hws, rq)); 209 1.1 riastrad *batch++ = upper_32_bits(hws_address(hws, rq)); 210 1.1 riastrad *batch++ = rq->fence.seqno; 211 1.1 riastrad *batch++ = MI_ARB_CHECK; 212 1.1 riastrad 213 1.1 riastrad memset(batch, 0, 1024); 214 1.1 riastrad batch += 1024 / sizeof(*batch); 215 1.1 riastrad 216 1.1 riastrad *batch++ = MI_ARB_CHECK; 217 1.1 riastrad *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 218 1.1 riastrad *batch++ = lower_32_bits(vma->node.start); 219 1.1 riastrad *batch++ = upper_32_bits(vma->node.start); 220 1.1 riastrad } else if (INTEL_GEN(gt->i915) >= 6) { 221 1.1 riastrad *batch++ = MI_STORE_DWORD_IMM_GEN4; 222 1.1 riastrad *batch++ = 0; 223 1.1 riastrad *batch++ = lower_32_bits(hws_address(hws, rq)); 224 1.1 riastrad *batch++ = rq->fence.seqno; 225 1.1 riastrad *batch++ = MI_ARB_CHECK; 226 1.1 riastrad 227 1.1 riastrad memset(batch, 0, 1024); 228 1.1 riastrad batch += 1024 / sizeof(*batch); 229 1.1 riastrad 230 1.1 riastrad *batch++ = MI_ARB_CHECK; 231 1.1 riastrad *batch++ = MI_BATCH_BUFFER_START | 1 << 8; 232 1.1 riastrad *batch++ = lower_32_bits(vma->node.start); 233 1.1 riastrad } else if (INTEL_GEN(gt->i915) >= 4) { 234 1.1 riastrad *batch++ = 
MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 235 1.1 riastrad *batch++ = 0; 236 1.1 riastrad *batch++ = lower_32_bits(hws_address(hws, rq)); 237 1.1 riastrad *batch++ = rq->fence.seqno; 238 1.1 riastrad *batch++ = MI_ARB_CHECK; 239 1.1 riastrad 240 1.1 riastrad memset(batch, 0, 1024); 241 1.1 riastrad batch += 1024 / sizeof(*batch); 242 1.1 riastrad 243 1.1 riastrad *batch++ = MI_ARB_CHECK; 244 1.1 riastrad *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 245 1.1 riastrad *batch++ = lower_32_bits(vma->node.start); 246 1.1 riastrad } else { 247 1.1 riastrad *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 248 1.1 riastrad *batch++ = lower_32_bits(hws_address(hws, rq)); 249 1.1 riastrad *batch++ = rq->fence.seqno; 250 1.1 riastrad *batch++ = MI_ARB_CHECK; 251 1.1 riastrad 252 1.1 riastrad memset(batch, 0, 1024); 253 1.1 riastrad batch += 1024 / sizeof(*batch); 254 1.1 riastrad 255 1.1 riastrad *batch++ = MI_ARB_CHECK; 256 1.1 riastrad *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 257 1.1 riastrad *batch++ = lower_32_bits(vma->node.start); 258 1.1 riastrad } 259 1.1 riastrad *batch++ = MI_BATCH_BUFFER_END; /* not reached */ 260 1.1 riastrad intel_gt_chipset_flush(engine->gt); 261 1.1 riastrad 262 1.1 riastrad if (rq->engine->emit_init_breadcrumb) { 263 1.1 riastrad err = rq->engine->emit_init_breadcrumb(rq); 264 1.1 riastrad if (err) 265 1.1 riastrad goto cancel_rq; 266 1.1 riastrad } 267 1.1 riastrad 268 1.1 riastrad flags = 0; 269 1.1 riastrad if (INTEL_GEN(gt->i915) <= 5) 270 1.1 riastrad flags |= I915_DISPATCH_SECURE; 271 1.1 riastrad 272 1.1 riastrad err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); 273 1.1 riastrad 274 1.1 riastrad cancel_rq: 275 1.1 riastrad if (err) { 276 1.1 riastrad i915_request_skip(rq, err); 277 1.1 riastrad i915_request_add(rq); 278 1.1 riastrad } 279 1.1 riastrad unpin_hws: 280 1.1 riastrad i915_vma_unpin(hws); 281 1.1 riastrad unpin_vma: 282 1.1 riastrad i915_vma_unpin(vma); 283 1.1 riastrad i915_vm_put(vm); 284 1.1 riastrad 
return err ? ERR_PTR(err) : rq; 285 1.1 riastrad } 286 1.1 riastrad 287 1.1 riastrad static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) 288 1.1 riastrad { 289 1.1 riastrad return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); 290 1.1 riastrad } 291 1.1 riastrad 292 1.1 riastrad static void hang_fini(struct hang *h) 293 1.1 riastrad { 294 1.1 riastrad *h->batch = MI_BATCH_BUFFER_END; 295 1.1 riastrad intel_gt_chipset_flush(h->gt); 296 1.1 riastrad 297 1.1 riastrad i915_gem_object_unpin_map(h->obj); 298 1.1 riastrad i915_gem_object_put(h->obj); 299 1.1 riastrad 300 1.1 riastrad i915_gem_object_unpin_map(h->hws); 301 1.1 riastrad i915_gem_object_put(h->hws); 302 1.1 riastrad 303 1.1 riastrad kernel_context_close(h->ctx); 304 1.1 riastrad 305 1.1 riastrad igt_flush_test(h->gt->i915); 306 1.1 riastrad } 307 1.1 riastrad 308 1.1 riastrad static bool wait_until_running(struct hang *h, struct i915_request *rq) 309 1.1 riastrad { 310 1.1 riastrad return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), 311 1.1 riastrad rq->fence.seqno), 312 1.1 riastrad 10) && 313 1.1 riastrad wait_for(i915_seqno_passed(hws_seqno(h, rq), 314 1.1 riastrad rq->fence.seqno), 315 1.1 riastrad 1000)); 316 1.1 riastrad } 317 1.1 riastrad 318 1.1 riastrad static void engine_heartbeat_disable(struct intel_engine_cs *engine, 319 1.1 riastrad unsigned long *saved) 320 1.1 riastrad { 321 1.1 riastrad *saved = engine->props.heartbeat_interval_ms; 322 1.1 riastrad engine->props.heartbeat_interval_ms = 0; 323 1.1 riastrad 324 1.1 riastrad intel_engine_pm_get(engine); 325 1.1 riastrad intel_engine_park_heartbeat(engine); 326 1.1 riastrad } 327 1.1 riastrad 328 1.1 riastrad static void engine_heartbeat_enable(struct intel_engine_cs *engine, 329 1.1 riastrad unsigned long saved) 330 1.1 riastrad { 331 1.1 riastrad intel_engine_pm_put(engine); 332 1.1 riastrad 333 1.1 riastrad engine->props.heartbeat_interval_ms = saved; 334 1.1 riastrad } 335 1.1 riastrad 336 1.1 
riastrad static int igt_hang_sanitycheck(void *arg) 337 1.1 riastrad { 338 1.1 riastrad struct intel_gt *gt = arg; 339 1.1 riastrad struct i915_request *rq; 340 1.1 riastrad struct intel_engine_cs *engine; 341 1.1 riastrad enum intel_engine_id id; 342 1.1 riastrad struct hang h; 343 1.1 riastrad int err; 344 1.1 riastrad 345 1.1 riastrad /* Basic check that we can execute our hanging batch */ 346 1.1 riastrad 347 1.1 riastrad err = hang_init(&h, gt); 348 1.1 riastrad if (err) 349 1.1 riastrad return err; 350 1.1 riastrad 351 1.1 riastrad for_each_engine(engine, gt, id) { 352 1.1 riastrad struct intel_wedge_me w; 353 1.1 riastrad long timeout; 354 1.1 riastrad 355 1.1 riastrad if (!intel_engine_can_store_dword(engine)) 356 1.1 riastrad continue; 357 1.1 riastrad 358 1.1 riastrad rq = hang_create_request(&h, engine); 359 1.1 riastrad if (IS_ERR(rq)) { 360 1.1 riastrad err = PTR_ERR(rq); 361 1.1 riastrad pr_err("Failed to create request for %s, err=%d\n", 362 1.1 riastrad engine->name, err); 363 1.1 riastrad goto fini; 364 1.1 riastrad } 365 1.1 riastrad 366 1.1 riastrad i915_request_get(rq); 367 1.1 riastrad 368 1.1 riastrad *h.batch = MI_BATCH_BUFFER_END; 369 1.1 riastrad intel_gt_chipset_flush(engine->gt); 370 1.1 riastrad 371 1.1 riastrad i915_request_add(rq); 372 1.1 riastrad 373 1.1 riastrad timeout = 0; 374 1.1 riastrad intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 375 1.1 riastrad timeout = i915_request_wait(rq, 0, 376 1.1 riastrad MAX_SCHEDULE_TIMEOUT); 377 1.1 riastrad if (intel_gt_is_wedged(gt)) 378 1.1 riastrad timeout = -EIO; 379 1.1 riastrad 380 1.1 riastrad i915_request_put(rq); 381 1.1 riastrad 382 1.1 riastrad if (timeout < 0) { 383 1.1 riastrad err = timeout; 384 1.1 riastrad pr_err("Wait for request failed on %s, err=%d\n", 385 1.1 riastrad engine->name, err); 386 1.1 riastrad goto fini; 387 1.1 riastrad } 388 1.1 riastrad } 389 1.1 riastrad 390 1.1 riastrad fini: 391 1.1 riastrad hang_fini(&h); 392 1.1 riastrad return err; 393 1.1 riastrad } 
394 1.1 riastrad 395 1.1 riastrad static bool wait_for_idle(struct intel_engine_cs *engine) 396 1.1 riastrad { 397 1.1 riastrad return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; 398 1.1 riastrad } 399 1.1 riastrad 400 1.1 riastrad static int igt_reset_nop(void *arg) 401 1.1 riastrad { 402 1.1 riastrad struct intel_gt *gt = arg; 403 1.1 riastrad struct i915_gpu_error *global = >->i915->gpu_error; 404 1.1 riastrad struct intel_engine_cs *engine; 405 1.1 riastrad unsigned int reset_count, count; 406 1.1 riastrad enum intel_engine_id id; 407 1.1 riastrad IGT_TIMEOUT(end_time); 408 1.1 riastrad int err = 0; 409 1.1 riastrad 410 1.1 riastrad /* Check that we can reset during non-user portions of requests */ 411 1.1 riastrad 412 1.1 riastrad reset_count = i915_reset_count(global); 413 1.1 riastrad count = 0; 414 1.1 riastrad do { 415 1.1 riastrad for_each_engine(engine, gt, id) { 416 1.1 riastrad struct intel_context *ce; 417 1.1 riastrad int i; 418 1.1 riastrad 419 1.1 riastrad ce = intel_context_create(engine); 420 1.1 riastrad if (IS_ERR(ce)) { 421 1.1 riastrad err = PTR_ERR(ce); 422 1.1 riastrad break; 423 1.1 riastrad } 424 1.1 riastrad 425 1.1 riastrad for (i = 0; i < 16; i++) { 426 1.1 riastrad struct i915_request *rq; 427 1.1 riastrad 428 1.1 riastrad rq = intel_context_create_request(ce); 429 1.1 riastrad if (IS_ERR(rq)) { 430 1.1 riastrad err = PTR_ERR(rq); 431 1.1 riastrad break; 432 1.1 riastrad } 433 1.1 riastrad 434 1.1 riastrad i915_request_add(rq); 435 1.1 riastrad } 436 1.1 riastrad 437 1.1 riastrad intel_context_put(ce); 438 1.1 riastrad } 439 1.1 riastrad 440 1.1 riastrad igt_global_reset_lock(gt); 441 1.1 riastrad intel_gt_reset(gt, ALL_ENGINES, NULL); 442 1.1 riastrad igt_global_reset_unlock(gt); 443 1.1 riastrad 444 1.1 riastrad if (intel_gt_is_wedged(gt)) { 445 1.1 riastrad err = -EIO; 446 1.1 riastrad break; 447 1.1 riastrad } 448 1.1 riastrad 449 1.1 riastrad if (i915_reset_count(global) != reset_count + ++count) { 450 1.1 
riastrad pr_err("Full GPU reset not recorded!\n"); 451 1.1 riastrad err = -EINVAL; 452 1.1 riastrad break; 453 1.1 riastrad } 454 1.1 riastrad 455 1.1 riastrad err = igt_flush_test(gt->i915); 456 1.1 riastrad if (err) 457 1.1 riastrad break; 458 1.1 riastrad } while (time_before(jiffies, end_time)); 459 1.1 riastrad pr_info("%s: %d resets\n", __func__, count); 460 1.1 riastrad 461 1.1 riastrad if (igt_flush_test(gt->i915)) 462 1.1 riastrad err = -EIO; 463 1.1 riastrad return err; 464 1.1 riastrad } 465 1.1 riastrad 466 1.1 riastrad static int igt_reset_nop_engine(void *arg) 467 1.1 riastrad { 468 1.1 riastrad struct intel_gt *gt = arg; 469 1.1 riastrad struct i915_gpu_error *global = >->i915->gpu_error; 470 1.1 riastrad struct intel_engine_cs *engine; 471 1.1 riastrad enum intel_engine_id id; 472 1.1 riastrad 473 1.1 riastrad /* Check that we can engine-reset during non-user portions */ 474 1.1 riastrad 475 1.1 riastrad if (!intel_has_reset_engine(gt)) 476 1.1 riastrad return 0; 477 1.1 riastrad 478 1.1 riastrad for_each_engine(engine, gt, id) { 479 1.1 riastrad unsigned int reset_count, reset_engine_count, count; 480 1.1 riastrad struct intel_context *ce; 481 1.1 riastrad unsigned long heartbeat; 482 1.1 riastrad IGT_TIMEOUT(end_time); 483 1.1 riastrad int err; 484 1.1 riastrad 485 1.1 riastrad ce = intel_context_create(engine); 486 1.1 riastrad if (IS_ERR(ce)) 487 1.1 riastrad return PTR_ERR(ce); 488 1.1 riastrad 489 1.1 riastrad reset_count = i915_reset_count(global); 490 1.1 riastrad reset_engine_count = i915_reset_engine_count(global, engine); 491 1.1 riastrad count = 0; 492 1.1 riastrad 493 1.1 riastrad engine_heartbeat_disable(engine, &heartbeat); 494 1.1 riastrad set_bit(I915_RESET_ENGINE + id, >->reset.flags); 495 1.1 riastrad do { 496 1.1 riastrad int i; 497 1.1 riastrad 498 1.1 riastrad if (!wait_for_idle(engine)) { 499 1.1 riastrad pr_err("%s failed to idle before reset\n", 500 1.1 riastrad engine->name); 501 1.1 riastrad err = -EIO; 502 1.1 riastrad 
break; 503 1.1 riastrad } 504 1.1 riastrad 505 1.1 riastrad for (i = 0; i < 16; i++) { 506 1.1 riastrad struct i915_request *rq; 507 1.1 riastrad 508 1.1 riastrad rq = intel_context_create_request(ce); 509 1.1 riastrad if (IS_ERR(rq)) { 510 1.1 riastrad err = PTR_ERR(rq); 511 1.1 riastrad break; 512 1.1 riastrad } 513 1.1 riastrad 514 1.1 riastrad i915_request_add(rq); 515 1.1 riastrad } 516 1.1 riastrad err = intel_engine_reset(engine, NULL); 517 1.1 riastrad if (err) { 518 1.1 riastrad pr_err("i915_reset_engine failed\n"); 519 1.1 riastrad break; 520 1.1 riastrad } 521 1.1 riastrad 522 1.1 riastrad if (i915_reset_count(global) != reset_count) { 523 1.1 riastrad pr_err("Full GPU reset recorded! (engine reset expected)\n"); 524 1.1 riastrad err = -EINVAL; 525 1.1 riastrad break; 526 1.1 riastrad } 527 1.1 riastrad 528 1.1 riastrad if (i915_reset_engine_count(global, engine) != 529 1.1 riastrad reset_engine_count + ++count) { 530 1.1 riastrad pr_err("%s engine reset not recorded!\n", 531 1.1 riastrad engine->name); 532 1.1 riastrad err = -EINVAL; 533 1.1 riastrad break; 534 1.1 riastrad } 535 1.1 riastrad } while (time_before(jiffies, end_time)); 536 1.1 riastrad clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 537 1.1 riastrad engine_heartbeat_enable(engine, heartbeat); 538 1.1 riastrad 539 1.1 riastrad pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 540 1.1 riastrad 541 1.1 riastrad intel_context_put(ce); 542 1.1 riastrad if (igt_flush_test(gt->i915)) 543 1.1 riastrad err = -EIO; 544 1.1 riastrad if (err) 545 1.1 riastrad return err; 546 1.1 riastrad } 547 1.1 riastrad 548 1.1 riastrad return 0; 549 1.1 riastrad } 550 1.1 riastrad 551 1.1 riastrad static int __igt_reset_engine(struct intel_gt *gt, bool active) 552 1.1 riastrad { 553 1.1 riastrad struct i915_gpu_error *global = >->i915->gpu_error; 554 1.1 riastrad struct intel_engine_cs *engine; 555 1.1 riastrad enum intel_engine_id id; 556 1.1 riastrad struct hang h; 557 1.1 riastrad int err = 0; 
558 1.1 riastrad 559 1.1 riastrad /* Check that we can issue an engine reset on an idle engine (no-op) */ 560 1.1 riastrad 561 1.1 riastrad if (!intel_has_reset_engine(gt)) 562 1.1 riastrad return 0; 563 1.1 riastrad 564 1.1 riastrad if (active) { 565 1.1 riastrad err = hang_init(&h, gt); 566 1.1 riastrad if (err) 567 1.1 riastrad return err; 568 1.1 riastrad } 569 1.1 riastrad 570 1.1 riastrad for_each_engine(engine, gt, id) { 571 1.1 riastrad unsigned int reset_count, reset_engine_count; 572 1.1 riastrad unsigned long heartbeat; 573 1.1 riastrad IGT_TIMEOUT(end_time); 574 1.1 riastrad 575 1.1 riastrad if (active && !intel_engine_can_store_dword(engine)) 576 1.1 riastrad continue; 577 1.1 riastrad 578 1.1 riastrad if (!wait_for_idle(engine)) { 579 1.1 riastrad pr_err("%s failed to idle before reset\n", 580 1.1 riastrad engine->name); 581 1.1 riastrad err = -EIO; 582 1.1 riastrad break; 583 1.1 riastrad } 584 1.1 riastrad 585 1.1 riastrad reset_count = i915_reset_count(global); 586 1.1 riastrad reset_engine_count = i915_reset_engine_count(global, engine); 587 1.1 riastrad 588 1.1 riastrad engine_heartbeat_disable(engine, &heartbeat); 589 1.1 riastrad set_bit(I915_RESET_ENGINE + id, >->reset.flags); 590 1.1 riastrad do { 591 1.1 riastrad if (active) { 592 1.1 riastrad struct i915_request *rq; 593 1.1 riastrad 594 1.1 riastrad rq = hang_create_request(&h, engine); 595 1.1 riastrad if (IS_ERR(rq)) { 596 1.1 riastrad err = PTR_ERR(rq); 597 1.1 riastrad break; 598 1.1 riastrad } 599 1.1 riastrad 600 1.1 riastrad i915_request_get(rq); 601 1.1 riastrad i915_request_add(rq); 602 1.1 riastrad 603 1.1 riastrad if (!wait_until_running(&h, rq)) { 604 1.1 riastrad struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 605 1.1 riastrad 606 1.1 riastrad pr_err("%s: Failed to start request %llx, at %x\n", 607 1.1 riastrad __func__, rq->fence.seqno, hws_seqno(&h, rq)); 608 1.1 riastrad intel_engine_dump(engine, &p, 609 1.1 riastrad "%s\n", engine->name); 610 1.1 riastrad 611 
1.1 riastrad i915_request_put(rq); 612 1.1 riastrad err = -EIO; 613 1.1 riastrad break; 614 1.1 riastrad } 615 1.1 riastrad 616 1.1 riastrad i915_request_put(rq); 617 1.1 riastrad } 618 1.1 riastrad 619 1.1 riastrad err = intel_engine_reset(engine, NULL); 620 1.1 riastrad if (err) { 621 1.1 riastrad pr_err("i915_reset_engine failed\n"); 622 1.1 riastrad break; 623 1.1 riastrad } 624 1.1 riastrad 625 1.1 riastrad if (i915_reset_count(global) != reset_count) { 626 1.1 riastrad pr_err("Full GPU reset recorded! (engine reset expected)\n"); 627 1.1 riastrad err = -EINVAL; 628 1.1 riastrad break; 629 1.1 riastrad } 630 1.1 riastrad 631 1.1 riastrad if (i915_reset_engine_count(global, engine) != 632 1.1 riastrad ++reset_engine_count) { 633 1.1 riastrad pr_err("%s engine reset not recorded!\n", 634 1.1 riastrad engine->name); 635 1.1 riastrad err = -EINVAL; 636 1.1 riastrad break; 637 1.1 riastrad } 638 1.1 riastrad } while (time_before(jiffies, end_time)); 639 1.1 riastrad clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 640 1.1 riastrad engine_heartbeat_enable(engine, heartbeat); 641 1.1 riastrad 642 1.1 riastrad if (err) 643 1.1 riastrad break; 644 1.1 riastrad 645 1.1 riastrad err = igt_flush_test(gt->i915); 646 1.1 riastrad if (err) 647 1.1 riastrad break; 648 1.1 riastrad } 649 1.1 riastrad 650 1.1 riastrad if (intel_gt_is_wedged(gt)) 651 1.1 riastrad err = -EIO; 652 1.1 riastrad 653 1.1 riastrad if (active) 654 1.1 riastrad hang_fini(&h); 655 1.1 riastrad 656 1.1 riastrad return err; 657 1.1 riastrad } 658 1.1 riastrad 659 1.1 riastrad static int igt_reset_idle_engine(void *arg) 660 1.1 riastrad { 661 1.1 riastrad return __igt_reset_engine(arg, false); 662 1.1 riastrad } 663 1.1 riastrad 664 1.1 riastrad static int igt_reset_active_engine(void *arg) 665 1.1 riastrad { 666 1.1 riastrad return __igt_reset_engine(arg, true); 667 1.1 riastrad } 668 1.1 riastrad 669 1.1 riastrad struct active_engine { 670 1.1 riastrad struct task_struct *task; 671 1.1 riastrad struct 
intel_engine_cs *engine; 672 1.1 riastrad unsigned long resets; 673 1.1 riastrad unsigned int flags; 674 1.1 riastrad }; 675 1.1 riastrad 676 1.1 riastrad #define TEST_ACTIVE BIT(0) 677 1.1 riastrad #define TEST_OTHERS BIT(1) 678 1.1 riastrad #define TEST_SELF BIT(2) 679 1.1 riastrad #define TEST_PRIORITY BIT(3) 680 1.1 riastrad 681 1.1 riastrad static int active_request_put(struct i915_request *rq) 682 1.1 riastrad { 683 1.1 riastrad int err = 0; 684 1.1 riastrad 685 1.1 riastrad if (!rq) 686 1.1 riastrad return 0; 687 1.1 riastrad 688 1.1 riastrad if (i915_request_wait(rq, 0, 5 * HZ) < 0) { 689 1.1 riastrad GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 690 1.1 riastrad rq->engine->name, 691 1.1 riastrad rq->fence.context, 692 1.1 riastrad rq->fence.seqno); 693 1.1 riastrad GEM_TRACE_DUMP(); 694 1.1 riastrad 695 1.1 riastrad intel_gt_set_wedged(rq->engine->gt); 696 1.1 riastrad err = -EIO; 697 1.1 riastrad } 698 1.1 riastrad 699 1.1 riastrad i915_request_put(rq); 700 1.1 riastrad 701 1.1 riastrad return err; 702 1.1 riastrad } 703 1.1 riastrad 704 1.1 riastrad static int active_engine(void *data) 705 1.1 riastrad { 706 1.1 riastrad I915_RND_STATE(prng); 707 1.1 riastrad struct active_engine *arg = data; 708 1.1 riastrad struct intel_engine_cs *engine = arg->engine; 709 1.1 riastrad struct i915_request *rq[8] = {}; 710 1.1 riastrad struct intel_context *ce[ARRAY_SIZE(rq)]; 711 1.1 riastrad unsigned long count; 712 1.1 riastrad int err = 0; 713 1.1 riastrad 714 1.1 riastrad for (count = 0; count < ARRAY_SIZE(ce); count++) { 715 1.1 riastrad ce[count] = intel_context_create(engine); 716 1.1 riastrad if (IS_ERR(ce[count])) { 717 1.1 riastrad err = PTR_ERR(ce[count]); 718 1.1 riastrad while (--count) 719 1.1 riastrad intel_context_put(ce[count]); 720 1.1 riastrad return err; 721 1.1 riastrad } 722 1.1 riastrad } 723 1.1 riastrad 724 1.1 riastrad count = 0; 725 1.1 riastrad while (!kthread_should_stop()) { 726 1.1 riastrad unsigned int idx = 
count++ & (ARRAY_SIZE(rq) - 1); 727 1.1 riastrad struct i915_request *old = rq[idx]; 728 1.1 riastrad struct i915_request *new; 729 1.1 riastrad 730 1.1 riastrad new = intel_context_create_request(ce[idx]); 731 1.1 riastrad if (IS_ERR(new)) { 732 1.1 riastrad err = PTR_ERR(new); 733 1.1 riastrad break; 734 1.1 riastrad } 735 1.1 riastrad 736 1.1 riastrad rq[idx] = i915_request_get(new); 737 1.1 riastrad i915_request_add(new); 738 1.1 riastrad 739 1.1 riastrad if (engine->schedule && arg->flags & TEST_PRIORITY) { 740 1.1 riastrad struct i915_sched_attr attr = { 741 1.1 riastrad .priority = 742 1.1 riastrad i915_prandom_u32_max_state(512, &prng), 743 1.1 riastrad }; 744 1.1 riastrad engine->schedule(rq[idx], &attr); 745 1.1 riastrad } 746 1.1 riastrad 747 1.1 riastrad err = active_request_put(old); 748 1.1 riastrad if (err) 749 1.1 riastrad break; 750 1.1 riastrad 751 1.1 riastrad cond_resched(); 752 1.1 riastrad } 753 1.1 riastrad 754 1.1 riastrad for (count = 0; count < ARRAY_SIZE(rq); count++) { 755 1.1 riastrad int err__ = active_request_put(rq[count]); 756 1.1 riastrad 757 1.1 riastrad /* Keep the first error */ 758 1.1 riastrad if (!err) 759 1.1 riastrad err = err__; 760 1.1 riastrad 761 1.1 riastrad intel_context_put(ce[count]); 762 1.1 riastrad } 763 1.1 riastrad 764 1.1 riastrad return err; 765 1.1 riastrad } 766 1.1 riastrad 767 1.1 riastrad static int __igt_reset_engines(struct intel_gt *gt, 768 1.1 riastrad const char *test_name, 769 1.1 riastrad unsigned int flags) 770 1.1 riastrad { 771 1.1 riastrad struct i915_gpu_error *global = >->i915->gpu_error; 772 1.1 riastrad struct intel_engine_cs *engine, *other; 773 1.1 riastrad enum intel_engine_id id, tmp; 774 1.1 riastrad struct hang h; 775 1.1 riastrad int err = 0; 776 1.1 riastrad 777 1.1 riastrad /* Check that issuing a reset on one engine does not interfere 778 1.1 riastrad * with any other engine. 
779 1.1 riastrad */ 780 1.1 riastrad 781 1.1 riastrad if (!intel_has_reset_engine(gt)) 782 1.1 riastrad return 0; 783 1.1 riastrad 784 1.1 riastrad if (flags & TEST_ACTIVE) { 785 1.1 riastrad err = hang_init(&h, gt); 786 1.1 riastrad if (err) 787 1.1 riastrad return err; 788 1.1 riastrad 789 1.1 riastrad if (flags & TEST_PRIORITY) 790 1.1 riastrad h.ctx->sched.priority = 1024; 791 1.1 riastrad } 792 1.1 riastrad 793 1.1 riastrad for_each_engine(engine, gt, id) { 794 1.1 riastrad struct active_engine threads[I915_NUM_ENGINES] = {}; 795 1.1 riastrad unsigned long device = i915_reset_count(global); 796 1.1 riastrad unsigned long count = 0, reported; 797 1.1 riastrad unsigned long heartbeat; 798 1.1 riastrad IGT_TIMEOUT(end_time); 799 1.1 riastrad 800 1.1 riastrad if (flags & TEST_ACTIVE && 801 1.1 riastrad !intel_engine_can_store_dword(engine)) 802 1.1 riastrad continue; 803 1.1 riastrad 804 1.1 riastrad if (!wait_for_idle(engine)) { 805 1.1 riastrad pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", 806 1.1 riastrad engine->name, test_name); 807 1.1 riastrad err = -EIO; 808 1.1 riastrad break; 809 1.1 riastrad } 810 1.1 riastrad 811 1.1 riastrad memset(threads, 0, sizeof(threads)); 812 1.1 riastrad for_each_engine(other, gt, tmp) { 813 1.1 riastrad struct task_struct *tsk; 814 1.1 riastrad 815 1.1 riastrad threads[tmp].resets = 816 1.1 riastrad i915_reset_engine_count(global, other); 817 1.1 riastrad 818 1.1 riastrad if (!(flags & TEST_OTHERS)) 819 1.1 riastrad continue; 820 1.1 riastrad 821 1.1 riastrad if (other == engine && !(flags & TEST_SELF)) 822 1.1 riastrad continue; 823 1.1 riastrad 824 1.1 riastrad threads[tmp].engine = other; 825 1.1 riastrad threads[tmp].flags = flags; 826 1.1 riastrad 827 1.1 riastrad tsk = kthread_run(active_engine, &threads[tmp], 828 1.1 riastrad "igt/%s", other->name); 829 1.1 riastrad if (IS_ERR(tsk)) { 830 1.1 riastrad err = PTR_ERR(tsk); 831 1.1 riastrad goto unwind; 832 1.1 riastrad } 833 1.1 riastrad 834 1.1 
riastrad threads[tmp].task = tsk; 835 1.1 riastrad get_task_struct(tsk); 836 1.1 riastrad } 837 1.1 riastrad 838 1.1 riastrad yield(); /* start all threads before we begin */ 839 1.1 riastrad 840 1.1 riastrad engine_heartbeat_disable(engine, &heartbeat); 841 1.1 riastrad set_bit(I915_RESET_ENGINE + id, >->reset.flags); 842 1.1 riastrad do { 843 1.1 riastrad struct i915_request *rq = NULL; 844 1.1 riastrad 845 1.1 riastrad if (flags & TEST_ACTIVE) { 846 1.1 riastrad rq = hang_create_request(&h, engine); 847 1.1 riastrad if (IS_ERR(rq)) { 848 1.1 riastrad err = PTR_ERR(rq); 849 1.1 riastrad break; 850 1.1 riastrad } 851 1.1 riastrad 852 1.1 riastrad i915_request_get(rq); 853 1.1 riastrad i915_request_add(rq); 854 1.1 riastrad 855 1.1 riastrad if (!wait_until_running(&h, rq)) { 856 1.1 riastrad struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 857 1.1 riastrad 858 1.1 riastrad pr_err("%s: Failed to start request %llx, at %x\n", 859 1.1 riastrad __func__, rq->fence.seqno, hws_seqno(&h, rq)); 860 1.1 riastrad intel_engine_dump(engine, &p, 861 1.1 riastrad "%s\n", engine->name); 862 1.1 riastrad 863 1.1 riastrad i915_request_put(rq); 864 1.1 riastrad err = -EIO; 865 1.1 riastrad break; 866 1.1 riastrad } 867 1.1 riastrad } 868 1.1 riastrad 869 1.1 riastrad err = intel_engine_reset(engine, NULL); 870 1.1 riastrad if (err) { 871 1.1 riastrad pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", 872 1.1 riastrad engine->name, test_name, err); 873 1.1 riastrad break; 874 1.1 riastrad } 875 1.1 riastrad 876 1.1 riastrad count++; 877 1.1 riastrad 878 1.1 riastrad if (rq) { 879 1.1 riastrad if (i915_request_wait(rq, 0, HZ / 5) < 0) { 880 1.1 riastrad struct drm_printer p = 881 1.1 riastrad drm_info_printer(gt->i915->drm.dev); 882 1.1 riastrad 883 1.1 riastrad pr_err("i915_reset_engine(%s:%s):" 884 1.1 riastrad " failed to complete request after reset\n", 885 1.1 riastrad engine->name, test_name); 886 1.1 riastrad intel_engine_dump(engine, &p, 887 1.1 riastrad "%s\n", 
engine->name); 888 1.1 riastrad i915_request_put(rq); 889 1.1 riastrad 890 1.1 riastrad GEM_TRACE_DUMP(); 891 1.1 riastrad intel_gt_set_wedged(gt); 892 1.1 riastrad err = -EIO; 893 1.1 riastrad break; 894 1.1 riastrad } 895 1.1 riastrad 896 1.1 riastrad i915_request_put(rq); 897 1.1 riastrad } 898 1.1 riastrad 899 1.1 riastrad if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { 900 1.1 riastrad struct drm_printer p = 901 1.1 riastrad drm_info_printer(gt->i915->drm.dev); 902 1.1 riastrad 903 1.1 riastrad pr_err("i915_reset_engine(%s:%s):" 904 1.1 riastrad " failed to idle after reset\n", 905 1.1 riastrad engine->name, test_name); 906 1.1 riastrad intel_engine_dump(engine, &p, 907 1.1 riastrad "%s\n", engine->name); 908 1.1 riastrad 909 1.1 riastrad err = -EIO; 910 1.1 riastrad break; 911 1.1 riastrad } 912 1.1 riastrad } while (time_before(jiffies, end_time)); 913 1.1 riastrad clear_bit(I915_RESET_ENGINE + id, >->reset.flags); 914 1.1 riastrad engine_heartbeat_enable(engine, heartbeat); 915 1.1 riastrad 916 1.1 riastrad pr_info("i915_reset_engine(%s:%s): %lu resets\n", 917 1.1 riastrad engine->name, test_name, count); 918 1.1 riastrad 919 1.1 riastrad reported = i915_reset_engine_count(global, engine); 920 1.1 riastrad reported -= threads[engine->id].resets; 921 1.1 riastrad if (reported != count) { 922 1.1 riastrad pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", 923 1.1 riastrad engine->name, test_name, count, reported); 924 1.1 riastrad if (!err) 925 1.1 riastrad err = -EINVAL; 926 1.1 riastrad } 927 1.1 riastrad 928 1.1 riastrad unwind: 929 1.1 riastrad for_each_engine(other, gt, tmp) { 930 1.1 riastrad int ret; 931 1.1 riastrad 932 1.1 riastrad if (!threads[tmp].task) 933 1.1 riastrad continue; 934 1.1 riastrad 935 1.1 riastrad ret = kthread_stop(threads[tmp].task); 936 1.1 riastrad if (ret) { 937 1.1 riastrad pr_err("kthread for other engine %s failed, err=%d\n", 938 1.1 riastrad other->name, ret); 939 1.1 riastrad if (!err) 940 1.1 
riastrad err = ret; 941 1.1 riastrad } 942 1.1 riastrad put_task_struct(threads[tmp].task); 943 1.1 riastrad 944 1.1 riastrad if (other->uabi_class != engine->uabi_class && 945 1.1 riastrad threads[tmp].resets != 946 1.1 riastrad i915_reset_engine_count(global, other)) { 947 1.1 riastrad pr_err("Innocent engine %s was reset (count=%ld)\n", 948 1.1 riastrad other->name, 949 1.1 riastrad i915_reset_engine_count(global, other) - 950 1.1 riastrad threads[tmp].resets); 951 1.1 riastrad if (!err) 952 1.1 riastrad err = -EINVAL; 953 1.1 riastrad } 954 1.1 riastrad } 955 1.1 riastrad 956 1.1 riastrad if (device != i915_reset_count(global)) { 957 1.1 riastrad pr_err("Global reset (count=%ld)!\n", 958 1.1 riastrad i915_reset_count(global) - device); 959 1.1 riastrad if (!err) 960 1.1 riastrad err = -EINVAL; 961 1.1 riastrad } 962 1.1 riastrad 963 1.1 riastrad if (err) 964 1.1 riastrad break; 965 1.1 riastrad 966 1.1 riastrad err = igt_flush_test(gt->i915); 967 1.1 riastrad if (err) 968 1.1 riastrad break; 969 1.1 riastrad } 970 1.1 riastrad 971 1.1 riastrad if (intel_gt_is_wedged(gt)) 972 1.1 riastrad err = -EIO; 973 1.1 riastrad 974 1.1 riastrad if (flags & TEST_ACTIVE) 975 1.1 riastrad hang_fini(&h); 976 1.1 riastrad 977 1.1 riastrad return err; 978 1.1 riastrad } 979 1.1 riastrad 980 1.1 riastrad static int igt_reset_engines(void *arg) 981 1.1 riastrad { 982 1.1 riastrad static const struct { 983 1.1 riastrad const char *name; 984 1.1 riastrad unsigned int flags; 985 1.1 riastrad } phases[] = { 986 1.1 riastrad { "idle", 0 }, 987 1.1 riastrad { "active", TEST_ACTIVE }, 988 1.1 riastrad { "others-idle", TEST_OTHERS }, 989 1.1 riastrad { "others-active", TEST_OTHERS | TEST_ACTIVE }, 990 1.1 riastrad { 991 1.1 riastrad "others-priority", 992 1.1 riastrad TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY 993 1.1 riastrad }, 994 1.1 riastrad { 995 1.1 riastrad "self-priority", 996 1.1 riastrad TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, 997 1.1 riastrad }, 998 1.1 riastrad 
{ } 999 1.1 riastrad }; 1000 1.1 riastrad struct intel_gt *gt = arg; 1001 1.1 riastrad typeof(*phases) *p; 1002 1.1 riastrad int err; 1003 1.1 riastrad 1004 1.1 riastrad for (p = phases; p->name; p++) { 1005 1.1 riastrad if (p->flags & TEST_PRIORITY) { 1006 1.1 riastrad if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) 1007 1.1 riastrad continue; 1008 1.1 riastrad } 1009 1.1 riastrad 1010 1.1 riastrad err = __igt_reset_engines(arg, p->name, p->flags); 1011 1.1 riastrad if (err) 1012 1.1 riastrad return err; 1013 1.1 riastrad } 1014 1.1 riastrad 1015 1.1 riastrad return 0; 1016 1.1 riastrad } 1017 1.1 riastrad 1018 1.1 riastrad static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask) 1019 1.1 riastrad { 1020 1.1 riastrad u32 count = i915_reset_count(>->i915->gpu_error); 1021 1.1 riastrad 1022 1.1 riastrad intel_gt_reset(gt, mask, NULL); 1023 1.1 riastrad 1024 1.1 riastrad return count; 1025 1.1 riastrad } 1026 1.1 riastrad 1027 1.1 riastrad static int igt_reset_wait(void *arg) 1028 1.1 riastrad { 1029 1.1 riastrad struct intel_gt *gt = arg; 1030 1.1 riastrad struct i915_gpu_error *global = >->i915->gpu_error; 1031 1.1 riastrad struct intel_engine_cs *engine = gt->engine[RCS0]; 1032 1.1 riastrad struct i915_request *rq; 1033 1.1 riastrad unsigned int reset_count; 1034 1.1 riastrad struct hang h; 1035 1.1 riastrad long timeout; 1036 1.1 riastrad int err; 1037 1.1 riastrad 1038 1.1 riastrad if (!engine || !intel_engine_can_store_dword(engine)) 1039 1.1 riastrad return 0; 1040 1.1 riastrad 1041 1.1 riastrad /* Check that we detect a stuck waiter and issue a reset */ 1042 1.1 riastrad 1043 1.1 riastrad igt_global_reset_lock(gt); 1044 1.1 riastrad 1045 1.1 riastrad err = hang_init(&h, gt); 1046 1.1 riastrad if (err) 1047 1.1 riastrad goto unlock; 1048 1.1 riastrad 1049 1.1 riastrad rq = hang_create_request(&h, engine); 1050 1.1 riastrad if (IS_ERR(rq)) { 1051 1.1 riastrad err = PTR_ERR(rq); 1052 1.1 riastrad goto fini; 1053 1.1 riastrad } 1054 
1.1 riastrad 1055 1.1 riastrad i915_request_get(rq); 1056 1.1 riastrad i915_request_add(rq); 1057 1.1 riastrad 1058 1.1 riastrad if (!wait_until_running(&h, rq)) { 1059 1.1 riastrad struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1060 1.1 riastrad 1061 1.1 riastrad pr_err("%s: Failed to start request %llx, at %x\n", 1062 1.1 riastrad __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1063 1.1 riastrad intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1064 1.1 riastrad 1065 1.1 riastrad intel_gt_set_wedged(gt); 1066 1.1 riastrad 1067 1.1 riastrad err = -EIO; 1068 1.1 riastrad goto out_rq; 1069 1.1 riastrad } 1070 1.1 riastrad 1071 1.1 riastrad reset_count = fake_hangcheck(gt, ALL_ENGINES); 1072 1.1 riastrad 1073 1.1 riastrad timeout = i915_request_wait(rq, 0, 10); 1074 1.1 riastrad if (timeout < 0) { 1075 1.1 riastrad pr_err("i915_request_wait failed on a stuck request: err=%ld\n", 1076 1.1 riastrad timeout); 1077 1.1 riastrad err = timeout; 1078 1.1 riastrad goto out_rq; 1079 1.1 riastrad } 1080 1.1 riastrad 1081 1.1 riastrad if (i915_reset_count(global) == reset_count) { 1082 1.1 riastrad pr_err("No GPU reset recorded!\n"); 1083 1.1 riastrad err = -EINVAL; 1084 1.1 riastrad goto out_rq; 1085 1.1 riastrad } 1086 1.1 riastrad 1087 1.1 riastrad out_rq: 1088 1.1 riastrad i915_request_put(rq); 1089 1.1 riastrad fini: 1090 1.1 riastrad hang_fini(&h); 1091 1.1 riastrad unlock: 1092 1.1 riastrad igt_global_reset_unlock(gt); 1093 1.1 riastrad 1094 1.1 riastrad if (intel_gt_is_wedged(gt)) 1095 1.1 riastrad return -EIO; 1096 1.1 riastrad 1097 1.1 riastrad return err; 1098 1.1 riastrad } 1099 1.1 riastrad 1100 1.1 riastrad struct evict_vma { 1101 1.1 riastrad struct completion completion; 1102 1.1 riastrad struct i915_vma *vma; 1103 1.1 riastrad }; 1104 1.1 riastrad 1105 1.1 riastrad static int evict_vma(void *data) 1106 1.1 riastrad { 1107 1.1 riastrad struct evict_vma *arg = data; 1108 1.1 riastrad struct i915_address_space *vm = arg->vma->vm; 1109 1.1 
riastrad struct drm_mm_node evict = arg->vma->node; 1110 1.1 riastrad int err; 1111 1.1 riastrad 1112 1.1 riastrad complete(&arg->completion); 1113 1.1 riastrad 1114 1.1 riastrad mutex_lock(&vm->mutex); 1115 1.1 riastrad err = i915_gem_evict_for_node(vm, &evict, 0); 1116 1.1 riastrad mutex_unlock(&vm->mutex); 1117 1.1 riastrad 1118 1.1 riastrad return err; 1119 1.1 riastrad } 1120 1.1 riastrad 1121 1.1 riastrad static int evict_fence(void *data) 1122 1.1 riastrad { 1123 1.1 riastrad struct evict_vma *arg = data; 1124 1.1 riastrad int err; 1125 1.1 riastrad 1126 1.1 riastrad complete(&arg->completion); 1127 1.1 riastrad 1128 1.1 riastrad /* Mark the fence register as dirty to force the mmio update. */ 1129 1.1 riastrad err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1130 1.1 riastrad if (err) { 1131 1.1 riastrad pr_err("Invalid Y-tiling settings; err:%d\n", err); 1132 1.1 riastrad return err; 1133 1.1 riastrad } 1134 1.1 riastrad 1135 1.1 riastrad err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); 1136 1.1 riastrad if (err) { 1137 1.1 riastrad pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); 1138 1.1 riastrad return err; 1139 1.1 riastrad } 1140 1.1 riastrad 1141 1.1 riastrad err = i915_vma_pin_fence(arg->vma); 1142 1.1 riastrad i915_vma_unpin(arg->vma); 1143 1.1 riastrad if (err) { 1144 1.1 riastrad pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1145 1.1 riastrad return err; 1146 1.1 riastrad } 1147 1.1 riastrad 1148 1.1 riastrad i915_vma_unpin_fence(arg->vma); 1149 1.1 riastrad 1150 1.1 riastrad return 0; 1151 1.1 riastrad } 1152 1.1 riastrad 1153 1.1 riastrad static int __igt_reset_evict_vma(struct intel_gt *gt, 1154 1.1 riastrad struct i915_address_space *vm, 1155 1.1 riastrad int (*fn)(void *), 1156 1.1 riastrad unsigned int flags) 1157 1.1 riastrad { 1158 1.1 riastrad struct intel_engine_cs *engine = gt->engine[RCS0]; 1159 1.1 riastrad struct drm_i915_gem_object *obj; 1160 1.1 riastrad struct task_struct 
*tsk = NULL; 1161 1.1 riastrad struct i915_request *rq; 1162 1.1 riastrad struct evict_vma arg; 1163 1.1 riastrad struct hang h; 1164 1.1 riastrad unsigned int pin_flags; 1165 1.1 riastrad int err; 1166 1.1 riastrad 1167 1.1 riastrad if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) 1168 1.1 riastrad return 0; 1169 1.1 riastrad 1170 1.1 riastrad if (!engine || !intel_engine_can_store_dword(engine)) 1171 1.1 riastrad return 0; 1172 1.1 riastrad 1173 1.1 riastrad /* Check that we can recover an unbind stuck on a hanging request */ 1174 1.1 riastrad 1175 1.1 riastrad err = hang_init(&h, gt); 1176 1.1 riastrad if (err) 1177 1.1 riastrad return err; 1178 1.1 riastrad 1179 1.1 riastrad obj = i915_gem_object_create_internal(gt->i915, SZ_1M); 1180 1.1 riastrad if (IS_ERR(obj)) { 1181 1.1 riastrad err = PTR_ERR(obj); 1182 1.1 riastrad goto fini; 1183 1.1 riastrad } 1184 1.1 riastrad 1185 1.1 riastrad if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1186 1.1 riastrad err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1187 1.1 riastrad if (err) { 1188 1.1 riastrad pr_err("Invalid X-tiling settings; err:%d\n", err); 1189 1.1 riastrad goto out_obj; 1190 1.1 riastrad } 1191 1.1 riastrad } 1192 1.1 riastrad 1193 1.1 riastrad arg.vma = i915_vma_instance(obj, vm, NULL); 1194 1.1 riastrad if (IS_ERR(arg.vma)) { 1195 1.1 riastrad err = PTR_ERR(arg.vma); 1196 1.1 riastrad goto out_obj; 1197 1.1 riastrad } 1198 1.1 riastrad 1199 1.1 riastrad rq = hang_create_request(&h, engine); 1200 1.1 riastrad if (IS_ERR(rq)) { 1201 1.1 riastrad err = PTR_ERR(rq); 1202 1.1 riastrad goto out_obj; 1203 1.1 riastrad } 1204 1.1 riastrad 1205 1.1 riastrad pin_flags = i915_vma_is_ggtt(arg.vma) ? 
PIN_GLOBAL : PIN_USER; 1206 1.1 riastrad 1207 1.1 riastrad if (flags & EXEC_OBJECT_NEEDS_FENCE) 1208 1.1 riastrad pin_flags |= PIN_MAPPABLE; 1209 1.1 riastrad 1210 1.1 riastrad err = i915_vma_pin(arg.vma, 0, 0, pin_flags); 1211 1.1 riastrad if (err) { 1212 1.1 riastrad i915_request_add(rq); 1213 1.1 riastrad goto out_obj; 1214 1.1 riastrad } 1215 1.1 riastrad 1216 1.1 riastrad if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1217 1.1 riastrad err = i915_vma_pin_fence(arg.vma); 1218 1.1 riastrad if (err) { 1219 1.1 riastrad pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1220 1.1 riastrad i915_vma_unpin(arg.vma); 1221 1.1 riastrad i915_request_add(rq); 1222 1.1 riastrad goto out_obj; 1223 1.1 riastrad } 1224 1.1 riastrad } 1225 1.1 riastrad 1226 1.1 riastrad i915_vma_lock(arg.vma); 1227 1.1 riastrad err = i915_request_await_object(rq, arg.vma->obj, 1228 1.1 riastrad flags & EXEC_OBJECT_WRITE); 1229 1.1 riastrad if (err == 0) 1230 1.1 riastrad err = i915_vma_move_to_active(arg.vma, rq, flags); 1231 1.1 riastrad i915_vma_unlock(arg.vma); 1232 1.1 riastrad 1233 1.1 riastrad if (flags & EXEC_OBJECT_NEEDS_FENCE) 1234 1.1 riastrad i915_vma_unpin_fence(arg.vma); 1235 1.1 riastrad i915_vma_unpin(arg.vma); 1236 1.1 riastrad 1237 1.1 riastrad i915_request_get(rq); 1238 1.1 riastrad i915_request_add(rq); 1239 1.1 riastrad if (err) 1240 1.1 riastrad goto out_rq; 1241 1.1 riastrad 1242 1.1 riastrad if (!wait_until_running(&h, rq)) { 1243 1.1 riastrad struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1244 1.1 riastrad 1245 1.1 riastrad pr_err("%s: Failed to start request %llx, at %x\n", 1246 1.1 riastrad __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1247 1.1 riastrad intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1248 1.1 riastrad 1249 1.1 riastrad intel_gt_set_wedged(gt); 1250 1.1 riastrad goto out_reset; 1251 1.1 riastrad } 1252 1.1 riastrad 1253 1.1 riastrad init_completion(&arg.completion); 1254 1.1 riastrad 1255 1.1 riastrad tsk = kthread_run(fn, &arg, 
"igt/evict_vma"); 1256 1.1 riastrad if (IS_ERR(tsk)) { 1257 1.1 riastrad err = PTR_ERR(tsk); 1258 1.1 riastrad tsk = NULL; 1259 1.1 riastrad goto out_reset; 1260 1.1 riastrad } 1261 1.1 riastrad get_task_struct(tsk); 1262 1.1 riastrad 1263 1.1 riastrad wait_for_completion(&arg.completion); 1264 1.1 riastrad 1265 1.1 riastrad if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1266 1.1 riastrad struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1267 1.1 riastrad 1268 1.1 riastrad pr_err("igt/evict_vma kthread did not wait\n"); 1269 1.1 riastrad intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1270 1.1 riastrad 1271 1.1 riastrad intel_gt_set_wedged(gt); 1272 1.1 riastrad goto out_reset; 1273 1.1 riastrad } 1274 1.1 riastrad 1275 1.1 riastrad out_reset: 1276 1.1 riastrad igt_global_reset_lock(gt); 1277 1.1 riastrad fake_hangcheck(gt, rq->engine->mask); 1278 1.1 riastrad igt_global_reset_unlock(gt); 1279 1.1 riastrad 1280 1.1 riastrad if (tsk) { 1281 1.1 riastrad struct intel_wedge_me w; 1282 1.1 riastrad 1283 1.1 riastrad /* The reset, even indirectly, should take less than 10ms. 
*/ 1284 1.1 riastrad intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 1285 1.1 riastrad err = kthread_stop(tsk); 1286 1.1 riastrad 1287 1.1 riastrad put_task_struct(tsk); 1288 1.1 riastrad } 1289 1.1 riastrad 1290 1.1 riastrad out_rq: 1291 1.1 riastrad i915_request_put(rq); 1292 1.1 riastrad out_obj: 1293 1.1 riastrad i915_gem_object_put(obj); 1294 1.1 riastrad fini: 1295 1.1 riastrad hang_fini(&h); 1296 1.1 riastrad if (intel_gt_is_wedged(gt)) 1297 1.1 riastrad return -EIO; 1298 1.1 riastrad 1299 1.1 riastrad return err; 1300 1.1 riastrad } 1301 1.1 riastrad 1302 1.1 riastrad static int igt_reset_evict_ggtt(void *arg) 1303 1.1 riastrad { 1304 1.1 riastrad struct intel_gt *gt = arg; 1305 1.1 riastrad 1306 1.1 riastrad return __igt_reset_evict_vma(gt, >->ggtt->vm, 1307 1.1 riastrad evict_vma, EXEC_OBJECT_WRITE); 1308 1.1 riastrad } 1309 1.1 riastrad 1310 1.1 riastrad static int igt_reset_evict_ppgtt(void *arg) 1311 1.1 riastrad { 1312 1.1 riastrad struct intel_gt *gt = arg; 1313 1.1 riastrad struct i915_ppgtt *ppgtt; 1314 1.1 riastrad int err; 1315 1.1 riastrad 1316 1.1 riastrad /* aliasing == global gtt locking, covered above */ 1317 1.1 riastrad if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) 1318 1.1 riastrad return 0; 1319 1.1 riastrad 1320 1.1 riastrad ppgtt = i915_ppgtt_create(gt); 1321 1.1 riastrad if (IS_ERR(ppgtt)) 1322 1.1 riastrad return PTR_ERR(ppgtt); 1323 1.1 riastrad 1324 1.1 riastrad err = __igt_reset_evict_vma(gt, &ppgtt->vm, 1325 1.1 riastrad evict_vma, EXEC_OBJECT_WRITE); 1326 1.1 riastrad i915_vm_put(&ppgtt->vm); 1327 1.1 riastrad 1328 1.1 riastrad return err; 1329 1.1 riastrad } 1330 1.1 riastrad 1331 1.1 riastrad static int igt_reset_evict_fence(void *arg) 1332 1.1 riastrad { 1333 1.1 riastrad struct intel_gt *gt = arg; 1334 1.1 riastrad 1335 1.1 riastrad return __igt_reset_evict_vma(gt, >->ggtt->vm, 1336 1.1 riastrad evict_fence, EXEC_OBJECT_NEEDS_FENCE); 1337 1.1 riastrad } 1338 1.1 riastrad 1339 1.1 riastrad static int 
wait_for_others(struct intel_gt *gt, 1340 1.1 riastrad struct intel_engine_cs *exclude) 1341 1.1 riastrad { 1342 1.1 riastrad struct intel_engine_cs *engine; 1343 1.1 riastrad enum intel_engine_id id; 1344 1.1 riastrad 1345 1.1 riastrad for_each_engine(engine, gt, id) { 1346 1.1 riastrad if (engine == exclude) 1347 1.1 riastrad continue; 1348 1.1 riastrad 1349 1.1 riastrad if (!wait_for_idle(engine)) 1350 1.1 riastrad return -EIO; 1351 1.1 riastrad } 1352 1.1 riastrad 1353 1.1 riastrad return 0; 1354 1.1 riastrad } 1355 1.1 riastrad 1356 1.1 riastrad static int igt_reset_queue(void *arg) 1357 1.1 riastrad { 1358 1.1 riastrad struct intel_gt *gt = arg; 1359 1.1 riastrad struct i915_gpu_error *global = >->i915->gpu_error; 1360 1.1 riastrad struct intel_engine_cs *engine; 1361 1.1 riastrad enum intel_engine_id id; 1362 1.1 riastrad struct hang h; 1363 1.1 riastrad int err; 1364 1.1 riastrad 1365 1.1 riastrad /* Check that we replay pending requests following a hang */ 1366 1.1 riastrad 1367 1.1 riastrad igt_global_reset_lock(gt); 1368 1.1 riastrad 1369 1.1 riastrad err = hang_init(&h, gt); 1370 1.1 riastrad if (err) 1371 1.1 riastrad goto unlock; 1372 1.1 riastrad 1373 1.1 riastrad for_each_engine(engine, gt, id) { 1374 1.1 riastrad struct i915_request *prev; 1375 1.1 riastrad IGT_TIMEOUT(end_time); 1376 1.1 riastrad unsigned int count; 1377 1.1 riastrad 1378 1.1 riastrad if (!intel_engine_can_store_dword(engine)) 1379 1.1 riastrad continue; 1380 1.1 riastrad 1381 1.1 riastrad prev = hang_create_request(&h, engine); 1382 1.1 riastrad if (IS_ERR(prev)) { 1383 1.1 riastrad err = PTR_ERR(prev); 1384 1.1 riastrad goto fini; 1385 1.1 riastrad } 1386 1.1 riastrad 1387 1.1 riastrad i915_request_get(prev); 1388 1.1 riastrad i915_request_add(prev); 1389 1.1 riastrad 1390 1.1 riastrad count = 0; 1391 1.1 riastrad do { 1392 1.1 riastrad struct i915_request *rq; 1393 1.1 riastrad unsigned int reset_count; 1394 1.1 riastrad 1395 1.1 riastrad rq = hang_create_request(&h, engine); 
1396 1.1 riastrad if (IS_ERR(rq)) { 1397 1.1 riastrad err = PTR_ERR(rq); 1398 1.1 riastrad goto fini; 1399 1.1 riastrad } 1400 1.1 riastrad 1401 1.1 riastrad i915_request_get(rq); 1402 1.1 riastrad i915_request_add(rq); 1403 1.1 riastrad 1404 1.1 riastrad /* 1405 1.1 riastrad * XXX We don't handle resetting the kernel context 1406 1.1 riastrad * very well. If we trigger a device reset twice in 1407 1.1 riastrad * quick succession while the kernel context is 1408 1.1 riastrad * executing, we may end up skipping the breadcrumb. 1409 1.1 riastrad * This is really only a problem for the selftest as 1410 1.1 riastrad * normally there is a large interlude between resets 1411 1.1 riastrad * (hangcheck), or we focus on resetting just one 1412 1.1 riastrad * engine and so avoid repeatedly resetting innocents. 1413 1.1 riastrad */ 1414 1.1 riastrad err = wait_for_others(gt, engine); 1415 1.1 riastrad if (err) { 1416 1.1 riastrad pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 1417 1.1 riastrad __func__, engine->name); 1418 1.1 riastrad i915_request_put(rq); 1419 1.1 riastrad i915_request_put(prev); 1420 1.1 riastrad 1421 1.1 riastrad GEM_TRACE_DUMP(); 1422 1.1 riastrad intel_gt_set_wedged(gt); 1423 1.1 riastrad goto fini; 1424 1.1 riastrad } 1425 1.1 riastrad 1426 1.1 riastrad if (!wait_until_running(&h, prev)) { 1427 1.1 riastrad struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1428 1.1 riastrad 1429 1.1 riastrad pr_err("%s(%s): Failed to start request %llx, at %x\n", 1430 1.1 riastrad __func__, engine->name, 1431 1.1 riastrad prev->fence.seqno, hws_seqno(&h, prev)); 1432 1.1 riastrad intel_engine_dump(engine, &p, 1433 1.1 riastrad "%s\n", engine->name); 1434 1.1 riastrad 1435 1.1 riastrad i915_request_put(rq); 1436 1.1 riastrad i915_request_put(prev); 1437 1.1 riastrad 1438 1.1 riastrad intel_gt_set_wedged(gt); 1439 1.1 riastrad 1440 1.1 riastrad err = -EIO; 1441 1.1 riastrad goto fini; 1442 1.1 riastrad } 1443 1.1 riastrad 1444 1.1 
riastrad reset_count = fake_hangcheck(gt, BIT(id)); 1445 1.1 riastrad 1446 1.1 riastrad if (prev->fence.error != -EIO) { 1447 1.1 riastrad pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1448 1.1 riastrad prev->fence.error); 1449 1.1 riastrad i915_request_put(rq); 1450 1.1 riastrad i915_request_put(prev); 1451 1.1 riastrad err = -EINVAL; 1452 1.1 riastrad goto fini; 1453 1.1 riastrad } 1454 1.1 riastrad 1455 1.1 riastrad if (rq->fence.error) { 1456 1.1 riastrad pr_err("Fence error status not zero [%d] after unrelated reset\n", 1457 1.1 riastrad rq->fence.error); 1458 1.1 riastrad i915_request_put(rq); 1459 1.1 riastrad i915_request_put(prev); 1460 1.1 riastrad err = -EINVAL; 1461 1.1 riastrad goto fini; 1462 1.1 riastrad } 1463 1.1 riastrad 1464 1.1 riastrad if (i915_reset_count(global) == reset_count) { 1465 1.1 riastrad pr_err("No GPU reset recorded!\n"); 1466 1.1 riastrad i915_request_put(rq); 1467 1.1 riastrad i915_request_put(prev); 1468 1.1 riastrad err = -EINVAL; 1469 1.1 riastrad goto fini; 1470 1.1 riastrad } 1471 1.1 riastrad 1472 1.1 riastrad i915_request_put(prev); 1473 1.1 riastrad prev = rq; 1474 1.1 riastrad count++; 1475 1.1 riastrad } while (time_before(jiffies, end_time)); 1476 1.1 riastrad pr_info("%s: Completed %d resets\n", engine->name, count); 1477 1.1 riastrad 1478 1.1 riastrad *h.batch = MI_BATCH_BUFFER_END; 1479 1.1 riastrad intel_gt_chipset_flush(engine->gt); 1480 1.1 riastrad 1481 1.1 riastrad i915_request_put(prev); 1482 1.1 riastrad 1483 1.1 riastrad err = igt_flush_test(gt->i915); 1484 1.1 riastrad if (err) 1485 1.1 riastrad break; 1486 1.1 riastrad } 1487 1.1 riastrad 1488 1.1 riastrad fini: 1489 1.1 riastrad hang_fini(&h); 1490 1.1 riastrad unlock: 1491 1.1 riastrad igt_global_reset_unlock(gt); 1492 1.1 riastrad 1493 1.1 riastrad if (intel_gt_is_wedged(gt)) 1494 1.1 riastrad return -EIO; 1495 1.1 riastrad 1496 1.1 riastrad return err; 1497 1.1 riastrad } 1498 1.1 riastrad 1499 1.1 riastrad static int 
igt_handle_error(void *arg) 1500 1.1 riastrad { 1501 1.1 riastrad struct intel_gt *gt = arg; 1502 1.1 riastrad struct i915_gpu_error *global = >->i915->gpu_error; 1503 1.1 riastrad struct intel_engine_cs *engine = gt->engine[RCS0]; 1504 1.1 riastrad struct hang h; 1505 1.1 riastrad struct i915_request *rq; 1506 1.1 riastrad struct i915_gpu_coredump *error; 1507 1.1 riastrad int err; 1508 1.1 riastrad 1509 1.1 riastrad /* Check that we can issue a global GPU and engine reset */ 1510 1.1 riastrad 1511 1.1 riastrad if (!intel_has_reset_engine(gt)) 1512 1.1 riastrad return 0; 1513 1.1 riastrad 1514 1.1 riastrad if (!engine || !intel_engine_can_store_dword(engine)) 1515 1.1 riastrad return 0; 1516 1.1 riastrad 1517 1.1 riastrad err = hang_init(&h, gt); 1518 1.1 riastrad if (err) 1519 1.1 riastrad return err; 1520 1.1 riastrad 1521 1.1 riastrad rq = hang_create_request(&h, engine); 1522 1.1 riastrad if (IS_ERR(rq)) { 1523 1.1 riastrad err = PTR_ERR(rq); 1524 1.1 riastrad goto err_fini; 1525 1.1 riastrad } 1526 1.1 riastrad 1527 1.1 riastrad i915_request_get(rq); 1528 1.1 riastrad i915_request_add(rq); 1529 1.1 riastrad 1530 1.1 riastrad if (!wait_until_running(&h, rq)) { 1531 1.1 riastrad struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1532 1.1 riastrad 1533 1.1 riastrad pr_err("%s: Failed to start request %llx, at %x\n", 1534 1.1 riastrad __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1535 1.1 riastrad intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1536 1.1 riastrad 1537 1.1 riastrad intel_gt_set_wedged(gt); 1538 1.1 riastrad 1539 1.1 riastrad err = -EIO; 1540 1.1 riastrad goto err_request; 1541 1.1 riastrad } 1542 1.1 riastrad 1543 1.1 riastrad /* Temporarily disable error capture */ 1544 1.1 riastrad error = xchg(&global->first_error, (void *)-1); 1545 1.1 riastrad 1546 1.1 riastrad intel_gt_handle_error(gt, engine->mask, 0, NULL); 1547 1.1 riastrad 1548 1.1 riastrad xchg(&global->first_error, error); 1549 1.1 riastrad 1550 1.1 riastrad if 
(rq->fence.error != -EIO) { 1551 1.1 riastrad pr_err("Guilty request not identified!\n"); 1552 1.1 riastrad err = -EINVAL; 1553 1.1 riastrad goto err_request; 1554 1.1 riastrad } 1555 1.1 riastrad 1556 1.1 riastrad err_request: 1557 1.1 riastrad i915_request_put(rq); 1558 1.1 riastrad err_fini: 1559 1.1 riastrad hang_fini(&h); 1560 1.1 riastrad return err; 1561 1.1 riastrad } 1562 1.1 riastrad 1563 1.1 riastrad static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, 1564 1.1 riastrad const struct igt_atomic_section *p, 1565 1.1 riastrad const char *mode) 1566 1.1 riastrad { 1567 1.1 riastrad struct tasklet_struct * const t = &engine->execlists.tasklet; 1568 1.1 riastrad int err; 1569 1.1 riastrad 1570 1.1 riastrad GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", 1571 1.1 riastrad engine->name, mode, p->name); 1572 1.1 riastrad 1573 1.1 riastrad tasklet_disable(t); 1574 1.1 riastrad p->critical_section_begin(); 1575 1.1 riastrad 1576 1.1 riastrad err = intel_engine_reset(engine, NULL); 1577 1.1 riastrad 1578 1.1 riastrad p->critical_section_end(); 1579 1.1 riastrad tasklet_enable(t); 1580 1.1 riastrad 1581 1.1 riastrad if (err) 1582 1.1 riastrad pr_err("i915_reset_engine(%s:%s) failed under %s\n", 1583 1.1 riastrad engine->name, mode, p->name); 1584 1.1 riastrad 1585 1.1 riastrad return err; 1586 1.1 riastrad } 1587 1.1 riastrad 1588 1.1 riastrad static int igt_atomic_reset_engine(struct intel_engine_cs *engine, 1589 1.1 riastrad const struct igt_atomic_section *p) 1590 1.1 riastrad { 1591 1.1 riastrad struct i915_request *rq; 1592 1.1 riastrad struct hang h; 1593 1.1 riastrad int err; 1594 1.1 riastrad 1595 1.1 riastrad err = __igt_atomic_reset_engine(engine, p, "idle"); 1596 1.1 riastrad if (err) 1597 1.1 riastrad return err; 1598 1.1 riastrad 1599 1.1 riastrad err = hang_init(&h, engine->gt); 1600 1.1 riastrad if (err) 1601 1.1 riastrad return err; 1602 1.1 riastrad 1603 1.1 riastrad rq = hang_create_request(&h, engine); 1604 1.1 riastrad if 
(IS_ERR(rq)) { 1605 1.1 riastrad err = PTR_ERR(rq); 1606 1.1 riastrad goto out; 1607 1.1 riastrad } 1608 1.1 riastrad 1609 1.1 riastrad i915_request_get(rq); 1610 1.1 riastrad i915_request_add(rq); 1611 1.1 riastrad 1612 1.1 riastrad if (wait_until_running(&h, rq)) { 1613 1.1 riastrad err = __igt_atomic_reset_engine(engine, p, "active"); 1614 1.1 riastrad } else { 1615 1.1 riastrad pr_err("%s(%s): Failed to start request %llx, at %x\n", 1616 1.1 riastrad __func__, engine->name, 1617 1.1 riastrad rq->fence.seqno, hws_seqno(&h, rq)); 1618 1.1 riastrad intel_gt_set_wedged(engine->gt); 1619 1.1 riastrad err = -EIO; 1620 1.1 riastrad } 1621 1.1 riastrad 1622 1.1 riastrad if (err == 0) { 1623 1.1 riastrad struct intel_wedge_me w; 1624 1.1 riastrad 1625 1.1 riastrad intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) 1626 1.1 riastrad i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); 1627 1.1 riastrad if (intel_gt_is_wedged(engine->gt)) 1628 1.1 riastrad err = -EIO; 1629 1.1 riastrad } 1630 1.1 riastrad 1631 1.1 riastrad i915_request_put(rq); 1632 1.1 riastrad out: 1633 1.1 riastrad hang_fini(&h); 1634 1.1 riastrad return err; 1635 1.1 riastrad } 1636 1.1 riastrad 1637 1.1 riastrad static int igt_reset_engines_atomic(void *arg) 1638 1.1 riastrad { 1639 1.1 riastrad struct intel_gt *gt = arg; 1640 1.1 riastrad const typeof(*igt_atomic_phases) *p; 1641 1.1 riastrad int err = 0; 1642 1.1 riastrad 1643 1.1 riastrad /* Check that the engines resets are usable from atomic context */ 1644 1.1 riastrad 1645 1.1 riastrad if (!intel_has_reset_engine(gt)) 1646 1.1 riastrad return 0; 1647 1.1 riastrad 1648 1.1 riastrad if (USES_GUC_SUBMISSION(gt->i915)) 1649 1.1 riastrad return 0; 1650 1.1 riastrad 1651 1.1 riastrad igt_global_reset_lock(gt); 1652 1.1 riastrad 1653 1.1 riastrad /* Flush any requests before we get started and check basics */ 1654 1.1 riastrad if (!igt_force_reset(gt)) 1655 1.1 riastrad goto unlock; 1656 1.1 riastrad 1657 1.1 riastrad for (p = igt_atomic_phases; 
p->name; p++) { 1658 1.1 riastrad struct intel_engine_cs *engine; 1659 1.1 riastrad enum intel_engine_id id; 1660 1.1 riastrad 1661 1.1 riastrad for_each_engine(engine, gt, id) { 1662 1.1 riastrad err = igt_atomic_reset_engine(engine, p); 1663 1.1 riastrad if (err) 1664 1.1 riastrad goto out; 1665 1.1 riastrad } 1666 1.1 riastrad } 1667 1.1 riastrad 1668 1.1 riastrad out: 1669 1.1 riastrad /* As we poke around the guts, do a full reset before continuing. */ 1670 1.1 riastrad igt_force_reset(gt); 1671 1.1 riastrad unlock: 1672 1.1 riastrad igt_global_reset_unlock(gt); 1673 1.1 riastrad 1674 1.1 riastrad return err; 1675 1.1 riastrad } 1676 1.1 riastrad 1677 1.1 riastrad int intel_hangcheck_live_selftests(struct drm_i915_private *i915) 1678 1.1 riastrad { 1679 1.1 riastrad static const struct i915_subtest tests[] = { 1680 1.1 riastrad SUBTEST(igt_hang_sanitycheck), 1681 1.1 riastrad SUBTEST(igt_reset_nop), 1682 1.1 riastrad SUBTEST(igt_reset_nop_engine), 1683 1.1 riastrad SUBTEST(igt_reset_idle_engine), 1684 1.1 riastrad SUBTEST(igt_reset_active_engine), 1685 1.1 riastrad SUBTEST(igt_reset_engines), 1686 1.1 riastrad SUBTEST(igt_reset_engines_atomic), 1687 1.1 riastrad SUBTEST(igt_reset_queue), 1688 1.1 riastrad SUBTEST(igt_reset_wait), 1689 1.1 riastrad SUBTEST(igt_reset_evict_ggtt), 1690 1.1 riastrad SUBTEST(igt_reset_evict_ppgtt), 1691 1.1 riastrad SUBTEST(igt_reset_evict_fence), 1692 1.1 riastrad SUBTEST(igt_handle_error), 1693 1.1 riastrad }; 1694 1.1 riastrad struct intel_gt *gt = &i915->gt; 1695 1.1 riastrad intel_wakeref_t wakeref; 1696 1.1 riastrad int err; 1697 1.1 riastrad 1698 1.1 riastrad if (!intel_has_gpu_reset(gt)) 1699 1.1 riastrad return 0; 1700 1.1 riastrad 1701 1.1 riastrad if (intel_gt_is_wedged(gt)) 1702 1.1 riastrad return -EIO; /* we're long past hope of a successful reset */ 1703 1.1 riastrad 1704 1.1 riastrad wakeref = intel_runtime_pm_get(gt->uncore->rpm); 1705 1.1 riastrad 1706 1.1 riastrad err = intel_gt_live_subtests(tests, gt); 1707 1.1 
riastrad 1708 1.1 riastrad intel_runtime_pm_put(gt->uncore->rpm, wakeref); 1709 1.1 riastrad 1710 1.1 riastrad return err; 1711 1.1 riastrad } 1712