/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_gt.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

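/*
 * Shared state for the hang tests: a kernel context, a page of hardware
 * status words (one seqno slot per fence context) and the batch buffer
 * object used to submit spinning requests.
 */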
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

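/*
 * Allocate the context, HWS page and batch object for a hang test, and
 * map both objects so the CPU can write seqnos and batch commands directly.
 */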
static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

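/* Address of the per-context seqno slot within the HWS page */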
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

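/*
 * Build a request whose batch writes its seqno to the HWS page and then
 * spins forever, looping back on itself via MI_BATCH_BUFFER_START, until
 * the batch is rewritten or the engine is reset. A fresh batch object is
 * allocated for each request.
 */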
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915, I915_WAIT_LOCKED);
}

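/*
 * Poll the HWS seqno (first a short busy-wait, then a slower sleeping wait)
 * to confirm the hanging batch has actually started executing.
 */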
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	reset_count = i915_reset_count(global);
	count = 0;
	do {
		mutex_lock(&gt->i915->drm.struct_mutex);

		for_each_engine(engine, gt->i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&gt->i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			mutex_unlock(&gt->i915->drm.struct_mutex);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

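/*
 * Kthread body: keep a small ring of requests in flight on one engine
 * (optionally with randomised priorities) while another engine is being
 * reset, to check that the background workload is unaffected.
 */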
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt->i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt->i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt->i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		mutex_lock(&gt->i915->drm.struct_mutex);
		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

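/*
 * Simulate hangcheck firing: perform the reset directly and return the
 * global reset count beforehand so callers can verify it ticked over.
 */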
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

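/*
 * Thread bodies used by the evict tests: try to evict the target vma
 * (or update its fence register) while it is still busy on the GPU.
 */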
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

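/*
 * Bind an object into @vm, mark it active on a hanging request, and then
 * check that a concurrent attempt to evict (or re-fence) it blocks until
 * a reset terminates the hang.
 */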
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->vm) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(gt, ctx->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(gt->i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

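/* Wait for every engine other than @exclude to become idle */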
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt->i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&gt->i915->drm.struct_mutex);

	err = hang_init(&h, gt);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&gt->i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	mutex_lock(&gt->i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

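/*
 * Perform an engine reset from within the atomic (non-sleeping) context
 * provided by @p, with the execlists tasklet disabled, checking that the
 * engine reset can be performed without sleeping.
 */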
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);
	mutex_lock(&gt->i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt->i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(gt->i915))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&gt->hangcheck.work); /* flush param */

	err = intel_gt_live_subtests(tests, gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);

	return err;
}