   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2017-2018 Intel Corporation
   4 */
   5
   6#include <linux/prime_numbers.h>
   7
   8#include "intel_context.h"
   9#include "intel_engine_heartbeat.h"
  10#include "intel_engine_pm.h"
  11#include "intel_gpu_commands.h"
  12#include "intel_gt.h"
  13#include "intel_gt_requests.h"
  14#include "intel_ring.h"
  15#include "selftest_engine_heartbeat.h"
  16
  17#include "../selftests/i915_random.h"
  18#include "../i915_selftest.h"
  19
  20#include "selftests/igt_flush_test.h"
  21#include "selftests/lib_sw_fence.h"
  22#include "selftests/mock_gem_device.h"
  23#include "selftests/mock_timeline.h"
  24
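    /*
     * Helpers to locate the page backing a timeline's HWSP and to compute
     * which seqno slot (cacheline) within it the timeline was assigned;
     * the mock freelist test below keys a radix tree by this value to
     * catch duplicate allocations.
     */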
  25static struct page *hwsp_page(struct intel_timeline *tl)
  26{
  27	struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;
  28
  29	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
  30	return sg_page(obj->mm.pages->sgl);
  31}
  32
  33static unsigned long hwsp_cacheline(struct intel_timeline *tl)
  34{
  35	unsigned long address = (unsigned long)page_address(hwsp_page(tl));
  36
  37	return (address + offset_in_page(tl->hwsp_offset)) / TIMELINE_SEQNO_BYTES;
  38}
  39
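    /*
     * Pin the timeline's HWSP for the duration of a test using the ww
     * locking dance: lock the backing object, pin, and on -EDEADLK back
     * off and retry until the pin either succeeds or fails for real.
     */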
  40static int selftest_tl_pin(struct intel_timeline *tl)
  41{
  42	struct i915_gem_ww_ctx ww;
  43	int err;
  44
  45	i915_gem_ww_ctx_init(&ww, false);
  46retry:
  47	err = i915_gem_object_lock(tl->hwsp_ggtt->obj, &ww);
  48	if (!err)
  49		err = intel_timeline_pin(tl, &ww);
  50
  51	if (err == -EDEADLK) {
  52		err = i915_gem_ww_ctx_backoff(&ww);
  53		if (!err)
  54			goto retry;
  55	}
  56	i915_gem_ww_ctx_fini(&ww);
  57	return err;
  58}
  59
  60/* Only half of the seqnos are usable, see __intel_timeline_get_seqno() */
  61#define CACHELINES_PER_PAGE (PAGE_SIZE / TIMELINE_SEQNO_BYTES / 2)
  62
  63struct mock_hwsp_freelist {
  64	struct intel_gt *gt;
  65	struct radix_tree_root cachelines;
  66	struct intel_timeline **history;
  67	unsigned long count, max;
  68	struct rnd_state prng;
  69};
  70
  71enum {
  72	SHUFFLE = BIT(0),
  73};
  74
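    /*
     * Swap a timeline into the history slot at @idx; if the slot was
     * already occupied, remove the old entry from the cacheline tree and
     * drop its pin and reference.
     */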
  75static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
  76			       unsigned int idx,
  77			       struct intel_timeline *tl)
  78{
  79	tl = xchg(&state->history[idx], tl);
  80	if (tl) {
  81		radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
  82		intel_timeline_unpin(tl);
  83		intel_timeline_put(tl);
  84	}
  85}
  86
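    /*
     * Create @count pinned timelines, inserting each HWSP cacheline into
     * the radix tree so that handing out the same slot twice is reported
     * immediately, then release a random number of them again (optionally
     * shuffling the history first so the frees are not strictly FIFO).
     */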
  87static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
  88				unsigned int count,
  89				unsigned int flags)
  90{
  91	struct intel_timeline *tl;
  92	unsigned int idx;
  93
  94	while (count--) {
  95		unsigned long cacheline;
  96		int err;
  97
  98		tl = intel_timeline_create(state->gt);
  99		if (IS_ERR(tl))
 100			return PTR_ERR(tl);
 101
 102		err = selftest_tl_pin(tl);
 103		if (err) {
 104			intel_timeline_put(tl);
 105			return err;
 106		}
 107
 108		cacheline = hwsp_cacheline(tl);
 109		err = radix_tree_insert(&state->cachelines, cacheline, tl);
 110		if (err) {
 111			if (err == -EEXIST) {
 112				pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
 113				       cacheline);
 114			}
 115			intel_timeline_unpin(tl);
 116			intel_timeline_put(tl);
 117			return err;
 118		}
 119
 120		idx = state->count++ % state->max;
 121		__mock_hwsp_record(state, idx, tl);
 122	}
 123
 124	if (flags & SHUFFLE)
 125		i915_prandom_shuffle(state->history,
 126				     sizeof(*state->history),
 127				     min(state->count, state->max),
 128				     &state->prng);
 129
 130	count = i915_prandom_u32_max_state(min(state->count, state->max),
 131					   &state->prng);
 132	while (count--) {
 133		idx = --state->count % state->max;
 134		__mock_hwsp_record(state, idx, NULL);
 135	}
 136
 137	return 0;
 138}
 139
 140static int mock_hwsp_freelist(void *arg)
 141{
 142	struct mock_hwsp_freelist state;
 143	struct drm_i915_private *i915;
 144	const struct {
 145		const char *name;
 146		unsigned int flags;
 147	} phases[] = {
 148		{ "linear", 0 },
 149		{ "shuffled", SHUFFLE },
 150		{ },
 151	}, *p;
 152	unsigned int na;
 153	int err = 0;
 154
 155	i915 = mock_gem_device();
 156	if (!i915)
 157		return -ENOMEM;
 158
 159	INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
 160	state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);
 161
 162	state.gt = &i915->gt;
 163
 164	/*
 165	 * Create a bunch of timelines and check that their HWSPs do not overlap.
 166	 * Free some, and try again.
 167	 */
 168
 169	state.max = PAGE_SIZE / sizeof(*state.history);
 170	state.count = 0;
 171	state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
 172	if (!state.history) {
 173		err = -ENOMEM;
 174		goto err_put;
 175	}
 176
 177	for (p = phases; p->name; p++) {
 178		pr_debug("%s(%s)\n", __func__, p->name);
 179		for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
 180			err = __mock_hwsp_timeline(&state, na, p->flags);
 181			if (err)
 182				goto out;
 183		}
 184	}
 185
 186out:
 187	for (na = 0; na < state.max; na++)
 188		__mock_hwsp_record(&state, na, NULL);
 189	kfree(state.history);
 190err_put:
 191	mock_destroy_device(i915);
 192	return err;
 193}
 194
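    /*
     * One step of the igt_sync table: ask whether the timeline already
     * treats (ctx, seqno) as signaled and compare against ->expected;
     * if ->set, record the seqno as the new value for that context.
     */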
 195struct __igt_sync {
 196	const char *name;
 197	u32 seqno;
 198	bool expected;
 199	bool set;
 200};
 201
 202static int __igt_sync(struct intel_timeline *tl,
 203		      u64 ctx,
 204		      const struct __igt_sync *p,
 205		      const char *name)
 206{
 207	int ret;
 208
 209	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
 210		pr_err("%s: %s(ctx=%llu, seqno=%u) expected sync_is_later() to return %s, but it did not\n",
 211		       name, p->name, ctx, p->seqno, yesno(p->expected));
 212		return -EINVAL;
 213	}
 214
 215	if (p->set) {
 216		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
 217		if (ret)
 218			return ret;
 219	}
 220
 221	return 0;
 222}
 223
 224static int igt_sync(void *arg)
 225{
 226	const struct __igt_sync pass[] = {
 227		{ "unset", 0, false, false },
 228		{ "new", 0, false, true },
 229		{ "0a", 0, true, true },
 230		{ "1a", 1, false, true },
 231		{ "1b", 1, true, true },
 232		{ "0b", 0, true, false },
 233		{ "2a", 2, false, true },
 234		{ "4", 4, false, true },
 235		{ "INT_MAX", INT_MAX, false, true },
 236		{ "INT_MAX-1", INT_MAX-1, true, false },
 237		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
 238		{ "INT_MAX", INT_MAX, true, false },
 239		{ "UINT_MAX", UINT_MAX, false, true },
 240		{ "wrap", 0, false, true },
 241		{ "unwrap", UINT_MAX, true, false },
 242		{},
 243	}, *p;
 244	struct intel_timeline tl;
 245	int order, offset;
 246	int ret = -ENODEV;
 247
 248	mock_timeline_init(&tl, 0);
 249	for (p = pass; p->name; p++) {
 250		for (order = 1; order < 64; order++) {
 251			for (offset = -1; offset <= (order > 1); offset++) {
 252				u64 ctx = BIT_ULL(order) + offset;
 253
 254				ret = __igt_sync(&tl, ctx, p, "1");
 255				if (ret)
 256					goto out;
 257			}
 258		}
 259	}
 260	mock_timeline_fini(&tl);
 261
 262	mock_timeline_init(&tl, 0);
 263	for (order = 1; order < 64; order++) {
 264		for (offset = -1; offset <= (order > 1); offset++) {
 265			u64 ctx = BIT_ULL(order) + offset;
 266
 267			for (p = pass; p->name; p++) {
 268				ret = __igt_sync(&tl, ctx, p, "2");
 269				if (ret)
 270					goto out;
 271			}
 272		}
 273	}
 274
 275out:
 276	mock_timeline_fini(&tl);
 277	return ret;
 278}
 279
 280static unsigned int random_engine(struct rnd_state *rnd)
 281{
 282	return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
 283}
 284
 285static int bench_sync(void *arg)
 286{
 287	struct rnd_state prng;
 288	struct intel_timeline tl;
 289	unsigned long end_time, count;
 290	u64 prng32_1M;
 291	ktime_t kt;
 292	int order, last_order;
 293
 294	mock_timeline_init(&tl, 0);
 295
 296	/* Lookups from cache are very fast and so the random number generation
 297	 * and the loop itself become a significant factor in the per-iteration
 298	 * timings. We compensate by measuring the overhead of the prng and
 299	 * subtracting it from the reported results.
 300	 */
 301	prandom_seed_state(&prng, i915_selftest.random_seed);
 302	count = 0;
 303	kt = ktime_get();
 304	end_time = jiffies + HZ/10;
 305	do {
 306		u32 x;
 307
 308		/* Make sure the compiler doesn't optimise away the prng call */
 309		WRITE_ONCE(x, prandom_u32_state(&prng));
 310
 311		count++;
 312	} while (!time_after(jiffies, end_time));
 313	kt = ktime_sub(ktime_get(), kt);
 314	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
 315		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
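    	/*
    	 * prng32_1M is the measured cost of one prandom_u32_state() call
    	 * in ns, scaled by 2^20; the benchmarks below subtract it back
    	 * out as (count * prng32_1M * draws) >> 20, with each u64 draw
    	 * counted as two u32 calls.
    	 */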
 316	prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);
 317
 318	/* Benchmark (only) setting random context ids */
 319	prandom_seed_state(&prng, i915_selftest.random_seed);
 320	count = 0;
 321	kt = ktime_get();
 322	end_time = jiffies + HZ/10;
 323	do {
 324		u64 id = i915_prandom_u64_state(&prng);
 325
 326		__intel_timeline_sync_set(&tl, id, 0);
 327		count++;
 328	} while (!time_after(jiffies, end_time));
 329	kt = ktime_sub(ktime_get(), kt);
 330	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
 331	pr_info("%s: %lu random insertions, %lluns/insert\n",
 332		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
 333
 334	/* Benchmark looking up the exact same context ids as we just set */
 335	prandom_seed_state(&prng, i915_selftest.random_seed);
 336	end_time = count;
 337	kt = ktime_get();
 338	while (end_time--) {
 339		u64 id = i915_prandom_u64_state(&prng);
 340
 341		if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
 342			mock_timeline_fini(&tl);
 343			pr_err("Lookup of %llu failed\n", id);
 344			return -EINVAL;
 345		}
 346	}
 347	kt = ktime_sub(ktime_get(), kt);
 348	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
 349	pr_info("%s: %lu random lookups, %lluns/lookup\n",
 350		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
 351
 352	mock_timeline_fini(&tl);
 353	cond_resched();
 354
 355	mock_timeline_init(&tl, 0);
 356
 357	/* Benchmark setting the first N (in order) contexts */
 358	count = 0;
 359	kt = ktime_get();
 360	end_time = jiffies + HZ/10;
 361	do {
 362		__intel_timeline_sync_set(&tl, count++, 0);
 363	} while (!time_after(jiffies, end_time));
 364	kt = ktime_sub(ktime_get(), kt);
 365	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
 366		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
 367
 368	/* Benchmark looking up the exact same context ids as we just set */
 369	end_time = count;
 370	kt = ktime_get();
 371	while (end_time--) {
 372		if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
 373			pr_err("Lookup of %lu failed\n", end_time);
 374			mock_timeline_fini(&tl);
 375			return -EINVAL;
 376		}
 377	}
 378	kt = ktime_sub(ktime_get(), kt);
 379	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
 380		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
 381
 382	mock_timeline_fini(&tl);
 383	cond_resched();
 384
 385	mock_timeline_init(&tl, 0);
 386
 387	/* Benchmark searching for a random context id and maybe changing it */
 388	prandom_seed_state(&prng, i915_selftest.random_seed);
 389	count = 0;
 390	kt = ktime_get();
 391	end_time = jiffies + HZ/10;
 392	do {
 393		u32 id = random_engine(&prng);
 394		u32 seqno = prandom_u32_state(&prng);
 395
 396		if (!__intel_timeline_sync_is_later(&tl, id, seqno))
 397			__intel_timeline_sync_set(&tl, id, seqno);
 398
 399		count++;
 400	} while (!time_after(jiffies, end_time));
 401	kt = ktime_sub(ktime_get(), kt);
 402	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
 403	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
 404		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
 405	mock_timeline_fini(&tl);
 406	cond_resched();
 407
 408	/* Benchmark searching for a known context id and changing the seqno */
 409	for (last_order = 1, order = 1; order < 32;
 410	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
 411		unsigned int mask = BIT(order) - 1;
 412
 413		mock_timeline_init(&tl, 0);
 414
 415		count = 0;
 416		kt = ktime_get();
 417		end_time = jiffies + HZ/10;
 418		do {
 419			/* Without assuming too many details of the underlying
 420			 * implementation, try to identify its phase-changes
 421			 * (if any)!
 422			 */
 423			u64 id = (u64)(count & mask) << order;
 424
 425			__intel_timeline_sync_is_later(&tl, id, 0);
 426			__intel_timeline_sync_set(&tl, id, 0);
 427
 428			count++;
 429		} while (!time_after(jiffies, end_time));
 430		kt = ktime_sub(ktime_get(), kt);
 431		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
 432			__func__, count, order,
 433			(long long)div64_ul(ktime_to_ns(kt), count));
 434		mock_timeline_fini(&tl);
 435		cond_resched();
 436	}
 437
 438	return 0;
 439}
 440
 441int intel_timeline_mock_selftests(void)
 442{
 443	static const struct i915_subtest tests[] = {
 444		SUBTEST(mock_hwsp_freelist),
 445		SUBTEST(igt_sync),
 446		SUBTEST(bench_sync),
 447	};
 448
 449	return i915_subtests(tests, NULL);
 450}
 451
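    /*
     * Emit a single dword store of @value to the GGTT address @addr,
     * using the MI_STORE_DWORD_IMM layout appropriate to the engine's
     * GPU generation.
     */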
 452static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
 453{
 454	u32 *cs;
 455
 456	cs = intel_ring_begin(rq, 4);
 457	if (IS_ERR(cs))
 458		return PTR_ERR(cs);
 459
 460	if (GRAPHICS_VER(rq->engine->i915) >= 8) {
 461		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
 462		*cs++ = addr;
 463		*cs++ = 0;
 464		*cs++ = value;
 465	} else if (GRAPHICS_VER(rq->engine->i915) >= 4) {
 466		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
 467		*cs++ = 0;
 468		*cs++ = addr;
 469		*cs++ = value;
 470	} else {
 471		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
 472		*cs++ = addr;
 473		*cs++ = value;
 474		*cs++ = MI_NOOP;
 475	}
 476
 477	intel_ring_advance(rq, cs);
 478
 479	return 0;
 480}
 481
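    /*
     * Pin the timeline, check that its HWSP still reads back the expected
     * seqno, then submit a kernel request that stores @value into the
     * timeline's HWSP slot.
     */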
 482static struct i915_request *
 483checked_tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
 484{
 485	struct i915_request *rq;
 486	int err;
 487
 488	err = selftest_tl_pin(tl);
 489	if (err) {
 490		rq = ERR_PTR(err);
 491		goto out;
 492	}
 493
 494	if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) {
 495		pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
 496		       *tl->hwsp_seqno, tl->seqno);
 497		intel_timeline_unpin(tl);
 498		return ERR_PTR(-EINVAL);
 499	}
 500
 501	rq = intel_engine_create_kernel_request(engine);
 502	if (IS_ERR(rq))
 503		goto out_unpin;
 504
 505	i915_request_get(rq);
 506
 507	err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
 508	i915_request_add(rq);
 509	if (err) {
 510		i915_request_put(rq);
 511		rq = ERR_PTR(err);
 512	}
 513
 514out_unpin:
 515	intel_timeline_unpin(tl);
 516out:
 517	if (IS_ERR(rq))
 518		pr_err("Failed to write to timeline!\n");
 519	return rq;
 520}
 521
 522static int live_hwsp_engine(void *arg)
 523{
 524#define NUM_TIMELINES 4096
 525	struct intel_gt *gt = arg;
 526	struct intel_timeline **timelines;
 527	struct intel_engine_cs *engine;
 528	enum intel_engine_id id;
 529	unsigned long count, n;
 530	int err = 0;
 531
 532	/*
 533	 * Create a bunch of timelines and check we can write
 534	 * independently to each of their breadcrumb slots.
 535	 */
 536
 537	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
 538				   sizeof(*timelines),
 539				   GFP_KERNEL);
 540	if (!timelines)
 541		return -ENOMEM;
 542
 543	count = 0;
 544	for_each_engine(engine, gt, id) {
 545		if (!intel_engine_can_store_dword(engine))
 546			continue;
 547
 548		intel_engine_pm_get(engine);
 549
 550		for (n = 0; n < NUM_TIMELINES; n++) {
 551			struct intel_timeline *tl;
 552			struct i915_request *rq;
 553
 554			tl = intel_timeline_create(gt);
 555			if (IS_ERR(tl)) {
 556				err = PTR_ERR(tl);
 557				break;
 558			}
 559
 560			rq = checked_tl_write(tl, engine, count);
 561			if (IS_ERR(rq)) {
 562				intel_timeline_put(tl);
 563				err = PTR_ERR(rq);
 564				break;
 565			}
 566
 567			timelines[count++] = tl;
 568			i915_request_put(rq);
 569		}
 570
 571		intel_engine_pm_put(engine);
 572		if (err)
 573			break;
 574	}
 575
 576	if (igt_flush_test(gt->i915))
 577		err = -EIO;
 578
 579	for (n = 0; n < count; n++) {
 580		struct intel_timeline *tl = timelines[n];
 581
 582		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
 583			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
 584				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
 585			GEM_TRACE_DUMP();
 586			err = -EINVAL;
 587		}
 588		intel_timeline_put(tl);
 589	}
 590
 591	kvfree(timelines);
 592	return err;
 593#undef NUM_TIMELINES
 594}
 595
 596static int live_hwsp_alternate(void *arg)
 597{
 598#define NUM_TIMELINES 4096
 599	struct intel_gt *gt = arg;
 600	struct intel_timeline **timelines;
 601	struct intel_engine_cs *engine;
 602	enum intel_engine_id id;
 603	unsigned long count, n;
 604	int err = 0;
 605
 606	/*
 607	 * Create a bunch of timelines and check we can write
 608	 * independently to each of their breadcrumb slots with adjacent
 609	 * engines.
 610	 */
 611
 612	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
 613				   sizeof(*timelines),
 614				   GFP_KERNEL);
 615	if (!timelines)
 616		return -ENOMEM;
 617
 618	count = 0;
 619	for (n = 0; n < NUM_TIMELINES; n++) {
 620		for_each_engine(engine, gt, id) {
 621			struct intel_timeline *tl;
 622			struct i915_request *rq;
 623
 624			if (!intel_engine_can_store_dword(engine))
 625				continue;
 626
 627			tl = intel_timeline_create(gt);
 628			if (IS_ERR(tl)) {
 629				err = PTR_ERR(tl);
 630				goto out;
 631			}
 632
 633			intel_engine_pm_get(engine);
 634			rq = checked_tl_write(tl, engine, count);
 635			intel_engine_pm_put(engine);
 636			if (IS_ERR(rq)) {
 637				intel_timeline_put(tl);
 638				err = PTR_ERR(rq);
 639				goto out;
 640			}
 641
 642			timelines[count++] = tl;
 643			i915_request_put(rq);
 644		}
 645	}
 646
 647out:
 648	if (igt_flush_test(gt->i915))
 649		err = -EIO;
 650
 651	for (n = 0; n < count; n++) {
 652		struct intel_timeline *tl = timelines[n];
 653
 654		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
 655			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
 656				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
 657			GEM_TRACE_DUMP();
 658			err = -EINVAL;
 659		}
 660		intel_timeline_put(tl);
 661	}
 662
 663	kvfree(timelines);
 664	return err;
 665#undef NUM_TIMELINES
 666}
 667
 668static int live_hwsp_wrap(void *arg)
 669{
 670	struct intel_gt *gt = arg;
 671	struct intel_engine_cs *engine;
 672	struct intel_timeline *tl;
 673	enum intel_engine_id id;
 674	int err = 0;
 675
 676	/*
 677	 * Across a seqno wrap, we need to keep the old cacheline alive for
 678	 * foreign GPU references.
 679	 */
 680
 681	tl = intel_timeline_create(gt);
 682	if (IS_ERR(tl))
 683		return PTR_ERR(tl);
 684
 685	if (!tl->has_initial_breadcrumb)
 686		goto out_free;
 687
 688	err = selftest_tl_pin(tl);
 689	if (err)
 690		goto out_free;
 691
 692	for_each_engine(engine, gt, id) {
 693		const u32 *hwsp_seqno[2];
 694		struct i915_request *rq;
 695		u32 seqno[2];
 696
 697		if (!intel_engine_can_store_dword(engine))
 698			continue;
 699
 700		rq = intel_engine_create_kernel_request(engine);
 701		if (IS_ERR(rq)) {
 702			err = PTR_ERR(rq);
 703			goto out;
 704		}
 705
 706		tl->seqno = -4u;
 707
 708		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
 709		err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
 710		mutex_unlock(&tl->mutex);
 711		if (err) {
 712			i915_request_add(rq);
 713			goto out;
 714		}
 715		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
 716			 seqno[0], tl->hwsp_offset);
 717
 718		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
 719		if (err) {
 720			i915_request_add(rq);
 721			goto out;
 722		}
 723		hwsp_seqno[0] = tl->hwsp_seqno;
 724
 725		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
 726		err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
 727		mutex_unlock(&tl->mutex);
 728		if (err) {
 729			i915_request_add(rq);
 730			goto out;
 731		}
 732		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
 733			 seqno[1], tl->hwsp_offset);
 734
 735		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
 736		if (err) {
 737			i915_request_add(rq);
 738			goto out;
 739		}
 740		hwsp_seqno[1] = tl->hwsp_seqno;
 741
 742		/* After the wrap, the timeline should have moved to a new hwsp */
 743		GEM_BUG_ON(seqno[1] >= seqno[0]);
 744		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);
 745
 746		i915_request_add(rq);
 747
 748		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 749			pr_err("Wait for timeline writes timed out!\n");
 750			err = -EIO;
 751			goto out;
 752		}
 753
 754		if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] ||
 755		    READ_ONCE(*hwsp_seqno[1]) != seqno[1]) {
 756			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
 757			       *hwsp_seqno[0], *hwsp_seqno[1],
 758			       seqno[0], seqno[1]);
 759			err = -EINVAL;
 760			goto out;
 761		}
 762
 763		intel_gt_retire_requests(gt); /* recycle HWSP */
 764	}
 765
 766out:
 767	if (igt_flush_test(gt->i915))
 768		err = -EIO;
 769
 770	intel_timeline_unpin(tl);
 771out_free:
 772	intel_timeline_put(tl);
 773	return err;
 774}
 775
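    /*
     * Record a (seqno, hwsp) pair into the watcher buffer at *addr: store
     * the expected @seqno, then copy the live value from @hwsp alongside
     * it via a CS GPR so the pair can be checked later from the CPU.
     */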
 776static int emit_read_hwsp(struct i915_request *rq,
 777			  u32 seqno, u32 hwsp,
 778			  u32 *addr)
 779{
 780	const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0));
 781	u32 *cs;
 782
 783	cs = intel_ring_begin(rq, 12);
 784	if (IS_ERR(cs))
 785		return PTR_ERR(cs);
 786
 787	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
 788	*cs++ = *addr;
 789	*cs++ = 0;
 790	*cs++ = seqno;
 791	*addr += 4;
 792
 793	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
 794	*cs++ = gpr;
 795	*cs++ = hwsp;
 796	*cs++ = 0;
 797
 798	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
 799	*cs++ = gpr;
 800	*cs++ = *addr;
 801	*cs++ = 0;
 802	*addr += 4;
 803
 804	intel_ring_advance(rq, cs);
 805
 806	return 0;
 807}
 808
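    /*
     * A watcher wraps a long-lived request that snapshots HWSP values
     * into a private buffer, either before or after the request under
     * test, so that seqno ordering can be verified across wraps.
     */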
 809struct hwsp_watcher {
 810	struct i915_vma *vma;
 811	struct i915_request *rq;
 812	u32 addr;
 813	u32 *map;
 814};
 815
 816static bool cmp_lt(u32 a, u32 b)
 817{
 818	return a < b;
 819}
 820
 821static bool cmp_gte(u32 a, u32 b)
 822{
 823	return a >= b;
 824}
 825
 826static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt)
 827{
 828	struct drm_i915_gem_object *obj;
 829	struct i915_vma *vma;
 830
 831	obj = i915_gem_object_create_internal(gt->i915, SZ_2M);
 832	if (IS_ERR(obj))
 833		return PTR_ERR(obj);
 834
 835	w->map = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
 836	if (IS_ERR(w->map)) {
 837		i915_gem_object_put(obj);
 838		return PTR_ERR(w->map);
 839	}
 840
 841	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
 842	if (IS_ERR(vma)) {
 843		i915_gem_object_put(obj);
 844		return PTR_ERR(vma);
 845	}
 846
 847	w->vma = vma;
 848	w->addr = i915_ggtt_offset(vma);
 849	return 0;
 850}
 851
 852static void switch_tl_lock(struct i915_request *from, struct i915_request *to)
 853{
 854	/* some light mutex juggling required; think co-routines */
 855
 856	if (from) {
 857		lockdep_unpin_lock(&from->context->timeline->mutex, from->cookie);
 858		mutex_unlock(&from->context->timeline->mutex);
 859	}
 860
 861	if (to) {
 862		mutex_lock(&to->context->timeline->mutex);
 863		to->cookie = lockdep_pin_lock(&to->context->timeline->mutex);
 864	}
 865}
 866
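    /*
     * Build the watcher request on a fresh context with an enlarged ring,
     * then drop its timeline lock so the main loop can interleave its own
     * emissions into the watcher (see switch_tl_lock()).
     */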
 867static int create_watcher(struct hwsp_watcher *w,
 868			  struct intel_engine_cs *engine,
 869			  int ringsz)
 870{
 871	struct intel_context *ce;
 872
 873	ce = intel_context_create(engine);
 874	if (IS_ERR(ce))
 875		return PTR_ERR(ce);
 876
 877	ce->ring = __intel_context_ring_size(ringsz);
 878	w->rq = intel_context_create_request(ce);
 879	intel_context_put(ce);
 880	if (IS_ERR(w->rq))
 881		return PTR_ERR(w->rq);
 882
 883	w->addr = i915_ggtt_offset(w->vma);
 884
 885	switch_tl_lock(w->rq, NULL);
 886
 887	return 0;
 888}
 889
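    /*
     * Submit the watcher request, wait for it, and verify every recorded
     * HWSP value against its expected seqno using @op (strictly less for
     * the "before" watcher, greater-or-equal for the "after" watcher).
     */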
 890static int check_watcher(struct hwsp_watcher *w, const char *name,
 891			 bool (*op)(u32 hwsp, u32 seqno))
 892{
 893	struct i915_request *rq = fetch_and_zero(&w->rq);
 894	u32 offset, end;
 895	int err;
 896
 897	GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size);
 898
 899	i915_request_get(rq);
 900	switch_tl_lock(NULL, rq);
 901	i915_request_add(rq);
 902
 903	if (i915_request_wait(rq, 0, HZ) < 0) {
 904		err = -ETIME;
 905		goto out;
 906	}
 907
 908	err = 0;
 909	offset = 0;
 910	end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map);
 911	while (offset < end) {
 912		if (!op(w->map[offset + 1], w->map[offset])) {
 913			pr_err("Watcher '%s' found HWSP value %x for seqno %x\n",
 914			       name, w->map[offset + 1], w->map[offset]);
 915			err = -EINVAL;
 916		}
 917
 918		offset += 2;
 919	}
 920
 921out:
 922	i915_request_put(rq);
 923	return err;
 924}
 925
 926static void cleanup_watcher(struct hwsp_watcher *w)
 927{
 928	if (w->rq) {
 929		switch_tl_lock(NULL, w->rq);
 930
 931		i915_request_add(w->rq);
 932	}
 933
 934	i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP);
 935}
 936
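    /*
     * Retire whatever has completed on the timeline; returns true once
     * no request remains outstanding.
     */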
 937static bool retire_requests(struct intel_timeline *tl)
 938{
 939	struct i915_request *rq, *rn;
 940
 941	mutex_lock(&tl->mutex);
 942	list_for_each_entry_safe(rq, rn, &tl->requests, link)
 943		if (!i915_request_retire(rq))
 944			break;
 945	mutex_unlock(&tl->mutex);
 946
 947	return !i915_active_fence_isset(&tl->last_request);
 948}
 949
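    /*
     * Keep submitting requests on the context until its timeline seqno
     * wraps past the given request's seqno, then return one more request
     * issued after the wrap.
     */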
 950static struct i915_request *wrap_timeline(struct i915_request *rq)
 951{
 952	struct intel_context *ce = rq->context;
 953	struct intel_timeline *tl = ce->timeline;
 954	u32 seqno = rq->fence.seqno;
 955
 956	while (tl->seqno >= seqno) { /* Cause a wrap */
 957		i915_request_put(rq);
 958		rq = intel_context_create_request(ce);
 959		if (IS_ERR(rq))
 960			return rq;
 961
 962		i915_request_get(rq);
 963		i915_request_add(rq);
 964	}
 965
 966	i915_request_put(rq);
 967	rq = i915_request_create(ce);
 968	if (IS_ERR(rq))
 969		return rq;
 970
 971	i915_request_get(rq);
 972	i915_request_add(rq);
 973
 974	return rq;
 975}
 976
 977static int live_hwsp_read(void *arg)
 978{
 979	struct intel_gt *gt = arg;
 980	struct hwsp_watcher watcher[2] = {};
 981	struct intel_engine_cs *engine;
 982	struct intel_timeline *tl;
 983	enum intel_engine_id id;
 984	int err = 0;
 985	int i;
 986
 987	/*
 988	 * If we take a reference to the HWSP for reading on the GPU, that
 989	 * read may be arbitrarily delayed (either by foreign fence or
 990	 * priority saturation) and a wrap can happen within 30 minutes.
 991	 * When the GPU read is finally submitted it should be correct,
 992	 * even across multiple wraps.
 993	 */
 994
 995	if (GRAPHICS_VER(gt->i915) < 8) /* CS convenience [SRM/LRM] */
 996		return 0;
 997
 998	tl = intel_timeline_create(gt);
 999	if (IS_ERR(tl))
1000		return PTR_ERR(tl);
1001
1002	if (!tl->has_initial_breadcrumb)
1003		goto out_free;
1004
1005	for (i = 0; i < ARRAY_SIZE(watcher); i++) {
1006		err = setup_watcher(&watcher[i], gt);
1007		if (err)
1008			goto out;
1009	}
1010
1011	for_each_engine(engine, gt, id) {
1012		struct intel_context *ce;
1013		unsigned long count = 0;
1014		IGT_TIMEOUT(end_time);
1015
1016		/* Create a request we can use for remote reading of the HWSP */
1017		err = create_watcher(&watcher[1], engine, SZ_512K);
1018		if (err)
1019			goto out;
1020
1021		do {
1022			struct i915_sw_fence *submit;
1023			struct i915_request *rq;
1024			u32 hwsp, dummy;
1025
1026			submit = heap_fence_create(GFP_KERNEL);
1027			if (!submit) {
1028				err = -ENOMEM;
1029				goto out;
1030			}
1031
1032			err = create_watcher(&watcher[0], engine, SZ_4K);
1033			if (err)
1034				goto out;
1035
1036			ce = intel_context_create(engine);
1037			if (IS_ERR(ce)) {
1038				err = PTR_ERR(ce);
1039				goto out;
1040			}
1041
1042			ce->timeline = intel_timeline_get(tl);
1043
1044			/* Ensure timeline is mapped, done during first pin */
1045			err = intel_context_pin(ce);
1046			if (err) {
1047				intel_context_put(ce);
1048				goto out;
1049			}
1050
1051			/*
1052			 * Start at a new wrap, and set seqno right before another wrap,
1053			 * saving 30 minutes of nops
1054			 */
1055			tl->seqno = -12u + 2 * (count & 3);
1056			__intel_timeline_get_seqno(tl, &dummy);
1057
1058			rq = i915_request_create(ce);
1059			if (IS_ERR(rq)) {
1060				err = PTR_ERR(rq);
1061				intel_context_unpin(ce);
1062				intel_context_put(ce);
1063				goto out;
1064			}
1065
1066			err = i915_sw_fence_await_dma_fence(&rq->submit,
1067							    &watcher[0].rq->fence, 0,
1068							    GFP_KERNEL);
1069			if (err < 0) {
1070				i915_request_add(rq);
1071				intel_context_unpin(ce);
1072				intel_context_put(ce);
1073				goto out;
1074			}
1075
1076			switch_tl_lock(rq, watcher[0].rq);
1077			err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp);
1078			if (err == 0)
1079				err = emit_read_hwsp(watcher[0].rq, /* before */
1080						     rq->fence.seqno, hwsp,
1081						     &watcher[0].addr);
1082			switch_tl_lock(watcher[0].rq, rq);
1083			if (err) {
1084				i915_request_add(rq);
1085				intel_context_unpin(ce);
1086				intel_context_put(ce);
1087				goto out;
1088			}
1089
1090			switch_tl_lock(rq, watcher[1].rq);
1091			err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp);
1092			if (err == 0)
1093				err = emit_read_hwsp(watcher[1].rq, /* after */
1094						     rq->fence.seqno, hwsp,
1095						     &watcher[1].addr);
1096			switch_tl_lock(watcher[1].rq, rq);
1097			if (err) {
1098				i915_request_add(rq);
1099				intel_context_unpin(ce);
1100				intel_context_put(ce);
1101				goto out;
1102			}
1103
1104			i915_request_get(rq);
1105			i915_request_add(rq);
1106
1107			rq = wrap_timeline(rq);
1108			intel_context_unpin(ce);
1109			intel_context_put(ce);
1110			if (IS_ERR(rq)) {
1111				err = PTR_ERR(rq);
1112				goto out;
1113			}
1114
1115			err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit,
1116							    &rq->fence, 0,
1117							    GFP_KERNEL);
1118			if (err < 0) {
1119				i915_request_put(rq);
1120				goto out;
1121			}
1122
1123			err = check_watcher(&watcher[0], "before", cmp_lt);
1124			i915_sw_fence_commit(submit);
1125			heap_fence_put(submit);
1126			if (err) {
1127				i915_request_put(rq);
1128				goto out;
1129			}
1130			count++;
1131
1132			/* Flush the timeline before manually wrapping again */
1133			if (i915_request_wait(rq,
1134					      I915_WAIT_INTERRUPTIBLE,
1135					      HZ) < 0) {
1136				err = -ETIME;
1137				i915_request_put(rq);
1138				goto out;
1139			}
1140			retire_requests(tl);
1141			i915_request_put(rq);
1142
1143			/* Single requests are limited to half a ring at most */
1144			if (8 * watcher[1].rq->ring->emit >
1145			    3 * watcher[1].rq->ring->size)
1146				break;
1147
1148		} while (!__igt_timeout(end_time, NULL) &&
1149			 count < (PAGE_SIZE / TIMELINE_SEQNO_BYTES - 1) / 2);
1150
1151		pr_info("%s: simulated %lu wraps\n", engine->name, count);
1152		err = check_watcher(&watcher[1], "after", cmp_gte);
1153		if (err)
1154			goto out;
1155	}
1156
1157out:
1158	for (i = 0; i < ARRAY_SIZE(watcher); i++)
1159		cleanup_watcher(&watcher[i]);
1160
1161	if (igt_flush_test(gt->i915))
1162		err = -EIO;
1163
1164out_free:
1165	intel_timeline_put(tl);
1166	return err;
1167}
1168
1169static int live_hwsp_rollover_kernel(void *arg)
1170{
1171	struct intel_gt *gt = arg;
1172	struct intel_engine_cs *engine;
1173	enum intel_engine_id id;
1174	int err = 0;
1175
1176	/*
1177	 * Run the host for long enough, and even the kernel context will
1178	 * see a seqno rollover.
1179	 */
1180
1181	for_each_engine(engine, gt, id) {
1182		struct intel_context *ce = engine->kernel_context;
1183		struct intel_timeline *tl = ce->timeline;
1184		struct i915_request *rq[3] = {};
1185		int i;
1186
1187		st_engine_heartbeat_disable(engine);
1188		if (intel_gt_wait_for_idle(gt, HZ / 2)) {
1189			err = -EIO;
1190			goto out;
1191		}
1192
1193		GEM_BUG_ON(i915_active_fence_isset(&tl->last_request));
1194		tl->seqno = -2u;
1195		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
1196
1197		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1198			struct i915_request *this;
1199
1200			this = i915_request_create(ce);
1201			if (IS_ERR(this)) {
1202				err = PTR_ERR(this);
1203				goto out;
1204			}
1205
1206			pr_debug("%s: create fence.seqno:%d\n",
1207				 engine->name,
1208				 lower_32_bits(this->fence.seqno));
1209
1210			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);
1211
1212			rq[i] = i915_request_get(this);
1213			i915_request_add(this);
1214		}
1215
1216		/* We expected a wrap! */
1217		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);
1218
1219		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
1220			pr_err("Wait for timeline wrap timed out!\n");
1221			err = -EIO;
1222			goto out;
1223		}
1224
1225		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1226			if (!i915_request_completed(rq[i])) {
1227				pr_err("Pre-wrap request not completed!\n");
1228				err = -EINVAL;
1229				goto out;
1230			}
1231		}
1232
1233out:
1234		for (i = 0; i < ARRAY_SIZE(rq); i++)
1235			i915_request_put(rq[i]);
1236		st_engine_heartbeat_enable(engine);
1237		if (err)
1238			break;
1239	}
1240
1241	if (igt_flush_test(gt->i915))
1242		err = -EIO;
1243
1244	return err;
1245}
1246
1247static int live_hwsp_rollover_user(void *arg)
1248{
1249	struct intel_gt *gt = arg;
1250	struct intel_engine_cs *engine;
1251	enum intel_engine_id id;
1252	int err = 0;
1253
1254	/*
1255	 * Simulate a long running user context, and force the seqno wrap
1256	 * on the user's timeline.
1257	 */
1258
1259	for_each_engine(engine, gt, id) {
1260		struct i915_request *rq[3] = {};
1261		struct intel_timeline *tl;
1262		struct intel_context *ce;
1263		int i;
1264
1265		ce = intel_context_create(engine);
1266		if (IS_ERR(ce))
1267			return PTR_ERR(ce);
1268
1269		err = intel_context_alloc_state(ce);
1270		if (err)
1271			goto out;
1272
1273		tl = ce->timeline;
1274		if (!tl->has_initial_breadcrumb)
1275			goto out;
1276
1277		err = intel_context_pin(ce);
1278		if (err)
1279			goto out;
1280
1281		tl->seqno = -4u;
1282		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
1283
1284		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1285			struct i915_request *this;
1286
1287			this = intel_context_create_request(ce);
1288			if (IS_ERR(this)) {
1289				err = PTR_ERR(this);
1290				goto out_unpin;
1291			}
1292
1293			pr_debug("%s: create fence.seqno:%d\n",
1294				 engine->name,
1295				 lower_32_bits(this->fence.seqno));
1296
1297			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);
1298
1299			rq[i] = i915_request_get(this);
1300			i915_request_add(this);
1301		}
1302
1303		/* We expected a wrap! */
1304		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);
1305
1306		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
1307			pr_err("Wait for timeline wrap timed out!\n");
1308			err = -EIO;
1309			goto out_unpin;
1310		}
1311
1312		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1313			if (!i915_request_completed(rq[i])) {
1314				pr_err("Pre-wrap request not completed!\n");
1315				err = -EINVAL;
1316				goto out_unpin;
1317			}
1318		}
1319out_unpin:
1320		intel_context_unpin(ce);
1321out:
1322		for (i = 0; i < ARRAY_SIZE(rq); i++)
1323			i915_request_put(rq[i]);
1324		intel_context_put(ce);
1325		if (err)
1326			break;
1327	}
1328
1329	if (igt_flush_test(gt->i915))
1330		err = -EIO;
1331
1332	return err;
1333}
1334
1335static int live_hwsp_recycle(void *arg)
1336{
1337	struct intel_gt *gt = arg;
1338	struct intel_engine_cs *engine;
1339	enum intel_engine_id id;
1340	unsigned long count;
1341	int err = 0;
1342
1343	/*
1344	 * Check seqno writes into one timeline at a time. We expect to
1345	 * recycle the breadcrumb slot between iterations and want to
1346	 * confuse neither ourselves nor the GPU.
1347	 */
1348
1349	count = 0;
1350	for_each_engine(engine, gt, id) {
1351		IGT_TIMEOUT(end_time);
1352
1353		if (!intel_engine_can_store_dword(engine))
1354			continue;
1355
1356		intel_engine_pm_get(engine);
1357
1358		do {
1359			struct intel_timeline *tl;
1360			struct i915_request *rq;
1361
1362			tl = intel_timeline_create(gt);
1363			if (IS_ERR(tl)) {
1364				err = PTR_ERR(tl);
1365				break;
1366			}
1367
1368			rq = checked_tl_write(tl, engine, count);
1369			if (IS_ERR(rq)) {
1370				intel_timeline_put(tl);
1371				err = PTR_ERR(rq);
1372				break;
1373			}
1374
1375			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1376				pr_err("Wait for timeline writes timed out!\n");
1377				i915_request_put(rq);
1378				intel_timeline_put(tl);
1379				err = -EIO;
1380				break;
1381			}
1382
1383			if (READ_ONCE(*tl->hwsp_seqno) != count) {
1384				GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
1385					      count, tl->fence_context,
1386					      tl->hwsp_offset, *tl->hwsp_seqno);
1387				GEM_TRACE_DUMP();
1388				err = -EINVAL;
1389			}
1390
1391			i915_request_put(rq);
1392			intel_timeline_put(tl);
1393			count++;
1394
1395			if (err)
1396				break;
1397		} while (!__igt_timeout(end_time, NULL));
1398
1399		intel_engine_pm_put(engine);
1400		if (err)
1401			break;
1402	}
1403
1404	return err;
1405}
1406
1407int intel_timeline_live_selftests(struct drm_i915_private *i915)
1408{
1409	static const struct i915_subtest tests[] = {
1410		SUBTEST(live_hwsp_recycle),
1411		SUBTEST(live_hwsp_engine),
1412		SUBTEST(live_hwsp_alternate),
1413		SUBTEST(live_hwsp_wrap),
1414		SUBTEST(live_hwsp_read),
1415		SUBTEST(live_hwsp_rollover_kernel),
1416		SUBTEST(live_hwsp_rollover_user),
1417	};
1418
1419	if (intel_gt_is_wedged(&i915->gt))
1420		return 0;
1421
1422	return intel_gt_live_subtests(tests, &i915->gt);
1423}