   1/*
   2 * Copyright © 2016 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 */
  24
  25#include <linux/prime_numbers.h>
  26#include <linux/pm_qos.h>
  27#include <linux/sort.h>
  28
  29#include "gem/i915_gem_pm.h"
  30#include "gem/selftests/mock_context.h"
  31
  32#include "gt/intel_engine_heartbeat.h"
  33#include "gt/intel_engine_pm.h"
  34#include "gt/intel_engine_user.h"
  35#include "gt/intel_gt.h"
  36#include "gt/intel_gt_clock_utils.h"
  37#include "gt/intel_gt_requests.h"
  38#include "gt/selftest_engine_heartbeat.h"
  39
  40#include "i915_random.h"
  41#include "i915_selftest.h"
  42#include "igt_flush_test.h"
  43#include "igt_live_test.h"
  44#include "igt_spinner.h"
  45#include "lib_sw_fence.h"
  46
  47#include "mock_drm.h"
  48#include "mock_gem_device.h"
  49
  50static unsigned int num_uabi_engines(struct drm_i915_private *i915)
  51{
  52	struct intel_engine_cs *engine;
  53	unsigned int count;
  54
  55	count = 0;
  56	for_each_uabi_engine(engine, i915)
  57		count++;
  58
  59	return count;
  60}
  61
  62static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
  63{
  64	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
  65}
  66
  67static int igt_add_request(void *arg)
  68{
  69	struct drm_i915_private *i915 = arg;
  70	struct i915_request *request;
  71
  72	/* Basic preliminary test to create a request and let it loose! */
  73
  74	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
  75	if (!request)
  76		return -ENOMEM;
  77
  78	i915_request_add(request);
  79
  80	return 0;
  81}
  82
  83static int igt_wait_request(void *arg)
  84{
  85	const long T = HZ / 4;
  86	struct drm_i915_private *i915 = arg;
  87	struct i915_request *request;
  88	int err = -EINVAL;
  89
  90	/* Submit a request, then wait upon it */
  91
  92	request = mock_request(rcs0(i915)->kernel_context, T);
  93	if (!request)
  94		return -ENOMEM;
  95
  96	i915_request_get(request);
  97
  98	if (i915_request_wait(request, 0, 0) != -ETIME) {
  99		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
 100		goto out_request;
 101	}
 102
 103	if (i915_request_wait(request, 0, T) != -ETIME) {
 104		pr_err("request wait succeeded (expected timeout before submit!)\n");
 105		goto out_request;
 106	}
 107
 108	if (i915_request_completed(request)) {
 109		pr_err("request completed before submit!!\n");
 110		goto out_request;
 111	}
 112
 113	i915_request_add(request);
 114
 115	if (i915_request_wait(request, 0, 0) != -ETIME) {
 116		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
 117		goto out_request;
 118	}
 119
 120	if (i915_request_completed(request)) {
 121		pr_err("request completed immediately!\n");
 122		goto out_request;
 123	}
 124
 125	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
 126		pr_err("request wait succeeded (expected timeout!)\n");
 127		goto out_request;
 128	}
 129
 130	if (i915_request_wait(request, 0, T) == -ETIME) {
 131		pr_err("request wait timed out!\n");
 132		goto out_request;
 133	}
 134
 135	if (!i915_request_completed(request)) {
 136		pr_err("request not complete after waiting!\n");
 137		goto out_request;
 138	}
 139
 140	if (i915_request_wait(request, 0, T) == -ETIME) {
 141		pr_err("request wait timed out when already complete!\n");
 142		goto out_request;
 143	}
 144
 145	err = 0;
 146out_request:
 147	i915_request_put(request);
 148	mock_device_flush(i915);
 149	return err;
 150}
 151
 152static int igt_fence_wait(void *arg)
 153{
 154	const long T = HZ / 4;
 155	struct drm_i915_private *i915 = arg;
 156	struct i915_request *request;
 157	int err = -EINVAL;
 158
 159	/* Submit a request, treat it as a fence and wait upon it */
 160
 161	request = mock_request(rcs0(i915)->kernel_context, T);
 162	if (!request)
 163		return -ENOMEM;
 164
 165	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
 166		pr_err("fence wait success before submit (expected timeout)!\n");
 167		goto out;
 168	}
 169
 170	i915_request_add(request);
 171
 172	if (dma_fence_is_signaled(&request->fence)) {
 173		pr_err("fence signaled immediately!\n");
 174		goto out;
 175	}
 176
 177	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
 178		pr_err("fence wait success after submit (expected timeout)!\n");
 179		goto out;
 180	}
 181
 182	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
 183		pr_err("fence wait timed out (expected success)!\n");
 184		goto out;
 185	}
 186
 187	if (!dma_fence_is_signaled(&request->fence)) {
 188		pr_err("fence unsignaled after waiting!\n");
 189		goto out;
 190	}
 191
 192	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
 193		pr_err("fence wait timed out when complete (expected success)!\n");
 194		goto out;
 195	}
 196
 197	err = 0;
 198out:
 199	mock_device_flush(i915);
 200	return err;
 201}
 202
 203static int igt_request_rewind(void *arg)
 204{
 205	struct drm_i915_private *i915 = arg;
 206	struct i915_request *request, *vip;
 207	struct i915_gem_context *ctx[2];
 208	struct intel_context *ce;
 209	int err = -EINVAL;
 210
 211	ctx[0] = mock_context(i915, "A");
 212
 213	ce = i915_gem_context_get_engine(ctx[0], RCS0);
 214	GEM_BUG_ON(IS_ERR(ce));
 215	request = mock_request(ce, 2 * HZ);
 216	intel_context_put(ce);
 217	if (!request) {
 218		err = -ENOMEM;
 219		goto err_context_0;
 220	}
 221
 222	i915_request_get(request);
 223	i915_request_add(request);
 224
 225	ctx[1] = mock_context(i915, "B");
 226
 227	ce = i915_gem_context_get_engine(ctx[1], RCS0);
 228	GEM_BUG_ON(IS_ERR(ce));
 229	vip = mock_request(ce, 0);
 230	intel_context_put(ce);
 231	if (!vip) {
 232		err = -ENOMEM;
 233		goto err_context_1;
 234	}
 235
 236	/* Simulate preemption by manual reordering */
 237	if (!mock_cancel_request(request)) {
 238		pr_err("failed to cancel request (already executed)!\n");
 239		i915_request_add(vip);
 240		goto err_context_1;
 241	}
 242	i915_request_get(vip);
 243	i915_request_add(vip);
 244	rcu_read_lock();
 245	request->engine->submit_request(request);
 246	rcu_read_unlock();
 247
 248
 249	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
 250		pr_err("timed out waiting for high priority request\n");
 251		goto err;
 252	}
 253
 254	if (i915_request_completed(request)) {
 255		pr_err("low priority request already completed\n");
 256		goto err;
 257	}
 258
 259	err = 0;
 260err:
 261	i915_request_put(vip);
 262err_context_1:
 263	mock_context_close(ctx[1]);
 264	i915_request_put(request);
 265err_context_0:
 266	mock_context_close(ctx[0]);
 267	mock_device_flush(i915);
 268	return err;
 269}
 270
 271struct smoketest {
 272	struct intel_engine_cs *engine;
 273	struct i915_gem_context **contexts;
 274	atomic_long_t num_waits, num_fences;
 275	int ncontexts, max_batch;
 276	struct i915_request *(*request_alloc)(struct intel_context *ce);
 277};
 278
 279static struct i915_request *
 280__mock_request_alloc(struct intel_context *ce)
 281{
 282	return mock_request(ce, 0);
 283}
 284
 285static struct i915_request *
 286__live_request_alloc(struct intel_context *ce)
 287{
 288	return intel_context_create_request(ce);
 289}
 290
 291static int __igt_breadcrumbs_smoketest(void *arg)
 292{
 293	struct smoketest *t = arg;
 294	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
 295	const unsigned int total = 4 * t->ncontexts + 1;
 296	unsigned int num_waits = 0, num_fences = 0;
 297	struct i915_request **requests;
 298	I915_RND_STATE(prng);
 299	unsigned int *order;
 300	int err = 0;
 301
 302	/*
 303	 * A very simple test to catch the most egregious of list handling bugs.
 304	 *
 305	 * At its heart, we simply create oodles of requests running across
 306	 * multiple kthreads and enable signaling on them, for the sole purpose
 307	 * of stressing our breadcrumb handling. The only inspection we do is
 308	 * that the fences were marked as signaled.
 309	 */
 310
 311	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
 312	if (!requests)
 313		return -ENOMEM;
 314
 315	order = i915_random_order(total, &prng);
 316	if (!order) {
 317		err = -ENOMEM;
 318		goto out_requests;
 319	}
 320
 321	while (!kthread_should_stop()) {
 322		struct i915_sw_fence *submit, *wait;
 323		unsigned int n, count;
 324
 325		submit = heap_fence_create(GFP_KERNEL);
 326		if (!submit) {
 327			err = -ENOMEM;
 328			break;
 329		}
 330
 331		wait = heap_fence_create(GFP_KERNEL);
 332		if (!wait) {
 333			i915_sw_fence_commit(submit);
 334			heap_fence_put(submit);
 335			err = -ENOMEM;
 336			break;
 337		}
 338
 339		i915_random_reorder(order, total, &prng);
 340		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
 341
 342		for (n = 0; n < count; n++) {
 343			struct i915_gem_context *ctx =
 344				t->contexts[order[n] % t->ncontexts];
 345			struct i915_request *rq;
 346			struct intel_context *ce;
 347
 348			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
 349			GEM_BUG_ON(IS_ERR(ce));
 350			rq = t->request_alloc(ce);
 351			intel_context_put(ce);
 352			if (IS_ERR(rq)) {
 353				err = PTR_ERR(rq);
 354				count = n;
 355				break;
 356			}
 357
 358			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
 359							       submit,
 360							       GFP_KERNEL);
 361
 362			requests[n] = i915_request_get(rq);
 363			i915_request_add(rq);
 364
 365			if (err >= 0)
 366				err = i915_sw_fence_await_dma_fence(wait,
 367								    &rq->fence,
 368								    0,
 369								    GFP_KERNEL);
 370
 371			if (err < 0) {
 372				i915_request_put(rq);
 373				count = n;
 374				break;
 375			}
 376		}
 377
 378		i915_sw_fence_commit(submit);
 379		i915_sw_fence_commit(wait);
 380
 381		if (!wait_event_timeout(wait->wait,
 382					i915_sw_fence_done(wait),
 383					5 * HZ)) {
 384			struct i915_request *rq = requests[count - 1];
 385
 386			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
 387			       atomic_read(&wait->pending), count,
 388			       rq->fence.context, rq->fence.seqno,
 389			       t->engine->name);
 390			GEM_TRACE_DUMP();
 391
 392			intel_gt_set_wedged(t->engine->gt);
 393			GEM_BUG_ON(!i915_request_completed(rq));
 394			i915_sw_fence_wait(wait);
 395			err = -EIO;
 396		}
 397
 398		for (n = 0; n < count; n++) {
 399			struct i915_request *rq = requests[n];
 400
 401			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 402				      &rq->fence.flags)) {
 403				pr_err("%llu:%llu was not signaled!\n",
 404				       rq->fence.context, rq->fence.seqno);
 405				err = -EINVAL;
 406			}
 407
 408			i915_request_put(rq);
 409		}
 410
 411		heap_fence_put(wait);
 412		heap_fence_put(submit);
 413
 414		if (err < 0)
 415			break;
 416
 417		num_fences += count;
 418		num_waits++;
 419
 420		cond_resched();
 421	}
 422
 423	atomic_long_add(num_fences, &t->num_fences);
 424	atomic_long_add(num_waits, &t->num_waits);
 425
 426	kfree(order);
 427out_requests:
 428	kfree(requests);
 429	return err;
 430}
 431
 432static int mock_breadcrumbs_smoketest(void *arg)
 433{
 434	struct drm_i915_private *i915 = arg;
 435	struct smoketest t = {
 436		.engine = rcs0(i915),
 437		.ncontexts = 1024,
 438		.max_batch = 1024,
 439		.request_alloc = __mock_request_alloc
 440	};
 441	unsigned int ncpus = num_online_cpus();
 442	struct task_struct **threads;
 443	unsigned int n;
 444	int ret = 0;
 445
 446	/*
 447	 * Smoketest our breadcrumb/signal handling for requests across multiple
 448	 * threads. A very simple test to only catch the most egregious of bugs.
 449	 * See __igt_breadcrumbs_smoketest();
 450	 */
 451
 452	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
 453	if (!threads)
 454		return -ENOMEM;
 455
 456	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
 457	if (!t.contexts) {
 458		ret = -ENOMEM;
 459		goto out_threads;
 460	}
 461
 462	for (n = 0; n < t.ncontexts; n++) {
 463		t.contexts[n] = mock_context(t.engine->i915, "mock");
 464		if (!t.contexts[n]) {
 465			ret = -ENOMEM;
 466			goto out_contexts;
 467		}
 468	}
 469
 470	for (n = 0; n < ncpus; n++) {
 471		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
 472					 &t, "igt/%d", n);
 473		if (IS_ERR(threads[n])) {
 474			ret = PTR_ERR(threads[n]);
 475			ncpus = n;
 476			break;
 477		}
 478
 479		get_task_struct(threads[n]);
 480	}
 481
 482	yield(); /* start all threads before we begin */
 483	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
 484
 485	for (n = 0; n < ncpus; n++) {
 486		int err;
 487
 488		err = kthread_stop(threads[n]);
 489		if (err < 0 && !ret)
 490			ret = err;
 491
 492		put_task_struct(threads[n]);
 493	}
  494	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
 495		atomic_long_read(&t.num_waits),
 496		atomic_long_read(&t.num_fences),
 497		ncpus);
 498
 499out_contexts:
 500	for (n = 0; n < t.ncontexts; n++) {
 501		if (!t.contexts[n])
 502			break;
 503		mock_context_close(t.contexts[n]);
 504	}
 505	kfree(t.contexts);
 506out_threads:
 507	kfree(threads);
 508	return ret;
 509}
 510
 511int i915_request_mock_selftests(void)
 512{
 513	static const struct i915_subtest tests[] = {
 514		SUBTEST(igt_add_request),
 515		SUBTEST(igt_wait_request),
 516		SUBTEST(igt_fence_wait),
 517		SUBTEST(igt_request_rewind),
 518		SUBTEST(mock_breadcrumbs_smoketest),
 519	};
 520	struct drm_i915_private *i915;
 521	intel_wakeref_t wakeref;
 522	int err = 0;
 523
 524	i915 = mock_gem_device();
 525	if (!i915)
 526		return -ENOMEM;
 527
 528	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
 529		err = i915_subtests(tests, i915);
 530
 531	mock_destroy_device(i915);
 532
 533	return err;
 534}
 535
 536static int live_nop_request(void *arg)
 537{
 538	struct drm_i915_private *i915 = arg;
 539	struct intel_engine_cs *engine;
 540	struct igt_live_test t;
 541	int err = -ENODEV;
 542
 543	/*
 544	 * Submit various sized batches of empty requests, to each engine
 545	 * (individually), and wait for the batch to complete. We can check
 546	 * the overhead of submitting requests to the hardware.
 547	 */
 548
 549	for_each_uabi_engine(engine, i915) {
 550		unsigned long n, prime;
 551		IGT_TIMEOUT(end_time);
 552		ktime_t times[2] = {};
 553
 554		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 555		if (err)
 556			return err;
 557
 558		intel_engine_pm_get(engine);
 559		for_each_prime_number_from(prime, 1, 8192) {
 560			struct i915_request *request = NULL;
 561
 562			times[1] = ktime_get_raw();
 563
 564			for (n = 0; n < prime; n++) {
 565				i915_request_put(request);
 566				request = i915_request_create(engine->kernel_context);
 567				if (IS_ERR(request))
 568					return PTR_ERR(request);
 569
 570				/*
 571				 * This space is left intentionally blank.
 572				 *
 573				 * We do not actually want to perform any
 574				 * action with this request, we just want
 575				 * to measure the latency in allocation
 576				 * and submission of our breadcrumbs -
 577				 * ensuring that the bare request is sufficient
 578				 * for the system to work (i.e. proper HEAD
 579				 * tracking of the rings, interrupt handling,
 580				 * etc). It also gives us the lowest bounds
 581				 * for latency.
 582				 */
 583
 584				i915_request_get(request);
 585				i915_request_add(request);
 586			}
 587			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
 588			i915_request_put(request);
 589
 590			times[1] = ktime_sub(ktime_get_raw(), times[1]);
 591			if (prime == 1)
 592				times[0] = times[1];
 593
 594			if (__igt_timeout(end_time, NULL))
 595				break;
 596		}
 597		intel_engine_pm_put(engine);
 598
 599		err = igt_live_test_end(&t);
 600		if (err)
 601			return err;
 602
 603		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
 604			engine->name,
 605			ktime_to_ns(times[0]),
 606			prime, div64_u64(ktime_to_ns(times[1]), prime));
 607	}
 608
 609	return err;
 610}
 611
 612static int __cancel_inactive(struct intel_engine_cs *engine)
 613{
 614	struct intel_context *ce;
 615	struct igt_spinner spin;
 616	struct i915_request *rq;
 617	int err = 0;
 618
 619	if (igt_spinner_init(&spin, engine->gt))
 620		return -ENOMEM;
 621
 622	ce = intel_context_create(engine);
 623	if (IS_ERR(ce)) {
 624		err = PTR_ERR(ce);
 625		goto out_spin;
 626	}
 627
 628	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
 629	if (IS_ERR(rq)) {
 630		err = PTR_ERR(rq);
 631		goto out_ce;
 632	}
 633
 634	pr_debug("%s: Cancelling inactive request\n", engine->name);
 635	i915_request_cancel(rq, -EINTR);
 636	i915_request_get(rq);
 637	i915_request_add(rq);
 638
 639	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 640		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 641
 642		pr_err("%s: Failed to cancel inactive request\n", engine->name);
 643		intel_engine_dump(engine, &p, "%s\n", engine->name);
 644		err = -ETIME;
 645		goto out_rq;
 646	}
 647
 648	if (rq->fence.error != -EINTR) {
 649		pr_err("%s: fence not cancelled (%u)\n",
 650		       engine->name, rq->fence.error);
 651		err = -EINVAL;
 652	}
 653
 654out_rq:
 655	i915_request_put(rq);
 656out_ce:
 657	intel_context_put(ce);
 658out_spin:
 659	igt_spinner_fini(&spin);
 660	if (err)
 661		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 662	return err;
 663}
 664
 665static int __cancel_active(struct intel_engine_cs *engine)
 666{
 667	struct intel_context *ce;
 668	struct igt_spinner spin;
 669	struct i915_request *rq;
 670	int err = 0;
 671
 672	if (igt_spinner_init(&spin, engine->gt))
 673		return -ENOMEM;
 674
 675	ce = intel_context_create(engine);
 676	if (IS_ERR(ce)) {
 677		err = PTR_ERR(ce);
 678		goto out_spin;
 679	}
 680
 681	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
 682	if (IS_ERR(rq)) {
 683		err = PTR_ERR(rq);
 684		goto out_ce;
 685	}
 686
 687	pr_debug("%s: Cancelling active request\n", engine->name);
 688	i915_request_get(rq);
 689	i915_request_add(rq);
 690	if (!igt_wait_for_spinner(&spin, rq)) {
 691		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 692
 693		pr_err("Failed to start spinner on %s\n", engine->name);
 694		intel_engine_dump(engine, &p, "%s\n", engine->name);
 695		err = -ETIME;
 696		goto out_rq;
 697	}
 698	i915_request_cancel(rq, -EINTR);
 699
 700	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 701		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 702
 703		pr_err("%s: Failed to cancel active request\n", engine->name);
 704		intel_engine_dump(engine, &p, "%s\n", engine->name);
 705		err = -ETIME;
 706		goto out_rq;
 707	}
 708
 709	if (rq->fence.error != -EINTR) {
 710		pr_err("%s: fence not cancelled (%u)\n",
 711		       engine->name, rq->fence.error);
 712		err = -EINVAL;
 713	}
 714
 715out_rq:
 716	i915_request_put(rq);
 717out_ce:
 718	intel_context_put(ce);
 719out_spin:
 720	igt_spinner_fini(&spin);
 721	if (err)
 722		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 723	return err;
 724}
 725
 726static int __cancel_completed(struct intel_engine_cs *engine)
 727{
 728	struct intel_context *ce;
 729	struct igt_spinner spin;
 730	struct i915_request *rq;
 731	int err = 0;
 732
 733	if (igt_spinner_init(&spin, engine->gt))
 734		return -ENOMEM;
 735
 736	ce = intel_context_create(engine);
 737	if (IS_ERR(ce)) {
 738		err = PTR_ERR(ce);
 739		goto out_spin;
 740	}
 741
 742	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
 743	if (IS_ERR(rq)) {
 744		err = PTR_ERR(rq);
 745		goto out_ce;
 746	}
 747	igt_spinner_end(&spin);
 748	i915_request_get(rq);
 749	i915_request_add(rq);
 750
 751	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 752		err = -ETIME;
 753		goto out_rq;
 754	}
 755
 756	pr_debug("%s: Cancelling completed request\n", engine->name);
 757	i915_request_cancel(rq, -EINTR);
 758	if (rq->fence.error) {
 759		pr_err("%s: fence not cancelled (%u)\n",
 760		       engine->name, rq->fence.error);
 761		err = -EINVAL;
 762	}
 763
 764out_rq:
 765	i915_request_put(rq);
 766out_ce:
 767	intel_context_put(ce);
 768out_spin:
 769	igt_spinner_fini(&spin);
 770	if (err)
 771		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 772	return err;
 773}
 774
 775static int live_cancel_request(void *arg)
 776{
 777	struct drm_i915_private *i915 = arg;
 778	struct intel_engine_cs *engine;
 779
 780	/*
 781	 * Check cancellation of requests. We expect to be able to immediately
 782	 * cancel active requests, even if they are currently on the GPU.
 783	 */
 784
 785	for_each_uabi_engine(engine, i915) {
 786		struct igt_live_test t;
 787		int err, err2;
 788
 789		if (!intel_engine_has_preemption(engine))
 790			continue;
 791
 792		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 793		if (err)
 794			return err;
 795
 796		err = __cancel_inactive(engine);
 797		if (err == 0)
 798			err = __cancel_active(engine);
 799		if (err == 0)
 800			err = __cancel_completed(engine);
 801
 802		err2 = igt_live_test_end(&t);
 803		if (err)
 804			return err;
 805		if (err2)
 806			return err2;
 807	}
 808
 809	return 0;
 810}
 811
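/*
 * Build a one-page batch containing just MI_BATCH_BUFFER_END, pinned into
 * the GGTT, so that empty_request() measures pure submission overhead with
 * no real GPU work attached.
 */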
 812static struct i915_vma *empty_batch(struct drm_i915_private *i915)
 813{
 814	struct drm_i915_gem_object *obj;
 815	struct i915_vma *vma;
 816	u32 *cmd;
 817	int err;
 818
 819	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
 820	if (IS_ERR(obj))
 821		return ERR_CAST(obj);
 822
 823	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
 824	if (IS_ERR(cmd)) {
 825		err = PTR_ERR(cmd);
 826		goto err;
 827	}
 828
 829	*cmd = MI_BATCH_BUFFER_END;
 830
 831	__i915_gem_object_flush_map(obj, 0, 64);
 832	i915_gem_object_unpin_map(obj);
 833
 834	intel_gt_chipset_flush(&i915->gt);
 835
 836	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
 837	if (IS_ERR(vma)) {
 838		err = PTR_ERR(vma);
 839		goto err;
 840	}
 841
 842	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
 843	if (err)
 844		goto err;
 845
  846	/* Force the wait now to avoid including it in the benchmark */
 847	err = i915_vma_sync(vma);
 848	if (err)
 849		goto err_pin;
 850
 851	return vma;
 852
 853err_pin:
 854	i915_vma_unpin(vma);
 855err:
 856	i915_gem_object_put(obj);
 857	return ERR_PTR(err);
 858}
 859
 860static struct i915_request *
 861empty_request(struct intel_engine_cs *engine,
 862	      struct i915_vma *batch)
 863{
 864	struct i915_request *request;
 865	int err;
 866
 867	request = i915_request_create(engine->kernel_context);
 868	if (IS_ERR(request))
 869		return request;
 870
 871	err = engine->emit_bb_start(request,
 872				    batch->node.start,
 873				    batch->node.size,
 874				    I915_DISPATCH_SECURE);
 875	if (err)
 876		goto out_request;
 877
 878	i915_request_get(request);
 879out_request:
 880	i915_request_add(request);
 881	return err ? ERR_PTR(err) : request;
 882}
 883
 884static int live_empty_request(void *arg)
 885{
 886	struct drm_i915_private *i915 = arg;
 887	struct intel_engine_cs *engine;
 888	struct igt_live_test t;
 889	struct i915_vma *batch;
 890	int err = 0;
 891
 892	/*
 893	 * Submit various sized batches of empty requests, to each engine
 894	 * (individually), and wait for the batch to complete. We can check
 895	 * the overhead of submitting requests to the hardware.
 896	 */
 897
 898	batch = empty_batch(i915);
 899	if (IS_ERR(batch))
 900		return PTR_ERR(batch);
 901
 902	for_each_uabi_engine(engine, i915) {
 903		IGT_TIMEOUT(end_time);
 904		struct i915_request *request;
 905		unsigned long n, prime;
 906		ktime_t times[2] = {};
 907
 908		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 909		if (err)
 910			goto out_batch;
 911
 912		intel_engine_pm_get(engine);
 913
 914		/* Warmup / preload */
 915		request = empty_request(engine, batch);
 916		if (IS_ERR(request)) {
 917			err = PTR_ERR(request);
 918			intel_engine_pm_put(engine);
 919			goto out_batch;
 920		}
 921		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
 922
 923		for_each_prime_number_from(prime, 1, 8192) {
 924			times[1] = ktime_get_raw();
 925
 926			for (n = 0; n < prime; n++) {
 927				i915_request_put(request);
 928				request = empty_request(engine, batch);
 929				if (IS_ERR(request)) {
 930					err = PTR_ERR(request);
 931					intel_engine_pm_put(engine);
 932					goto out_batch;
 933				}
 934			}
 935			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
 936
 937			times[1] = ktime_sub(ktime_get_raw(), times[1]);
 938			if (prime == 1)
 939				times[0] = times[1];
 940
 941			if (__igt_timeout(end_time, NULL))
 942				break;
 943		}
 944		i915_request_put(request);
 945		intel_engine_pm_put(engine);
 946
 947		err = igt_live_test_end(&t);
 948		if (err)
 949			goto out_batch;
 950
 951		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
 952			engine->name,
 953			ktime_to_ns(times[0]),
 954			prime, div64_u64(ktime_to_ns(times[1]), prime));
 955	}
 956
 957out_batch:
 958	i915_vma_unpin(batch);
 959	i915_vma_put(batch);
 960	return err;
 961}
 962
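/*
 * Build a batch whose first instruction branches back to its own start, so
 * it spins on the GPU indefinitely until recursive_batch_resolve() rewrites
 * that first dword with MI_BATCH_BUFFER_END.
 */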
 963static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
 964{
 965	struct drm_i915_gem_object *obj;
 966	const int ver = GRAPHICS_VER(i915);
 967	struct i915_vma *vma;
 968	u32 *cmd;
 969	int err;
 970
 971	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
 972	if (IS_ERR(obj))
 973		return ERR_CAST(obj);
 974
 975	vma = i915_vma_instance(obj, i915->gt.vm, NULL);
 976	if (IS_ERR(vma)) {
 977		err = PTR_ERR(vma);
 978		goto err;
 979	}
 980
 981	err = i915_vma_pin(vma, 0, 0, PIN_USER);
 982	if (err)
 983		goto err;
 984
 985	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
 986	if (IS_ERR(cmd)) {
 987		err = PTR_ERR(cmd);
 988		goto err;
 989	}
 990
 991	if (ver >= 8) {
 992		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
 993		*cmd++ = lower_32_bits(vma->node.start);
 994		*cmd++ = upper_32_bits(vma->node.start);
 995	} else if (ver >= 6) {
 996		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
 997		*cmd++ = lower_32_bits(vma->node.start);
 998	} else {
 999		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1000		*cmd++ = lower_32_bits(vma->node.start);
1001	}
1002	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1003
1004	__i915_gem_object_flush_map(obj, 0, 64);
1005	i915_gem_object_unpin_map(obj);
1006
1007	intel_gt_chipset_flush(&i915->gt);
1008
1009	return vma;
1010
1011err:
1012	i915_gem_object_put(obj);
1013	return ERR_PTR(err);
1014}
1015
1016static int recursive_batch_resolve(struct i915_vma *batch)
1017{
1018	u32 *cmd;
1019
1020	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1021	if (IS_ERR(cmd))
1022		return PTR_ERR(cmd);
1023
1024	*cmd = MI_BATCH_BUFFER_END;
1025
1026	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1027	i915_gem_object_unpin_map(batch->obj);
1028
1029	intel_gt_chipset_flush(batch->vm->gt);
1030
1031	return 0;
1032}
1033
1034static int live_all_engines(void *arg)
1035{
1036	struct drm_i915_private *i915 = arg;
1037	const unsigned int nengines = num_uabi_engines(i915);
1038	struct intel_engine_cs *engine;
1039	struct i915_request **request;
1040	struct igt_live_test t;
1041	struct i915_vma *batch;
1042	unsigned int idx;
1043	int err;
1044
1045	/*
1046	 * Check we can submit requests to all engines simultaneously. We
1047	 * send a recursive batch to each engine - checking that we don't
1048	 * block doing so, and that they don't complete too soon.
1049	 */
1050
1051	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1052	if (!request)
1053		return -ENOMEM;
1054
1055	err = igt_live_test_begin(&t, i915, __func__, "");
1056	if (err)
1057		goto out_free;
1058
1059	batch = recursive_batch(i915);
1060	if (IS_ERR(batch)) {
1061		err = PTR_ERR(batch);
1062		pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1063		goto out_free;
1064	}
1065
1066	i915_vma_lock(batch);
1067
1068	idx = 0;
1069	for_each_uabi_engine(engine, i915) {
1070		request[idx] = intel_engine_create_kernel_request(engine);
1071		if (IS_ERR(request[idx])) {
1072			err = PTR_ERR(request[idx]);
1073			pr_err("%s: Request allocation failed with err=%d\n",
1074			       __func__, err);
1075			goto out_request;
1076		}
1077
1078		err = i915_request_await_object(request[idx], batch->obj, 0);
1079		if (err == 0)
1080			err = i915_vma_move_to_active(batch, request[idx], 0);
1081		GEM_BUG_ON(err);
1082
1083		err = engine->emit_bb_start(request[idx],
1084					    batch->node.start,
1085					    batch->node.size,
1086					    0);
1087		GEM_BUG_ON(err);
1088		request[idx]->batch = batch;
1089
1090		i915_request_get(request[idx]);
1091		i915_request_add(request[idx]);
1092		idx++;
1093	}
1094
1095	i915_vma_unlock(batch);
1096
1097	idx = 0;
1098	for_each_uabi_engine(engine, i915) {
1099		if (i915_request_completed(request[idx])) {
1100			pr_err("%s(%s): request completed too early!\n",
1101			       __func__, engine->name);
1102			err = -EINVAL;
1103			goto out_request;
1104		}
1105		idx++;
1106	}
1107
1108	err = recursive_batch_resolve(batch);
1109	if (err) {
1110		pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1111		goto out_request;
1112	}
1113
1114	idx = 0;
1115	for_each_uabi_engine(engine, i915) {
1116		long timeout;
1117
1118		timeout = i915_request_wait(request[idx], 0,
1119					    MAX_SCHEDULE_TIMEOUT);
1120		if (timeout < 0) {
1121			err = timeout;
1122			pr_err("%s: error waiting for request on %s, err=%d\n",
1123			       __func__, engine->name, err);
1124			goto out_request;
1125		}
1126
1127		GEM_BUG_ON(!i915_request_completed(request[idx]));
1128		i915_request_put(request[idx]);
1129		request[idx] = NULL;
1130		idx++;
1131	}
1132
1133	err = igt_live_test_end(&t);
1134
1135out_request:
1136	idx = 0;
1137	for_each_uabi_engine(engine, i915) {
1138		if (request[idx])
1139			i915_request_put(request[idx]);
1140		idx++;
1141	}
1142	i915_vma_unpin(batch);
1143	i915_vma_put(batch);
1144out_free:
1145	kfree(request);
1146	return err;
1147}
1148
1149static int live_sequential_engines(void *arg)
1150{
1151	struct drm_i915_private *i915 = arg;
1152	const unsigned int nengines = num_uabi_engines(i915);
1153	struct i915_request **request;
1154	struct i915_request *prev = NULL;
1155	struct intel_engine_cs *engine;
1156	struct igt_live_test t;
1157	unsigned int idx;
1158	int err;
1159
1160	/*
1161	 * Check we can submit requests to all engines sequentially, such
1162	 * that each successive request waits for the earlier ones. This
1163	 * tests that we don't execute requests out of order, even though
1164	 * they are running on independent engines.
1165	 */
1166
1167	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1168	if (!request)
1169		return -ENOMEM;
1170
1171	err = igt_live_test_begin(&t, i915, __func__, "");
1172	if (err)
1173		goto out_free;
1174
1175	idx = 0;
1176	for_each_uabi_engine(engine, i915) {
1177		struct i915_vma *batch;
1178
1179		batch = recursive_batch(i915);
1180		if (IS_ERR(batch)) {
1181			err = PTR_ERR(batch);
1182			pr_err("%s: Unable to create batch for %s, err=%d\n",
1183			       __func__, engine->name, err);
1184			goto out_free;
1185		}
1186
1187		i915_vma_lock(batch);
1188		request[idx] = intel_engine_create_kernel_request(engine);
1189		if (IS_ERR(request[idx])) {
1190			err = PTR_ERR(request[idx]);
1191			pr_err("%s: Request allocation failed for %s with err=%d\n",
1192			       __func__, engine->name, err);
1193			goto out_unlock;
1194		}
1195
1196		if (prev) {
1197			err = i915_request_await_dma_fence(request[idx],
1198							   &prev->fence);
1199			if (err) {
1200				i915_request_add(request[idx]);
1201				pr_err("%s: Request await failed for %s with err=%d\n",
1202				       __func__, engine->name, err);
1203				goto out_unlock;
1204			}
1205		}
1206
1207		err = i915_request_await_object(request[idx],
1208						batch->obj, false);
1209		if (err == 0)
1210			err = i915_vma_move_to_active(batch, request[idx], 0);
1211		GEM_BUG_ON(err);
1212
1213		err = engine->emit_bb_start(request[idx],
1214					    batch->node.start,
1215					    batch->node.size,
1216					    0);
1217		GEM_BUG_ON(err);
1218		request[idx]->batch = batch;
1219
1220		i915_request_get(request[idx]);
1221		i915_request_add(request[idx]);
1222
1223		prev = request[idx];
1224		idx++;
1225
1226out_unlock:
1227		i915_vma_unlock(batch);
1228		if (err)
1229			goto out_request;
1230	}
1231
1232	idx = 0;
1233	for_each_uabi_engine(engine, i915) {
1234		long timeout;
1235
1236		if (i915_request_completed(request[idx])) {
1237			pr_err("%s(%s): request completed too early!\n",
1238			       __func__, engine->name);
1239			err = -EINVAL;
1240			goto out_request;
1241		}
1242
1243		err = recursive_batch_resolve(request[idx]->batch);
1244		if (err) {
1245			pr_err("%s: failed to resolve batch, err=%d\n",
1246			       __func__, err);
1247			goto out_request;
1248		}
1249
1250		timeout = i915_request_wait(request[idx], 0,
1251					    MAX_SCHEDULE_TIMEOUT);
1252		if (timeout < 0) {
1253			err = timeout;
1254			pr_err("%s: error waiting for request on %s, err=%d\n",
1255			       __func__, engine->name, err);
1256			goto out_request;
1257		}
1258
1259		GEM_BUG_ON(!i915_request_completed(request[idx]));
1260		idx++;
1261	}
1262
1263	err = igt_live_test_end(&t);
1264
1265out_request:
1266	idx = 0;
1267	for_each_uabi_engine(engine, i915) {
1268		u32 *cmd;
1269
1270		if (!request[idx])
1271			break;
1272
1273		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1274						       I915_MAP_WC);
1275		if (!IS_ERR(cmd)) {
1276			*cmd = MI_BATCH_BUFFER_END;
1277
1278			__i915_gem_object_flush_map(request[idx]->batch->obj,
1279						    0, sizeof(*cmd));
1280			i915_gem_object_unpin_map(request[idx]->batch->obj);
1281
1282			intel_gt_chipset_flush(engine->gt);
1283		}
1284
1285		i915_vma_put(request[idx]->batch);
1286		i915_request_put(request[idx]);
1287		idx++;
1288	}
1289out_free:
1290	kfree(request);
1291	return err;
1292}
1293
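/* Per-engine thread: submit one request at a time and wait for each. */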
1294static int __live_parallel_engine1(void *arg)
1295{
1296	struct intel_engine_cs *engine = arg;
1297	IGT_TIMEOUT(end_time);
1298	unsigned long count;
1299	int err = 0;
1300
1301	count = 0;
1302	intel_engine_pm_get(engine);
1303	do {
1304		struct i915_request *rq;
1305
1306		rq = i915_request_create(engine->kernel_context);
1307		if (IS_ERR(rq)) {
1308			err = PTR_ERR(rq);
1309			break;
1310		}
1311
1312		i915_request_get(rq);
1313		i915_request_add(rq);
1314
1315		err = 0;
1316		if (i915_request_wait(rq, 0, HZ / 5) < 0)
1317			err = -ETIME;
1318		i915_request_put(rq);
1319		if (err)
1320			break;
1321
1322		count++;
1323	} while (!__igt_timeout(end_time, NULL));
1324	intel_engine_pm_put(engine);
1325
1326	pr_info("%s: %lu request + sync\n", engine->name, count);
1327	return err;
1328}
1329
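/* Per-engine thread: submit requests back-to-back without waiting. */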
1330static int __live_parallel_engineN(void *arg)
1331{
1332	struct intel_engine_cs *engine = arg;
1333	IGT_TIMEOUT(end_time);
1334	unsigned long count;
1335	int err = 0;
1336
1337	count = 0;
1338	intel_engine_pm_get(engine);
1339	do {
1340		struct i915_request *rq;
1341
1342		rq = i915_request_create(engine->kernel_context);
1343		if (IS_ERR(rq)) {
1344			err = PTR_ERR(rq);
1345			break;
1346		}
1347
1348		i915_request_add(rq);
1349		count++;
1350	} while (!__igt_timeout(end_time, NULL));
1351	intel_engine_pm_put(engine);
1352
1353	pr_info("%s: %lu requests\n", engine->name, count);
1354	return err;
1355}
1356
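/*
 * Rendezvous helper for __live_parallel_spin(): each thread decrements
 * i915->selftest.counter, and the last one to arrive wakes all the waiters.
 */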
1357static bool wake_all(struct drm_i915_private *i915)
1358{
1359	if (atomic_dec_and_test(&i915->selftest.counter)) {
1360		wake_up_var(&i915->selftest.counter);
1361		return true;
1362	}
1363
1364	return false;
1365}
1366
1367static int wait_for_all(struct drm_i915_private *i915)
1368{
1369	if (wake_all(i915))
1370		return 0;
1371
1372	if (wait_var_event_timeout(&i915->selftest.counter,
1373				   !atomic_read(&i915->selftest.counter),
1374				   i915_selftest.timeout_jiffies))
1375		return 0;
1376
1377	return -ETIME;
1378}
1379
1380static int __live_parallel_spin(void *arg)
1381{
1382	struct intel_engine_cs *engine = arg;
1383	struct igt_spinner spin;
1384	struct i915_request *rq;
1385	int err = 0;
1386
1387	/*
1388	 * Create a spinner running for eternity on each engine. If a second
1389	 * spinner is incorrectly placed on the same engine, it will not be
1390	 * able to start in time.
1391	 */
1392
1393	if (igt_spinner_init(&spin, engine->gt)) {
1394		wake_all(engine->i915);
1395		return -ENOMEM;
1396	}
1397
1398	intel_engine_pm_get(engine);
1399	rq = igt_spinner_create_request(&spin,
1400					engine->kernel_context,
1401					MI_NOOP); /* no preemption */
1402	intel_engine_pm_put(engine);
1403	if (IS_ERR(rq)) {
1404		err = PTR_ERR(rq);
1405		if (err == -ENODEV)
1406			err = 0;
1407		wake_all(engine->i915);
1408		goto out_spin;
1409	}
1410
1411	i915_request_get(rq);
1412	i915_request_add(rq);
1413	if (igt_wait_for_spinner(&spin, rq)) {
1414		/* Occupy this engine for the whole test */
1415		err = wait_for_all(engine->i915);
1416	} else {
1417		pr_err("Failed to start spinner on %s\n", engine->name);
1418		err = -EINVAL;
1419	}
1420	igt_spinner_end(&spin);
1421
1422	if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1423		err = -EIO;
1424	i915_request_put(rq);
1425
1426out_spin:
1427	igt_spinner_fini(&spin);
1428	return err;
1429}
1430
1431static int live_parallel_engines(void *arg)
1432{
1433	struct drm_i915_private *i915 = arg;
1434	static int (* const func[])(void *arg) = {
1435		__live_parallel_engine1,
1436		__live_parallel_engineN,
1437		__live_parallel_spin,
1438		NULL,
1439	};
1440	const unsigned int nengines = num_uabi_engines(i915);
1441	struct intel_engine_cs *engine;
1442	int (* const *fn)(void *arg);
1443	struct task_struct **tsk;
1444	int err = 0;
1445
1446	/*
1447	 * Check we can submit requests to all engines concurrently. This
1448	 * tests that we load up the system maximally.
1449	 */
1450
1451	tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1452	if (!tsk)
1453		return -ENOMEM;
1454
1455	for (fn = func; !err && *fn; fn++) {
1456		char name[KSYM_NAME_LEN];
1457		struct igt_live_test t;
1458		unsigned int idx;
1459
1460		snprintf(name, sizeof(name), "%ps", *fn);
1461		err = igt_live_test_begin(&t, i915, __func__, name);
1462		if (err)
1463			break;
1464
1465		atomic_set(&i915->selftest.counter, nengines);
1466
1467		idx = 0;
1468		for_each_uabi_engine(engine, i915) {
1469			tsk[idx] = kthread_run(*fn, engine,
1470					       "igt/parallel:%s",
1471					       engine->name);
1472			if (IS_ERR(tsk[idx])) {
1473				err = PTR_ERR(tsk[idx]);
1474				break;
1475			}
1476			get_task_struct(tsk[idx++]);
1477		}
1478
1479		yield(); /* start all threads before we kthread_stop() */
1480
1481		idx = 0;
1482		for_each_uabi_engine(engine, i915) {
1483			int status;
1484
1485			if (IS_ERR(tsk[idx]))
1486				break;
1487
1488			status = kthread_stop(tsk[idx]);
1489			if (status && !err)
1490				err = status;
1491
1492			put_task_struct(tsk[idx++]);
1493		}
1494
1495		if (igt_live_test_end(&t))
1496			err = -EIO;
1497	}
1498
1499	kfree(tsk);
1500	return err;
1501}
1502
1503static int
1504max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1505{
1506	struct i915_request *rq;
1507	int ret;
1508
1509	/*
1510	 * Before execlists, all contexts share the same ringbuffer. With
 1511	 * execlists, each context/engine has a separate ringbuffer which is,
 1512	 * for the purposes of this test, inexhaustible.
1513	 *
1514	 * For the global ringbuffer though, we have to be very careful
1515	 * that we do not wrap while preventing the execution of requests
 1516	 * with an unsignaled fence.
1517	 */
1518	if (HAS_EXECLISTS(ctx->i915))
1519		return INT_MAX;
1520
1521	rq = igt_request_alloc(ctx, engine);
1522	if (IS_ERR(rq)) {
1523		ret = PTR_ERR(rq);
1524	} else {
1525		int sz;
1526
1527		ret = rq->ring->size - rq->reserved_space;
1528		i915_request_add(rq);
1529
1530		sz = rq->ring->emit - rq->head;
1531		if (sz < 0)
1532			sz += rq->ring->size;
1533		ret /= sz;
1534		ret /= 2; /* leave half spare, in case of emergency! */
1535	}
1536
1537	return ret;
1538}
1539
1540static int live_breadcrumbs_smoketest(void *arg)
1541{
1542	struct drm_i915_private *i915 = arg;
1543	const unsigned int nengines = num_uabi_engines(i915);
1544	const unsigned int ncpus = num_online_cpus();
1545	unsigned long num_waits, num_fences;
1546	struct intel_engine_cs *engine;
1547	struct task_struct **threads;
1548	struct igt_live_test live;
1549	intel_wakeref_t wakeref;
1550	struct smoketest *smoke;
1551	unsigned int n, idx;
1552	struct file *file;
1553	int ret = 0;
1554
1555	/*
1556	 * Smoketest our breadcrumb/signal handling for requests across multiple
1557	 * threads. A very simple test to only catch the most egregious of bugs.
1558	 * See __igt_breadcrumbs_smoketest();
1559	 *
1560	 * On real hardware this time.
1561	 */
1562
1563	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1564
1565	file = mock_file(i915);
1566	if (IS_ERR(file)) {
1567		ret = PTR_ERR(file);
1568		goto out_rpm;
1569	}
1570
1571	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1572	if (!smoke) {
1573		ret = -ENOMEM;
1574		goto out_file;
1575	}
1576
1577	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1578	if (!threads) {
1579		ret = -ENOMEM;
1580		goto out_smoke;
1581	}
1582
1583	smoke[0].request_alloc = __live_request_alloc;
1584	smoke[0].ncontexts = 64;
1585	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1586				    sizeof(*smoke[0].contexts),
1587				    GFP_KERNEL);
1588	if (!smoke[0].contexts) {
1589		ret = -ENOMEM;
1590		goto out_threads;
1591	}
1592
1593	for (n = 0; n < smoke[0].ncontexts; n++) {
1594		smoke[0].contexts[n] = live_context(i915, file);
1595		if (IS_ERR(smoke[0].contexts[n])) {
1596			ret = PTR_ERR(smoke[0].contexts[n]);
1597			goto out_contexts;
1598		}
1599	}
1600
1601	ret = igt_live_test_begin(&live, i915, __func__, "");
1602	if (ret)
1603		goto out_contexts;
1604
1605	idx = 0;
1606	for_each_uabi_engine(engine, i915) {
1607		smoke[idx] = smoke[0];
1608		smoke[idx].engine = engine;
1609		smoke[idx].max_batch =
1610			max_batches(smoke[0].contexts[0], engine);
1611		if (smoke[idx].max_batch < 0) {
1612			ret = smoke[idx].max_batch;
1613			goto out_flush;
1614		}
1615		/* One ring interleaved between requests from all cpus */
1616		smoke[idx].max_batch /= num_online_cpus() + 1;
1617		pr_debug("Limiting batches to %d requests on %s\n",
1618			 smoke[idx].max_batch, engine->name);
1619
1620		for (n = 0; n < ncpus; n++) {
1621			struct task_struct *tsk;
1622
1623			tsk = kthread_run(__igt_breadcrumbs_smoketest,
1624					  &smoke[idx], "igt/%d.%d", idx, n);
1625			if (IS_ERR(tsk)) {
1626				ret = PTR_ERR(tsk);
1627				goto out_flush;
1628			}
1629
1630			get_task_struct(tsk);
1631			threads[idx * ncpus + n] = tsk;
1632		}
1633
1634		idx++;
1635	}
1636
1637	yield(); /* start all threads before we begin */
1638	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1639
1640out_flush:
1641	idx = 0;
1642	num_waits = 0;
1643	num_fences = 0;
1644	for_each_uabi_engine(engine, i915) {
1645		for (n = 0; n < ncpus; n++) {
1646			struct task_struct *tsk = threads[idx * ncpus + n];
1647			int err;
1648
1649			if (!tsk)
1650				continue;
1651
1652			err = kthread_stop(tsk);
1653			if (err < 0 && !ret)
1654				ret = err;
1655
1656			put_task_struct(tsk);
1657		}
1658
1659		num_waits += atomic_long_read(&smoke[idx].num_waits);
1660		num_fences += atomic_long_read(&smoke[idx].num_fences);
1661		idx++;
1662	}
1663	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1664		num_waits, num_fences, idx, ncpus);
1665
1666	ret = igt_live_test_end(&live) ?: ret;
1667out_contexts:
1668	kfree(smoke[0].contexts);
1669out_threads:
1670	kfree(threads);
1671out_smoke:
1672	kfree(smoke);
1673out_file:
1674	fput(file);
1675out_rpm:
1676	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1677
1678	return ret;
1679}
1680
1681int i915_request_live_selftests(struct drm_i915_private *i915)
1682{
1683	static const struct i915_subtest tests[] = {
1684		SUBTEST(live_nop_request),
1685		SUBTEST(live_all_engines),
1686		SUBTEST(live_sequential_engines),
1687		SUBTEST(live_parallel_engines),
1688		SUBTEST(live_empty_request),
1689		SUBTEST(live_cancel_request),
1690		SUBTEST(live_breadcrumbs_smoketest),
1691	};
1692
1693	if (intel_gt_is_wedged(&i915->gt))
1694		return 0;
1695
1696	return i915_subtests(tests, i915);
1697}
1698
1699static int switch_to_kernel_sync(struct intel_context *ce, int err)
1700{
1701	struct i915_request *rq;
1702	struct dma_fence *fence;
1703
1704	rq = intel_engine_create_kernel_request(ce->engine);
1705	if (IS_ERR(rq))
1706		return PTR_ERR(rq);
1707
1708	fence = i915_active_fence_get(&ce->timeline->last_request);
1709	if (fence) {
1710		i915_request_await_dma_fence(rq, fence);
1711		dma_fence_put(fence);
1712	}
1713
1714	rq = i915_request_get(rq);
1715	i915_request_add(rq);
1716	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1717		err = -ETIME;
1718	i915_request_put(rq);
1719
1720	while (!err && !intel_engine_is_idle(ce->engine))
1721		intel_engine_flush_submission(ce->engine);
1722
1723	return err;
1724}
1725
1726struct perf_stats {
1727	struct intel_engine_cs *engine;
1728	unsigned long count;
1729	ktime_t time;
1730	ktime_t busy;
1731	u64 runtime;
1732};
1733
1734struct perf_series {
1735	struct drm_i915_private *i915;
1736	unsigned int nengines;
1737	struct intel_context *ce[];
1738};
1739
1740static int cmp_u32(const void *A, const void *B)
1741{
1742	const u32 *a = A, *b = B;
1743
1744	return *a - *b;
1745}
1746
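/*
 * Reduce TF_COUNT samples to a single value: sort them and return twice the
 * median plus its two neighbours, i.e. a weighted sum scaled by 1 << TF_BIAS
 * which cycles_to_ns() later divides back out.
 */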
1747static u32 trifilter(u32 *a)
1748{
1749	u64 sum;
1750
1751#define TF_COUNT 5
1752	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1753
1754	sum = mul_u32_u32(a[2], 2);
1755	sum += a[1];
1756	sum += a[3];
1757
1758	GEM_BUG_ON(sum > U32_MAX);
1759	return sum;
1760#define TF_BIAS 2
1761}
1762
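/*
 * Convert GT clock cycles to nanoseconds, undoing the 1 << TF_BIAS scaling
 * applied by trifilter().
 */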
1763static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1764{
1765	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1766
1767	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1768}
1769
1770static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1771{
1772	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1773	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1774	*cs++ = offset;
1775	*cs++ = 0;
1776
1777	return cs;
1778}
1779
1780static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1781{
1782	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1783	*cs++ = offset;
1784	*cs++ = 0;
1785	*cs++ = value;
1786
1787	return cs;
1788}
1789
1790static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1791{
1792	*cs++ = MI_SEMAPHORE_WAIT |
1793		MI_SEMAPHORE_GLOBAL_GTT |
1794		MI_SEMAPHORE_POLL |
1795		mode;
1796	*cs++ = value;
1797	*cs++ = offset;
1798	*cs++ = 0;
1799
1800	return cs;
1801}
1802
1803static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1804{
1805	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1806}
1807
1808static void semaphore_set(u32 *sema, u32 value)
1809{
1810	WRITE_ONCE(*sema, value);
1811	wmb(); /* flush the update to the cache, and beyond */
1812}
1813
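/*
 * Grab a scratch area in the engine's status page (HWSP) and clear it; the
 * measurement batches below use it for semaphores and timestamps.
 */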
1814static u32 *hwsp_scratch(const struct intel_context *ce)
1815{
1816	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1817}
1818
1819static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1820{
1821	return (i915_ggtt_offset(ce->engine->status_page.vma) +
1822		offset_in_page(dw));
1823}
1824
1825static int measure_semaphore_response(struct intel_context *ce)
1826{
1827	u32 *sema = hwsp_scratch(ce);
1828	const u32 offset = hwsp_offset(ce, sema);
1829	u32 elapsed[TF_COUNT], cycles;
1830	struct i915_request *rq;
1831	u32 *cs;
1832	int err;
1833	int i;
1834
1835	/*
1836	 * Measure how many cycles it takes for the HW to detect the change
1837	 * in a semaphore value.
1838	 *
1839	 *    A: read CS_TIMESTAMP from CPU
1840	 *    poke semaphore
1841	 *    B: read CS_TIMESTAMP on GPU
1842	 *
1843	 * Semaphore latency: B - A
1844	 */
1845
1846	semaphore_set(sema, -1);
1847
1848	rq = i915_request_create(ce);
1849	if (IS_ERR(rq))
1850		return PTR_ERR(rq);
1851
1852	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1853	if (IS_ERR(cs)) {
1854		i915_request_add(rq);
1855		err = PTR_ERR(cs);
1856		goto err;
1857	}
1858
1859	cs = emit_store_dw(cs, offset, 0);
1860	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1861		cs = emit_semaphore_poll_until(cs, offset, i);
1862		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1863		cs = emit_store_dw(cs, offset, 0);
1864	}
1865
1866	intel_ring_advance(rq, cs);
1867	i915_request_add(rq);
1868
1869	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1870		err = -EIO;
1871		goto err;
1872	}
1873
1874	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1875		preempt_disable();
1876		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1877		semaphore_set(sema, i);
1878		preempt_enable();
1879
1880		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1881			err = -EIO;
1882			goto err;
1883		}
1884
1885		elapsed[i - 1] = sema[i] - cycles;
1886	}
1887
1888	cycles = trifilter(elapsed);
1889	pr_info("%s: semaphore response %d cycles, %lluns\n",
1890		ce->engine->name, cycles >> TF_BIAS,
1891		cycles_to_ns(ce->engine, cycles));
1892
1893	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1894
1895err:
1896	intel_gt_set_wedged(ce->engine->gt);
1897	return err;
1898}
1899
1900static int measure_idle_dispatch(struct intel_context *ce)
1901{
1902	u32 *sema = hwsp_scratch(ce);
1903	const u32 offset = hwsp_offset(ce, sema);
1904	u32 elapsed[TF_COUNT], cycles;
1905	u32 *cs;
1906	int err;
1907	int i;
1908
1909	/*
1910	 * Measure how long it takes for us to submit a request while the
1911	 * engine is idle, but is resting in our context.
1912	 *
1913	 *    A: read CS_TIMESTAMP from CPU
1914	 *    submit request
1915	 *    B: read CS_TIMESTAMP on GPU
1916	 *
1917	 * Submission latency: B - A
1918	 */
1919
1920	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1921		struct i915_request *rq;
1922
1923		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1924		if (err)
1925			return err;
1926
1927		rq = i915_request_create(ce);
1928		if (IS_ERR(rq)) {
1929			err = PTR_ERR(rq);
1930			goto err;
1931		}
1932
1933		cs = intel_ring_begin(rq, 4);
1934		if (IS_ERR(cs)) {
1935			i915_request_add(rq);
1936			err = PTR_ERR(cs);
1937			goto err;
1938		}
1939
1940		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1941
1942		intel_ring_advance(rq, cs);
1943
1944		preempt_disable();
1945		local_bh_disable();
1946		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1947		i915_request_add(rq);
1948		local_bh_enable();
1949		preempt_enable();
1950	}
1951
1952	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1953	if (err)
1954		goto err;
1955
1956	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1957		elapsed[i] = sema[i] - elapsed[i];
1958
1959	cycles = trifilter(elapsed);
1960	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1961		ce->engine->name, cycles >> TF_BIAS,
1962		cycles_to_ns(ce->engine, cycles));
1963
1964	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1965
1966err:
1967	intel_gt_set_wedged(ce->engine->gt);
1968	return err;
1969}
1970
1971static int measure_busy_dispatch(struct intel_context *ce)
1972{
1973	u32 *sema = hwsp_scratch(ce);
1974	const u32 offset = hwsp_offset(ce, sema);
1975	u32 elapsed[TF_COUNT + 1], cycles;
1976	u32 *cs;
1977	int err;
1978	int i;
1979
1980	/*
1981	 * Measure how long it takes for us to submit a request while the
1982	 * engine is busy, polling on a semaphore in our context. With
1983	 * direct submission, this will include the cost of a lite restore.
1984	 *
1985	 *    A: read CS_TIMESTAMP from CPU
1986	 *    submit request
1987	 *    B: read CS_TIMESTAMP on GPU
1988	 *
1989	 * Submission latency: B - A
1990	 */
1991
1992	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1993		struct i915_request *rq;
1994
1995		rq = i915_request_create(ce);
1996		if (IS_ERR(rq)) {
1997			err = PTR_ERR(rq);
1998			goto err;
1999		}
2000
2001		cs = intel_ring_begin(rq, 12);
2002		if (IS_ERR(cs)) {
2003			i915_request_add(rq);
2004			err = PTR_ERR(cs);
2005			goto err;
2006		}
2007
2008		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2009		cs = emit_semaphore_poll_until(cs, offset, i);
2010		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2011
2012		intel_ring_advance(rq, cs);
2013
2014		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2015			err = -EIO;
2016			goto err;
2017		}
2018
2019		preempt_disable();
2020		local_bh_disable();
2021		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2022		i915_request_add(rq);
2023		local_bh_enable();
2024		semaphore_set(sema, i - 1);
2025		preempt_enable();
2026	}
2027
2028	wait_for(READ_ONCE(sema[i - 1]), 500);
2029	semaphore_set(sema, i - 1);
2030
2031	for (i = 1; i <= TF_COUNT; i++) {
2032		GEM_BUG_ON(sema[i] == -1);
2033		elapsed[i - 1] = sema[i] - elapsed[i];
2034	}
2035
2036	cycles = trifilter(elapsed);
2037	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2038		ce->engine->name, cycles >> TF_BIAS,
2039		cycles_to_ns(ce->engine, cycles));
2040
2041	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2042
2043err:
2044	intel_gt_set_wedged(ce->engine->gt);
2045	return err;
2046}
2047
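/*
 * Keep the engine busy with a kernel-context request that polls a semaphore
 * in the status page; callers release it later by updating the semaphore
 * from the CPU with semaphore_set().
 */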
2048static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2049{
2050	const u32 offset =
2051		i915_ggtt_offset(engine->status_page.vma) +
2052		offset_in_page(sema);
2053	struct i915_request *rq;
2054	u32 *cs;
2055
2056	rq = i915_request_create(engine->kernel_context);
2057	if (IS_ERR(rq))
2058		return PTR_ERR(rq);
2059
2060	cs = intel_ring_begin(rq, 4);
2061	if (IS_ERR(cs)) {
2062		i915_request_add(rq);
2063		return PTR_ERR(cs);
2064	}
2065
2066	cs = emit_semaphore_poll(cs, mode, value, offset);
2067
2068	intel_ring_advance(rq, cs);
2069	i915_request_add(rq);
2070
2071	return 0;
2072}
2073
2074static int measure_inter_request(struct intel_context *ce)
2075{
2076	u32 *sema = hwsp_scratch(ce);
2077	const u32 offset = hwsp_offset(ce, sema);
2078	u32 elapsed[TF_COUNT + 1], cycles;
2079	struct i915_sw_fence *submit;
2080	int i, err;
2081
2082	/*
2083	 * Measure how long it takes to advance from one request into the
2084	 * next. Between each request we flush the GPU caches to memory,
2085	 * update the breadcrumbs, and then invalidate those caches.
2086	 * We queue up all the requests to be submitted in one batch so
2087	 * it should be one set of contiguous measurements.
2088	 *
2089	 *    A: read CS_TIMESTAMP on GPU
2090	 *    advance request
2091	 *    B: read CS_TIMESTAMP on GPU
2092	 *
2093	 * Request latency: B - A
2094	 */
2095
2096	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2097	if (err)
2098		return err;
2099
2100	submit = heap_fence_create(GFP_KERNEL);
2101	if (!submit) {
2102		semaphore_set(sema, 1);
2103		return -ENOMEM;
2104	}
2105
2106	intel_engine_flush_submission(ce->engine);
2107	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2108		struct i915_request *rq;
2109		u32 *cs;
2110
2111		rq = i915_request_create(ce);
2112		if (IS_ERR(rq)) {
2113			err = PTR_ERR(rq);
2114			goto err_submit;
2115		}
2116
2117		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2118						       submit,
2119						       GFP_KERNEL);
2120		if (err < 0) {
2121			i915_request_add(rq);
2122			goto err_submit;
2123		}
2124
2125		cs = intel_ring_begin(rq, 4);
2126		if (IS_ERR(cs)) {
2127			i915_request_add(rq);
2128			err = PTR_ERR(cs);
2129			goto err_submit;
2130		}
2131
2132		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2133
2134		intel_ring_advance(rq, cs);
2135		i915_request_add(rq);
2136	}
2137	i915_sw_fence_commit(submit);
2138	intel_engine_flush_submission(ce->engine);
2139	heap_fence_put(submit);
2140
2141	semaphore_set(sema, 1);
2142	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2143	if (err)
2144		goto err;
2145
2146	for (i = 1; i <= TF_COUNT; i++)
2147		elapsed[i - 1] = sema[i + 1] - sema[i];
2148
2149	cycles = trifilter(elapsed);
2150	pr_info("%s: inter-request latency %d cycles, %lluns\n",
2151		ce->engine->name, cycles >> TF_BIAS,
2152		cycles_to_ns(ce->engine, cycles));
2153
2154	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2155
2156err_submit:
2157	i915_sw_fence_commit(submit);
2158	heap_fence_put(submit);
2159	semaphore_set(sema, 1);
2160err:
2161	intel_gt_set_wedged(ce->engine->gt);
2162	return err;
2163}
2164
2165static int measure_context_switch(struct intel_context *ce)
2166{
2167	u32 *sema = hwsp_scratch(ce);
2168	const u32 offset = hwsp_offset(ce, sema);
2169	struct i915_request *fence = NULL;
2170	u32 elapsed[TF_COUNT + 1], cycles;
2171	int i, j, err;
2172	u32 *cs;
2173
2174	/*
2175	 * Measure how long it takes to advance from one request in one
2176	 * context to a request in another context. This allows us to
2177	 * measure how long the context save/restore takes, along with all
2178	 * the inter-context setup we require.
2179	 *
2180	 *    A: read CS_TIMESTAMP on GPU
2181	 *    switch context
2182	 *    B: read CS_TIMESTAMP on GPU
2183	 *
2184	 * Context switch latency: B - A
2185	 */
2186
2187	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2188	if (err)
2189		return err;
2190
2191	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2192		struct intel_context *arr[] = {
2193			ce, ce->engine->kernel_context
2194		};
2195		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2196
2197		for (j = 0; j < ARRAY_SIZE(arr); j++) {
2198			struct i915_request *rq;
2199
2200			rq = i915_request_create(arr[j]);
2201			if (IS_ERR(rq)) {
2202				err = PTR_ERR(rq);
2203				goto err_fence;
2204			}
2205
2206			if (fence) {
2207				err = i915_request_await_dma_fence(rq,
2208								   &fence->fence);
2209				if (err) {
2210					i915_request_add(rq);
2211					goto err_fence;
2212				}
2213			}
2214
2215			cs = intel_ring_begin(rq, 4);
2216			if (IS_ERR(cs)) {
2217				i915_request_add(rq);
2218				err = PTR_ERR(cs);
2219				goto err_fence;
2220			}
2221
2222			cs = emit_timestamp_store(cs, ce, addr);
2223			addr += sizeof(u32);
2224
2225			intel_ring_advance(rq, cs);
2226
2227			i915_request_put(fence);
2228			fence = i915_request_get(rq);
2229
2230			i915_request_add(rq);
2231		}
2232	}
2233	i915_request_put(fence);
2234	intel_engine_flush_submission(ce->engine);
2235
2236	semaphore_set(sema, 1);
2237	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2238	if (err)
2239		goto err;
2240
2241	for (i = 1; i <= TF_COUNT; i++)
2242		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2243
2244	cycles = trifilter(elapsed);
2245	pr_info("%s: context switch latency %d cycles, %lluns\n",
2246		ce->engine->name, cycles >> TF_BIAS,
2247		cycles_to_ns(ce->engine, cycles));
2248
2249	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2250
2251err_fence:
2252	i915_request_put(fence);
2253	semaphore_set(sema, 1);
2254err:
2255	intel_gt_set_wedged(ce->engine->gt);
2256	return err;
2257}
2258
2259static int measure_preemption(struct intel_context *ce)
2260{
2261	u32 *sema = hwsp_scratch(ce);
2262	const u32 offset = hwsp_offset(ce, sema);
2263	u32 elapsed[TF_COUNT], cycles;
2264	u32 *cs;
2265	int err;
2266	int i;
2267
2268	/*
2269	 * We measure two latencies while triggering preemption. The first
2270	 * latency is how long it takes for us to submit a preempting request.
2271	 * The second latency is how long it takes for us to return from the
2272	 * preemption back to the original context.
2273	 *
2274	 *    A: read CS_TIMESTAMP from CPU
2275	 *    submit preemption
2276	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
2277	 *    context switch
2278	 *    C: read CS_TIMESTAMP on GPU (in original context)
2279	 *
2280	 * Preemption dispatch latency: B - A
2281	 * Preemption switch latency: C - B
2282	 */
2283
2284	if (!intel_engine_has_preemption(ce->engine))
2285		return 0;
2286
2287	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2288		u32 addr = offset + 2 * i * sizeof(u32);
2289		struct i915_request *rq;
2290
2291		rq = i915_request_create(ce);
2292		if (IS_ERR(rq)) {
2293			err = PTR_ERR(rq);
2294			goto err;
2295		}
2296
2297		cs = intel_ring_begin(rq, 12);
2298		if (IS_ERR(cs)) {
2299			i915_request_add(rq);
2300			err = PTR_ERR(cs);
2301			goto err;
2302		}
2303
2304		cs = emit_store_dw(cs, addr, -1);
2305		cs = emit_semaphore_poll_until(cs, offset, i);
2306		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2307
2308		intel_ring_advance(rq, cs);
2309		i915_request_add(rq);
2310
2311		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2312			err = -EIO;
2313			goto err;
2314		}
2315
2316		rq = i915_request_create(ce->engine->kernel_context);
2317		if (IS_ERR(rq)) {
2318			err = PTR_ERR(rq);
2319			goto err;
2320		}
2321
2322		cs = intel_ring_begin(rq, 8);
2323		if (IS_ERR(cs)) {
2324			i915_request_add(rq);
2325			err = PTR_ERR(cs);
2326			goto err;
2327		}
2328
2329		cs = emit_timestamp_store(cs, ce, addr);
2330		cs = emit_store_dw(cs, offset, i);
2331
2332		intel_ring_advance(rq, cs);
2333		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2334
2335		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2336		i915_request_add(rq);
2337	}
2338
2339	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2340		err = -EIO;
2341		goto err;
2342	}
2343
2344	for (i = 1; i <= TF_COUNT; i++)
2345		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2346
2347	cycles = trifilter(elapsed);
2348	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2349		ce->engine->name, cycles >> TF_BIAS,
2350		cycles_to_ns(ce->engine, cycles));
2351
2352	for (i = 1; i <= TF_COUNT; i++)
2353		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2354
2355	cycles = trifilter(elapsed);
2356	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2357		ce->engine->name, cycles >> TF_BIAS,
2358		cycles_to_ns(ce->engine, cycles));
2359
2360	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2361
2362err:
2363	intel_gt_set_wedged(ce->engine->gt);
2364	return err;
2365}
2366
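/* dma-fence callback used to note when the CPU has seen the request signal */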
2367struct signal_cb {
2368	struct dma_fence_cb base;
2369	bool seen;
2370};
2371
2372static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2373{
2374	struct signal_cb *s = container_of(cb, typeof(*s), base);
2375
2376	smp_store_mb(s->seen, true); /* be safe, be strong */
2377}
2378
2379static int measure_completion(struct intel_context *ce)
2380{
2381	u32 *sema = hwsp_scratch(ce);
2382	const u32 offset = hwsp_offset(ce, sema);
2383	u32 elapsed[TF_COUNT], cycles;
2384	u32 *cs;
2385	int err;
2386	int i;
2387
2388	/*
2389	 * Measure how long it takes for the signal (interrupt) sent
2390	 * from the GPU to be processed by the CPU.
2391	 *
2392	 *    A: read CS_TIMESTAMP on GPU
2393	 *    signal
2394	 *    B: read CS_TIMESTAMP from CPU
2395	 *
2396	 * Completion latency: B - A
2397	 */
2398
2399	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2400		struct signal_cb cb = { .seen = false };
2401		struct i915_request *rq;
2402
2403		rq = i915_request_create(ce);
2404		if (IS_ERR(rq)) {
2405			err = PTR_ERR(rq);
2406			goto err;
2407		}
2408
2409		cs = intel_ring_begin(rq, 12);
2410		if (IS_ERR(cs)) {
2411			i915_request_add(rq);
2412			err = PTR_ERR(cs);
2413			goto err;
2414		}
2415
2416		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2417		cs = emit_semaphore_poll_until(cs, offset, i);
2418		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2419
2420		intel_ring_advance(rq, cs);
2421
2422		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2423		i915_request_add(rq);
2424
2425		intel_engine_flush_submission(ce->engine);
2426		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2427			err = -EIO;
2428			goto err;
2429		}
2430
2431		preempt_disable();
2432		semaphore_set(sema, i);
2433		while (!READ_ONCE(cb.seen))
2434			cpu_relax();
2435
2436		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2437		preempt_enable();
2438	}
2439
2440	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2441	if (err)
2442		goto err;
2443
2444	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2445		GEM_BUG_ON(sema[i + 1] == -1);
2446		elapsed[i] = elapsed[i] - sema[i + 1];
2447	}
2448
2449	cycles = trifilter(elapsed);
2450	pr_info("%s: completion latency %d cycles, %lluns\n",
2451		ce->engine->name, cycles >> TF_BIAS,
2452		cycles_to_ns(ce->engine, cycles));
2453
2454	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2455
2456err:
2457	intel_gt_set_wedged(ce->engine->gt);
2458	return err;
2459}
2460
2461static void rps_pin(struct intel_gt *gt)
2462{
2463	/* Pin the frequency to max */
2464	atomic_inc(&gt->rps.num_waiters);
2465	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2466
2467	mutex_lock(&gt->rps.lock);
2468	intel_rps_set(&gt->rps, gt->rps.max_freq);
2469	mutex_unlock(&gt->rps.lock);
2470}
2471
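/* Release the forcewake reference and waiter count taken in rps_pin() */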
2472static void rps_unpin(struct intel_gt *gt)
2473{
2474	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2475	atomic_dec(&gt->rps.num_waiters);
2476}
2477
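/*
 * Measure the latency of the request submission paths on each uabi
 * engine: semaphore response, idle/busy dispatch, inter-request,
 * context switch, preemption and completion. The heartbeat is disabled
 * and the GPU frequency pinned to maximum so that the CS_TIMESTAMP
 * deltas are comparable between runs.
 */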
2478static int perf_request_latency(void *arg)
2479{
2480	struct drm_i915_private *i915 = arg;
2481	struct intel_engine_cs *engine;
2482	struct pm_qos_request qos;
2483	int err = 0;
2484
2485	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2486		return 0;
2487
2488	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2489
2490	for_each_uabi_engine(engine, i915) {
2491		struct intel_context *ce;
2492
2493		ce = intel_context_create(engine);
2494		if (IS_ERR(ce)) {
2495			err = PTR_ERR(ce);
2496			goto out;
2497		}
2498
2499		err = intel_context_pin(ce);
2500		if (err) {
2501			intel_context_put(ce);
2502			goto out;
2503		}
2504
2505		st_engine_heartbeat_disable(engine);
2506		rps_pin(engine->gt);
2507
2508		if (err == 0)
2509			err = measure_semaphore_response(ce);
2510		if (err == 0)
2511			err = measure_idle_dispatch(ce);
2512		if (err == 0)
2513			err = measure_busy_dispatch(ce);
2514		if (err == 0)
2515			err = measure_inter_request(ce);
2516		if (err == 0)
2517			err = measure_context_switch(ce);
2518		if (err == 0)
2519			err = measure_preemption(ce);
2520		if (err == 0)
2521			err = measure_completion(ce);
2522
2523		rps_unpin(engine->gt);
2524		st_engine_heartbeat_enable(engine);
2525
2526		intel_context_unpin(ce);
2527		intel_context_put(ce);
2528		if (err)
2529			goto out;
2530	}
2531
2532out:
2533	if (igt_flush_test(i915))
2534		err = -EIO;
2535
2536	cpu_latency_qos_remove_request(&qos);
2537	return err;
2538}
2539
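/* Submit a request to each engine in turn, waiting for each to complete */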
2540static int s_sync0(void *arg)
2541{
2542	struct perf_series *ps = arg;
2543	IGT_TIMEOUT(end_time);
2544	unsigned int idx = 0;
2545	int err = 0;
2546
2547	GEM_BUG_ON(!ps->nengines);
2548	do {
2549		struct i915_request *rq;
2550
2551		rq = i915_request_create(ps->ce[idx]);
2552		if (IS_ERR(rq)) {
2553			err = PTR_ERR(rq);
2554			break;
2555		}
2556
2557		i915_request_get(rq);
2558		i915_request_add(rq);
2559
2560		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2561			err = -ETIME;
2562		i915_request_put(rq);
2563		if (err)
2564			break;
2565
2566		if (++idx == ps->nengines)
2567			idx = 0;
2568	} while (!__igt_timeout(end_time, NULL));
2569
2570	return err;
2571}
2572
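/*
 * As s_sync0, but wait on the previous request instead of the current
 * one, so there is always one request in flight.
 */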
2573static int s_sync1(void *arg)
2574{
2575	struct perf_series *ps = arg;
2576	struct i915_request *prev = NULL;
2577	IGT_TIMEOUT(end_time);
2578	unsigned int idx = 0;
2579	int err = 0;
2580
2581	GEM_BUG_ON(!ps->nengines);
2582	do {
2583		struct i915_request *rq;
2584
2585		rq = i915_request_create(ps->ce[idx]);
2586		if (IS_ERR(rq)) {
2587			err = PTR_ERR(rq);
2588			break;
2589		}
2590
2591		i915_request_get(rq);
2592		i915_request_add(rq);
2593
2594		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2595			err = -ETIME;
2596		i915_request_put(prev);
2597		prev = rq;
2598		if (err)
2599			break;
2600
2601		if (++idx == ps->nengines)
2602			idx = 0;
2603	} while (!__igt_timeout(end_time, NULL));
2604	i915_request_put(prev);
2605
2606	return err;
2607}
2608
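/* Submit requests to each engine in turn as fast as possible, never waiting */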
2609static int s_many(void *arg)
2610{
2611	struct perf_series *ps = arg;
2612	IGT_TIMEOUT(end_time);
2613	unsigned int idx = 0;
2614
2615	GEM_BUG_ON(!ps->nengines);
2616	do {
2617		struct i915_request *rq;
2618
2619		rq = i915_request_create(ps->ce[idx]);
2620		if (IS_ERR(rq))
2621			return PTR_ERR(rq);
2622
2623		i915_request_add(rq);
2624
2625		if (++idx == ps->nengines)
2626			idx = 0;
2627	} while (!__igt_timeout(end_time, NULL));
2628
2629	return 0;
2630}
2631
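/*
 * Drive all engines from a single thread, one pinned context per
 * engine, and report per-engine busyness, runtime and walltime for
 * each submission pattern (s_sync0, s_sync1, s_many).
 */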
2632static int perf_series_engines(void *arg)
2633{
2634	struct drm_i915_private *i915 = arg;
2635	static int (* const func[])(void *arg) = {
2636		s_sync0,
2637		s_sync1,
2638		s_many,
2639		NULL,
2640	};
2641	const unsigned int nengines = num_uabi_engines(i915);
2642	struct intel_engine_cs *engine;
2643	int (* const *fn)(void *arg);
2644	struct pm_qos_request qos;
2645	struct perf_stats *stats;
2646	struct perf_series *ps;
2647	unsigned int idx;
2648	int err = 0;
2649
2650	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2651	if (!stats)
2652		return -ENOMEM;
2653
2654	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2655	if (!ps) {
2656		kfree(stats);
2657		return -ENOMEM;
2658	}
2659
2660	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2661
2662	ps->i915 = i915;
2663	ps->nengines = nengines;
2664
2665	idx = 0;
2666	for_each_uabi_engine(engine, i915) {
2667		struct intel_context *ce;
2668
2669		ce = intel_context_create(engine);
2670		if (IS_ERR(ce)) {
2671			err = PTR_ERR(ce);
2672			goto out;
2673		}
2674
2675		err = intel_context_pin(ce);
2676		if (err) {
2677			intel_context_put(ce);
2678			goto out;
2679		}
2680
2681		ps->ce[idx++] = ce;
2682	}
2683	GEM_BUG_ON(idx != ps->nengines);
2684
2685	for (fn = func; *fn && !err; fn++) {
2686		char name[KSYM_NAME_LEN];
2687		struct igt_live_test t;
2688
2689		snprintf(name, sizeof(name), "%ps", *fn);
2690		err = igt_live_test_begin(&t, i915, __func__, name);
2691		if (err)
2692			break;
2693
2694		for (idx = 0; idx < nengines; idx++) {
2695			struct perf_stats *p =
2696				memset(&stats[idx], 0, sizeof(stats[idx]));
2697			struct intel_context *ce = ps->ce[idx];
2698
2699			p->engine = ps->ce[idx]->engine;
2700			intel_engine_pm_get(p->engine);
2701
2702			if (intel_engine_supports_stats(p->engine))
2703				p->busy = intel_engine_get_busy_time(p->engine,
2704								     &p->time) + 1;
2705			else
2706				p->time = ktime_get();
2707			p->runtime = -intel_context_get_total_runtime_ns(ce);
2708		}
2709
2710		err = (*fn)(ps);
2711		if (igt_live_test_end(&t))
2712			err = -EIO;
2713
2714		for (idx = 0; idx < nengines; idx++) {
2715			struct perf_stats *p = &stats[idx];
2716			struct intel_context *ce = ps->ce[idx];
2717			int integer, decimal;
2718			u64 busy, dt, now;
2719
2720			if (p->busy)
2721				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2722									       &now),
2723						    p->busy - 1);
2724			else
2725				now = ktime_get();
2726			p->time = ktime_sub(now, p->time);
2727
2728			err = switch_to_kernel_sync(ce, err);
2729			p->runtime += intel_context_get_total_runtime_ns(ce);
2730			intel_engine_pm_put(p->engine);
2731
2732			busy = 100 * ktime_to_ns(p->busy);
2733			dt = ktime_to_ns(p->time);
2734			if (dt) {
2735				integer = div64_u64(busy, dt);
2736				busy -= integer * dt;
2737				decimal = div64_u64(100 * busy, dt);
2738			} else {
2739				integer = 0;
2740				decimal = 0;
2741			}
2742
2743			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2744				name, p->engine->name, ce->timeline->seqno,
2745				integer, decimal,
2746				div_u64(p->runtime, 1000 * 1000),
2747				div_u64(ktime_to_ns(p->time), 1000 * 1000));
2748		}
2749	}
2750
2751out:
2752	for (idx = 0; idx < nengines; idx++) {
2753		if (IS_ERR_OR_NULL(ps->ce[idx]))
2754			break;
2755
2756		intel_context_unpin(ps->ce[idx]);
2757		intel_context_put(ps->ce[idx]);
2758	}
2759	kfree(ps);
2760
2761	cpu_latency_qos_remove_request(&qos);
2762	kfree(stats);
2763	return err;
2764}
2765
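/* Per-engine thread: submit and synchronously wait for each request */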
2766static int p_sync0(void *arg)
2767{
2768	struct perf_stats *p = arg;
2769	struct intel_engine_cs *engine = p->engine;
2770	struct intel_context *ce;
2771	IGT_TIMEOUT(end_time);
2772	unsigned long count;
2773	bool busy;
2774	int err = 0;
2775
2776	ce = intel_context_create(engine);
2777	if (IS_ERR(ce))
2778		return PTR_ERR(ce);
2779
2780	err = intel_context_pin(ce);
2781	if (err) {
2782		intel_context_put(ce);
2783		return err;
2784	}
2785
2786	if (intel_engine_supports_stats(engine)) {
2787		p->busy = intel_engine_get_busy_time(engine, &p->time);
2788		busy = true;
2789	} else {
2790		p->time = ktime_get();
2791		busy = false;
2792	}
2793
2794	count = 0;
2795	do {
2796		struct i915_request *rq;
2797
2798		rq = i915_request_create(ce);
2799		if (IS_ERR(rq)) {
2800			err = PTR_ERR(rq);
2801			break;
2802		}
2803
2804		i915_request_get(rq);
2805		i915_request_add(rq);
2806
2807		err = 0;
2808		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2809			err = -ETIME;
2810		i915_request_put(rq);
2811		if (err)
2812			break;
2813
2814		count++;
2815	} while (!__igt_timeout(end_time, NULL));
2816
2817	if (busy) {
2818		ktime_t now;
2819
2820		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2821				    p->busy);
2822		p->time = ktime_sub(now, p->time);
2823	} else {
2824		p->time = ktime_sub(ktime_get(), p->time);
2825	}
2826
2827	err = switch_to_kernel_sync(ce, err);
2828	p->runtime = intel_context_get_total_runtime_ns(ce);
2829	p->count = count;
2830
2831	intel_context_unpin(ce);
2832	intel_context_put(ce);
2833	return err;
2834}
2835
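/* Per-engine thread: keep one request in flight by waiting on the previous */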
2836static int p_sync1(void *arg)
2837{
2838	struct perf_stats *p = arg;
2839	struct intel_engine_cs *engine = p->engine;
2840	struct i915_request *prev = NULL;
2841	struct intel_context *ce;
2842	IGT_TIMEOUT(end_time);
2843	unsigned long count;
2844	bool busy;
2845	int err = 0;
2846
2847	ce = intel_context_create(engine);
2848	if (IS_ERR(ce))
2849		return PTR_ERR(ce);
2850
2851	err = intel_context_pin(ce);
2852	if (err) {
2853		intel_context_put(ce);
2854		return err;
2855	}
2856
2857	if (intel_engine_supports_stats(engine)) {
2858		p->busy = intel_engine_get_busy_time(engine, &p->time);
2859		busy = true;
2860	} else {
2861		p->time = ktime_get();
2862		busy = false;
2863	}
2864
2865	count = 0;
2866	do {
2867		struct i915_request *rq;
2868
2869		rq = i915_request_create(ce);
2870		if (IS_ERR(rq)) {
2871			err = PTR_ERR(rq);
2872			break;
2873		}
2874
2875		i915_request_get(rq);
2876		i915_request_add(rq);
2877
2878		err = 0;
2879		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2880			err = -ETIME;
2881		i915_request_put(prev);
2882		prev = rq;
2883		if (err)
2884			break;
2885
2886		count++;
2887	} while (!__igt_timeout(end_time, NULL));
2888	i915_request_put(prev);
2889
2890	if (busy) {
2891		ktime_t now;
2892
2893		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2894				    p->busy);
2895		p->time = ktime_sub(now, p->time);
2896	} else {
2897		p->time = ktime_sub(ktime_get(), p->time);
2898	}
2899
2900	err = switch_to_kernel_sync(ce, err);
2901	p->runtime = intel_context_get_total_runtime_ns(ce);
2902	p->count = count;
2903
2904	intel_context_unpin(ce);
2905	intel_context_put(ce);
2906	return err;
2907}
2908
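/* Per-engine thread: submit requests as fast as possible, never waiting */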
2909static int p_many(void *arg)
2910{
2911	struct perf_stats *p = arg;
2912	struct intel_engine_cs *engine = p->engine;
2913	struct intel_context *ce;
2914	IGT_TIMEOUT(end_time);
2915	unsigned long count;
2916	int err = 0;
2917	bool busy;
2918
2919	ce = intel_context_create(engine);
2920	if (IS_ERR(ce))
2921		return PTR_ERR(ce);
2922
2923	err = intel_context_pin(ce);
2924	if (err) {
2925		intel_context_put(ce);
2926		return err;
2927	}
2928
2929	if (intel_engine_supports_stats(engine)) {
2930		p->busy = intel_engine_get_busy_time(engine, &p->time);
2931		busy = true;
2932	} else {
2933		p->time = ktime_get();
2934		busy = false;
2935	}
2936
2937	count = 0;
2938	do {
2939		struct i915_request *rq;
2940
2941		rq = i915_request_create(ce);
2942		if (IS_ERR(rq)) {
2943			err = PTR_ERR(rq);
2944			break;
2945		}
2946
2947		i915_request_add(rq);
2948		count++;
2949	} while (!__igt_timeout(end_time, NULL));
2950
2951	if (busy) {
2952		ktime_t now;
2953
2954		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2955				    p->busy);
2956		p->time = ktime_sub(now, p->time);
2957	} else {
2958		p->time = ktime_sub(ktime_get(), p->time);
2959	}
2960
2961	err = switch_to_kernel_sync(ce, err);
2962	p->runtime = intel_context_get_total_runtime_ns(ce);
2963	p->count = count;
2964
2965	intel_context_unpin(ce);
2966	intel_context_put(ce);
2967	return err;
2968}
2969
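/*
 * Run the p_sync0/p_sync1/p_many submission patterns on all engines
 * concurrently, one kthread per engine, and report the per-engine
 * throughput and busyness.
 */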
2970static int perf_parallel_engines(void *arg)
2971{
2972	struct drm_i915_private *i915 = arg;
2973	static int (* const func[])(void *arg) = {
2974		p_sync0,
2975		p_sync1,
2976		p_many,
2977		NULL,
2978	};
2979	const unsigned int nengines = num_uabi_engines(i915);
2980	struct intel_engine_cs *engine;
2981	int (* const *fn)(void *arg);
2982	struct pm_qos_request qos;
2983	struct {
2984		struct perf_stats p;
2985		struct task_struct *tsk;
2986	} *engines;
2987	int err = 0;
2988
2989	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2990	if (!engines)
2991		return -ENOMEM;
2992
2993	cpu_latency_qos_add_request(&qos, 0);
2994
2995	for (fn = func; *fn; fn++) {
2996		char name[KSYM_NAME_LEN];
2997		struct igt_live_test t;
2998		unsigned int idx;
2999
3000		snprintf(name, sizeof(name), "%ps", *fn);
3001		err = igt_live_test_begin(&t, i915, __func__, name);
3002		if (err)
3003			break;
3004
3005		atomic_set(&i915->selftest.counter, nengines);
3006
3007		idx = 0;
3008		for_each_uabi_engine(engine, i915) {
3009			intel_engine_pm_get(engine);
3010
3011			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3012			engines[idx].p.engine = engine;
3013
3014			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
3015						       "igt:%s", engine->name);
3016			if (IS_ERR(engines[idx].tsk)) {
3017				err = PTR_ERR(engines[idx].tsk);
3018				intel_engine_pm_put(engine);
3019				break;
3020			}
3021			get_task_struct(engines[idx++].tsk);
3022		}
3023
3024		yield(); /* start all threads before we kthread_stop() */
3025
3026		idx = 0;
3027		for_each_uabi_engine(engine, i915) {
3028			int status;
3029
3030			if (IS_ERR(engines[idx].tsk))
3031				break;
3032
3033			status = kthread_stop(engines[idx].tsk);
3034			if (status && !err)
3035				err = status;
3036
3037			intel_engine_pm_put(engine);
3038			put_task_struct(engines[idx++].tsk);
3039		}
3040
3041		if (igt_live_test_end(&t))
3042			err = -EIO;
3043		if (err)
3044			break;
3045
3046		idx = 0;
3047		for_each_uabi_engine(engine, i915) {
3048			struct perf_stats *p = &engines[idx].p;
3049			u64 busy = 100 * ktime_to_ns(p->busy);
3050			u64 dt = ktime_to_ns(p->time);
3051			int integer, decimal;
3052
3053			if (dt) {
3054				integer = div64_u64(busy, dt);
3055				busy -= integer * dt;
3056				decimal = div64_u64(100 * busy, dt);
3057			} else {
3058				integer = 0;
3059				decimal = 0;
3060			}
3061
3062			GEM_BUG_ON(engine != p->engine);
3063			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3064				name, engine->name, p->count, integer, decimal,
3065				div_u64(p->runtime, 1000 * 1000),
3066				div_u64(ktime_to_ns(p->time), 1000 * 1000));
3067			idx++;
3068		}
3069	}
3070
3071	cpu_latency_qos_remove_request(&qos);
3072	kfree(engines);
3073	return err;
3074}
3075
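/* Entry point for the request perf selftests; skipped if the GT is already wedged */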
3076int i915_request_perf_selftests(struct drm_i915_private *i915)
3077{
3078	static const struct i915_subtest tests[] = {
3079		SUBTEST(perf_request_latency),
3080		SUBTEST(perf_series_engines),
3081		SUBTEST(perf_parallel_engines),
3082	};
3083
3084	if (intel_gt_is_wedged(&i915->gt))
3085		return 0;
3086
3087	return i915_subtests(tests, i915);
3088}