   1/*
   2 * Copyright © 2016 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 */
  24
  25#include <linux/prime_numbers.h>
  26#include <linux/pm_qos.h>
  27#include <linux/sort.h>
  28
  29#include "gem/i915_gem_pm.h"
  30#include "gem/selftests/mock_context.h"
  31
  32#include "gt/intel_engine_heartbeat.h"
  33#include "gt/intel_engine_pm.h"
  34#include "gt/intel_engine_user.h"
  35#include "gt/intel_gt.h"
  36#include "gt/intel_gt_clock_utils.h"
  37#include "gt/intel_gt_requests.h"
  38#include "gt/selftest_engine_heartbeat.h"
  39
  40#include "i915_random.h"
  41#include "i915_selftest.h"
  42#include "igt_flush_test.h"
  43#include "igt_live_test.h"
  44#include "igt_spinner.h"
  45#include "lib_sw_fence.h"
  46
  47#include "mock_drm.h"
  48#include "mock_gem_device.h"
  49
  50static unsigned int num_uabi_engines(struct drm_i915_private *i915)
  51{
  52	struct intel_engine_cs *engine;
  53	unsigned int count;
  54
  55	count = 0;
  56	for_each_uabi_engine(engine, i915)
  57		count++;
  58
  59	return count;
  60}
  61
  62static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
  63{
  64	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
  65}
  66
  67static int igt_add_request(void *arg)
  68{
  69	struct drm_i915_private *i915 = arg;
  70	struct i915_request *request;
  71
  72	/* Basic preliminary test to create a request and let it loose! */
  73
  74	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
  75	if (!request)
  76		return -ENOMEM;
  77
  78	i915_request_add(request);
  79
  80	return 0;
  81}
  82
  83static int igt_wait_request(void *arg)
  84{
  85	const long T = HZ / 4;
  86	struct drm_i915_private *i915 = arg;
  87	struct i915_request *request;
  88	int err = -EINVAL;
  89
  90	/* Submit a request, then wait upon it */
  91
  92	request = mock_request(rcs0(i915)->kernel_context, T);
  93	if (!request)
  94		return -ENOMEM;
  95
  96	i915_request_get(request);
  97
  98	if (i915_request_wait(request, 0, 0) != -ETIME) {
  99		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
 100		goto out_request;
 101	}
 102
 103	if (i915_request_wait(request, 0, T) != -ETIME) {
 104		pr_err("request wait succeeded (expected timeout before submit!)\n");
 105		goto out_request;
 106	}
 107
 108	if (i915_request_completed(request)) {
 109		pr_err("request completed before submit!!\n");
 110		goto out_request;
 111	}
 112
 113	i915_request_add(request);
 114
 115	if (i915_request_wait(request, 0, 0) != -ETIME) {
 116		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
 117		goto out_request;
 118	}
 119
 120	if (i915_request_completed(request)) {
 121		pr_err("request completed immediately!\n");
 122		goto out_request;
 123	}
 124
 125	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
 126		pr_err("request wait succeeded (expected timeout!)\n");
 127		goto out_request;
 128	}
 129
 130	if (i915_request_wait(request, 0, T) == -ETIME) {
 131		pr_err("request wait timed out!\n");
 132		goto out_request;
 133	}
 134
 135	if (!i915_request_completed(request)) {
 136		pr_err("request not complete after waiting!\n");
 137		goto out_request;
 138	}
 139
 140	if (i915_request_wait(request, 0, T) == -ETIME) {
 141		pr_err("request wait timed out when already complete!\n");
 142		goto out_request;
 143	}
 144
 145	err = 0;
 146out_request:
 147	i915_request_put(request);
 148	mock_device_flush(i915);
 149	return err;
 150}
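/*
 * The checks above lean on the i915_request_wait() convention used
 * throughout this file: a zero timeout behaves as a busy query, -ETIME is
 * returned while the request is still outstanding, and a non-negative
 * value is returned once it has completed. A minimal sketch of that usage
 * (illustrative, not part of the selftest), for an already submitted @rq:
 *
 *	if (i915_request_wait(rq, 0, HZ) < 0)
 *		pr_err("request did not complete within 1s\n");
 */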
 151
 152static int igt_fence_wait(void *arg)
 153{
 154	const long T = HZ / 4;
 155	struct drm_i915_private *i915 = arg;
 156	struct i915_request *request;
 157	int err = -EINVAL;
 158
 159	/* Submit a request, treat it as a fence and wait upon it */
 160
 161	request = mock_request(rcs0(i915)->kernel_context, T);
 162	if (!request)
 163		return -ENOMEM;
 164
 165	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
 166		pr_err("fence wait success before submit (expected timeout)!\n");
 167		goto out;
 168	}
 169
 170	i915_request_add(request);
 171
 172	if (dma_fence_is_signaled(&request->fence)) {
 173		pr_err("fence signaled immediately!\n");
 174		goto out;
 175	}
 176
 177	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
 178		pr_err("fence wait success after submit (expected timeout)!\n");
 179		goto out;
 180	}
 181
 182	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
 183		pr_err("fence wait timed out (expected success)!\n");
 184		goto out;
 185	}
 186
 187	if (!dma_fence_is_signaled(&request->fence)) {
 188		pr_err("fence unsignaled after waiting!\n");
 189		goto out;
 190	}
 191
 192	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
 193		pr_err("fence wait timed out when complete (expected success)!\n");
 194		goto out;
 195	}
 196
 197	err = 0;
 198out:
 199	mock_device_flush(i915);
 200	return err;
 201}
 202
 203static int igt_request_rewind(void *arg)
 204{
 205	struct drm_i915_private *i915 = arg;
 206	struct i915_request *request, *vip;
 207	struct i915_gem_context *ctx[2];
 208	struct intel_context *ce;
 209	int err = -EINVAL;
 210
 211	ctx[0] = mock_context(i915, "A");
 212
 213	ce = i915_gem_context_get_engine(ctx[0], RCS0);
 214	GEM_BUG_ON(IS_ERR(ce));
 215	request = mock_request(ce, 2 * HZ);
 216	intel_context_put(ce);
 217	if (!request) {
 218		err = -ENOMEM;
 219		goto err_context_0;
 220	}
 221
 222	i915_request_get(request);
 223	i915_request_add(request);
 224
 225	ctx[1] = mock_context(i915, "B");
 226
 227	ce = i915_gem_context_get_engine(ctx[1], RCS0);
 228	GEM_BUG_ON(IS_ERR(ce));
 229	vip = mock_request(ce, 0);
 230	intel_context_put(ce);
 231	if (!vip) {
 232		err = -ENOMEM;
 233		goto err_context_1;
 234	}
 235
 236	/* Simulate preemption by manual reordering */
 237	if (!mock_cancel_request(request)) {
 238		pr_err("failed to cancel request (already executed)!\n");
 239		i915_request_add(vip);
 240		goto err_context_1;
 241	}
 242	i915_request_get(vip);
 243	i915_request_add(vip);
 244	rcu_read_lock();
 245	request->engine->submit_request(request);
 246	rcu_read_unlock();
 247
 248
 249	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
 250		pr_err("timed out waiting for high priority request\n");
 251		goto err;
 252	}
 253
 254	if (i915_request_completed(request)) {
 255		pr_err("low priority request already completed\n");
 256		goto err;
 257	}
 258
 259	err = 0;
 260err:
 261	i915_request_put(vip);
 262err_context_1:
 263	mock_context_close(ctx[1]);
 264	i915_request_put(request);
 265err_context_0:
 266	mock_context_close(ctx[0]);
 267	mock_device_flush(i915);
 268	return err;
 269}
 270
 271struct smoketest {
 272	struct intel_engine_cs *engine;
 273	struct i915_gem_context **contexts;
 274	atomic_long_t num_waits, num_fences;
 275	int ncontexts, max_batch;
 276	struct i915_request *(*request_alloc)(struct intel_context *ce);
 277};
 278
 279static struct i915_request *
 280__mock_request_alloc(struct intel_context *ce)
 281{
 282	return mock_request(ce, 0);
 283}
 284
 285static struct i915_request *
 286__live_request_alloc(struct intel_context *ce)
 287{
 288	return intel_context_create_request(ce);
 289}
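/*
 * The request_alloc hook in struct smoketest is what lets the stress body
 * below run unchanged against either backend: mock_breadcrumbs_smoketest()
 * plugs in __mock_request_alloc() for the mock device, while
 * live_breadcrumbs_smoketest() uses __live_request_alloc() on real
 * hardware. A minimal sketch of the mock setup (sizes are illustrative):
 *
 *	struct smoketest t = {
 *		.engine = rcs0(i915),
 *		.ncontexts = 16,
 *		.max_batch = 16,
 *		.request_alloc = __mock_request_alloc,
 *	};
 */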
 290
 291static int __igt_breadcrumbs_smoketest(void *arg)
 292{
 293	struct smoketest *t = arg;
 294	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
 295	const unsigned int total = 4 * t->ncontexts + 1;
 296	unsigned int num_waits = 0, num_fences = 0;
 297	struct i915_request **requests;
 298	I915_RND_STATE(prng);
 299	unsigned int *order;
 300	int err = 0;
 301
 302	/*
 303	 * A very simple test to catch the most egregious of list handling bugs.
 304	 *
 305	 * At its heart, we simply create oodles of requests running across
 306	 * multiple kthreads and enable signaling on them, for the sole purpose
 307	 * of stressing our breadcrumb handling. The only inspection we do is
 308	 * that the fences were marked as signaled.
 309	 */
 310
 311	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
 312	if (!requests)
 313		return -ENOMEM;
 314
 315	order = i915_random_order(total, &prng);
 316	if (!order) {
 317		err = -ENOMEM;
 318		goto out_requests;
 319	}
 320
 321	while (!kthread_should_stop()) {
 322		struct i915_sw_fence *submit, *wait;
 323		unsigned int n, count;
 324
 325		submit = heap_fence_create(GFP_KERNEL);
 326		if (!submit) {
 327			err = -ENOMEM;
 328			break;
 329		}
 330
 331		wait = heap_fence_create(GFP_KERNEL);
 332		if (!wait) {
 333			i915_sw_fence_commit(submit);
 334			heap_fence_put(submit);
 335			err = -ENOMEM;
 336			break;
 337		}
 338
 339		i915_random_reorder(order, total, &prng);
 340		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
 341
 342		for (n = 0; n < count; n++) {
 343			struct i915_gem_context *ctx =
 344				t->contexts[order[n] % t->ncontexts];
 345			struct i915_request *rq;
 346			struct intel_context *ce;
 347
 348			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
 349			GEM_BUG_ON(IS_ERR(ce));
 350			rq = t->request_alloc(ce);
 351			intel_context_put(ce);
 352			if (IS_ERR(rq)) {
 353				err = PTR_ERR(rq);
 354				count = n;
 355				break;
 356			}
 357
 358			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
 359							       submit,
 360							       GFP_KERNEL);
 361
 362			requests[n] = i915_request_get(rq);
 363			i915_request_add(rq);
 364
 365			if (err >= 0)
 366				err = i915_sw_fence_await_dma_fence(wait,
 367								    &rq->fence,
 368								    0,
 369								    GFP_KERNEL);
 370
 371			if (err < 0) {
 372				i915_request_put(rq);
 373				count = n;
 374				break;
 375			}
 376		}
 377
 378		i915_sw_fence_commit(submit);
 379		i915_sw_fence_commit(wait);
 380
 381		if (!wait_event_timeout(wait->wait,
 382					i915_sw_fence_done(wait),
 383					5 * HZ)) {
 384			struct i915_request *rq = requests[count - 1];
 385
 386			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
 387			       atomic_read(&wait->pending), count,
 388			       rq->fence.context, rq->fence.seqno,
 389			       t->engine->name);
 390			GEM_TRACE_DUMP();
 391
 392			intel_gt_set_wedged(t->engine->gt);
 393			GEM_BUG_ON(!i915_request_completed(rq));
 394			i915_sw_fence_wait(wait);
 395			err = -EIO;
 396		}
 397
 398		for (n = 0; n < count; n++) {
 399			struct i915_request *rq = requests[n];
 400
 401			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 402				      &rq->fence.flags)) {
 403				pr_err("%llu:%llu was not signaled!\n",
 404				       rq->fence.context, rq->fence.seqno);
 405				err = -EINVAL;
 406			}
 407
 408			i915_request_put(rq);
 409		}
 410
 411		heap_fence_put(wait);
 412		heap_fence_put(submit);
 413
 414		if (err < 0)
 415			break;
 416
 417		num_fences += count;
 418		num_waits++;
 419
 420		cond_resched();
 421	}
 422
 423	atomic_long_add(num_fences, &t->num_fences);
 424	atomic_long_add(num_waits, &t->num_waits);
 425
 426	kfree(order);
 427out_requests:
 428	kfree(requests);
 429	return err;
 430}
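/*
 * __igt_breadcrumbs_smoketest() is written as a kthread body: it loops
 * until kthread_should_stop() and accumulates its totals into the shared
 * struct smoketest. Both callers below follow the same pattern: spawn one
 * thread per CPU (per engine, in the live variant) with kthread_run(),
 * sleep for the selftest timeout, then kthread_stop() each thread and sum
 * the reported waits and fences.
 */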
 431
 432static int mock_breadcrumbs_smoketest(void *arg)
 433{
 434	struct drm_i915_private *i915 = arg;
 435	struct smoketest t = {
 436		.engine = rcs0(i915),
 437		.ncontexts = 1024,
 438		.max_batch = 1024,
 439		.request_alloc = __mock_request_alloc
 440	};
 441	unsigned int ncpus = num_online_cpus();
 442	struct task_struct **threads;
 443	unsigned int n;
 444	int ret = 0;
 445
 446	/*
 447	 * Smoketest our breadcrumb/signal handling for requests across multiple
 448	 * threads. A very simple test to only catch the most egregious of bugs.
 449	 * See __igt_breadcrumbs_smoketest();
 450	 */
 451
 452	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
 453	if (!threads)
 454		return -ENOMEM;
 455
 456	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
 457	if (!t.contexts) {
 458		ret = -ENOMEM;
 459		goto out_threads;
 460	}
 461
 462	for (n = 0; n < t.ncontexts; n++) {
 463		t.contexts[n] = mock_context(t.engine->i915, "mock");
 464		if (!t.contexts[n]) {
 465			ret = -ENOMEM;
 466			goto out_contexts;
 467		}
 468	}
 469
 470	for (n = 0; n < ncpus; n++) {
 471		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
 472					 &t, "igt/%d", n);
 473		if (IS_ERR(threads[n])) {
 474			ret = PTR_ERR(threads[n]);
 475			ncpus = n;
 476			break;
 477		}
 478
 479		get_task_struct(threads[n]);
 480	}
 481
 482	yield(); /* start all threads before we begin */
 483	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
 484
 485	for (n = 0; n < ncpus; n++) {
 486		int err;
 487
 488		err = kthread_stop(threads[n]);
 489		if (err < 0 && !ret)
 490			ret = err;
 491
 492		put_task_struct(threads[n]);
 493	}
  494	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
 495		atomic_long_read(&t.num_waits),
 496		atomic_long_read(&t.num_fences),
 497		ncpus);
 498
 499out_contexts:
 500	for (n = 0; n < t.ncontexts; n++) {
 501		if (!t.contexts[n])
 502			break;
 503		mock_context_close(t.contexts[n]);
 504	}
 505	kfree(t.contexts);
 506out_threads:
 507	kfree(threads);
 508	return ret;
 509}
 510
 511int i915_request_mock_selftests(void)
 512{
 513	static const struct i915_subtest tests[] = {
 514		SUBTEST(igt_add_request),
 515		SUBTEST(igt_wait_request),
 516		SUBTEST(igt_fence_wait),
 517		SUBTEST(igt_request_rewind),
 518		SUBTEST(mock_breadcrumbs_smoketest),
 519	};
 520	struct drm_i915_private *i915;
 521	intel_wakeref_t wakeref;
 522	int err = 0;
 523
 524	i915 = mock_gem_device();
 525	if (!i915)
 526		return -ENOMEM;
 527
 528	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
 529		err = i915_subtests(tests, i915);
 530
 531	mock_destroy_device(i915);
 532
 533	return err;
 534}
 535
 536static int live_nop_request(void *arg)
 537{
 538	struct drm_i915_private *i915 = arg;
 539	struct intel_engine_cs *engine;
 540	struct igt_live_test t;
 541	int err = -ENODEV;
 542
 543	/*
 544	 * Submit various sized batches of empty requests, to each engine
 545	 * (individually), and wait for the batch to complete. We can check
 546	 * the overhead of submitting requests to the hardware.
 547	 */
 548
 549	for_each_uabi_engine(engine, i915) {
 550		unsigned long n, prime;
 551		IGT_TIMEOUT(end_time);
 552		ktime_t times[2] = {};
 553
 554		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 555		if (err)
 556			return err;
 557
 558		intel_engine_pm_get(engine);
 559		for_each_prime_number_from(prime, 1, 8192) {
 560			struct i915_request *request = NULL;
 561
 562			times[1] = ktime_get_raw();
 563
 564			for (n = 0; n < prime; n++) {
 565				i915_request_put(request);
 566				request = i915_request_create(engine->kernel_context);
 567				if (IS_ERR(request))
 568					return PTR_ERR(request);
 569
 570				/*
 571				 * This space is left intentionally blank.
 572				 *
 573				 * We do not actually want to perform any
 574				 * action with this request, we just want
 575				 * to measure the latency in allocation
 576				 * and submission of our breadcrumbs -
 577				 * ensuring that the bare request is sufficient
 578				 * for the system to work (i.e. proper HEAD
 579				 * tracking of the rings, interrupt handling,
 580				 * etc). It also gives us the lowest bounds
 581				 * for latency.
 582				 */
 583
 584				i915_request_get(request);
 585				i915_request_add(request);
 586			}
 587			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
 588			i915_request_put(request);
 589
 590			times[1] = ktime_sub(ktime_get_raw(), times[1]);
 591			if (prime == 1)
 592				times[0] = times[1];
 593
 594			if (__igt_timeout(end_time, NULL))
 595				break;
 596		}
 597		intel_engine_pm_put(engine);
 598
 599		err = igt_live_test_end(&t);
 600		if (err)
 601			return err;
 602
 603		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
 604			engine->name,
 605			ktime_to_ns(times[0]),
 606			prime, div64_u64(ktime_to_ns(times[1]), prime));
 607	}
 608
 609	return err;
 610}
 611
 612static int __cancel_inactive(struct intel_engine_cs *engine)
 613{
 614	struct intel_context *ce;
 615	struct igt_spinner spin;
 616	struct i915_request *rq;
 617	int err = 0;
 618
 619	if (igt_spinner_init(&spin, engine->gt))
 620		return -ENOMEM;
 621
 622	ce = intel_context_create(engine);
 623	if (IS_ERR(ce)) {
 624		err = PTR_ERR(ce);
 625		goto out_spin;
 626	}
 627
 628	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
 629	if (IS_ERR(rq)) {
 630		err = PTR_ERR(rq);
 631		goto out_ce;
 632	}
 633
 634	pr_debug("%s: Cancelling inactive request\n", engine->name);
 635	i915_request_cancel(rq, -EINTR);
 636	i915_request_get(rq);
 637	i915_request_add(rq);
 638
 639	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 640		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 641
 642		pr_err("%s: Failed to cancel inactive request\n", engine->name);
 643		intel_engine_dump(engine, &p, "%s\n", engine->name);
 644		err = -ETIME;
 645		goto out_rq;
 646	}
 647
 648	if (rq->fence.error != -EINTR) {
 649		pr_err("%s: fence not cancelled (%u)\n",
 650		       engine->name, rq->fence.error);
 651		err = -EINVAL;
 652	}
 653
 654out_rq:
 655	i915_request_put(rq);
 656out_ce:
 657	intel_context_put(ce);
 658out_spin:
 659	igt_spinner_fini(&spin);
 660	if (err)
 661		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 662	return err;
 663}
 664
 665static int __cancel_active(struct intel_engine_cs *engine)
 666{
 667	struct intel_context *ce;
 668	struct igt_spinner spin;
 669	struct i915_request *rq;
 670	int err = 0;
 671
 672	if (igt_spinner_init(&spin, engine->gt))
 673		return -ENOMEM;
 674
 675	ce = intel_context_create(engine);
 676	if (IS_ERR(ce)) {
 677		err = PTR_ERR(ce);
 678		goto out_spin;
 679	}
 680
 681	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
 682	if (IS_ERR(rq)) {
 683		err = PTR_ERR(rq);
 684		goto out_ce;
 685	}
 686
 687	pr_debug("%s: Cancelling active request\n", engine->name);
 688	i915_request_get(rq);
 689	i915_request_add(rq);
 690	if (!igt_wait_for_spinner(&spin, rq)) {
 691		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 692
 693		pr_err("Failed to start spinner on %s\n", engine->name);
 694		intel_engine_dump(engine, &p, "%s\n", engine->name);
 695		err = -ETIME;
 696		goto out_rq;
 697	}
 698	i915_request_cancel(rq, -EINTR);
 699
 700	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 701		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 702
 703		pr_err("%s: Failed to cancel active request\n", engine->name);
 704		intel_engine_dump(engine, &p, "%s\n", engine->name);
 705		err = -ETIME;
 706		goto out_rq;
 707	}
 708
 709	if (rq->fence.error != -EINTR) {
 710		pr_err("%s: fence not cancelled (%u)\n",
 711		       engine->name, rq->fence.error);
 712		err = -EINVAL;
 713	}
 714
 715out_rq:
 716	i915_request_put(rq);
 717out_ce:
 718	intel_context_put(ce);
 719out_spin:
 720	igt_spinner_fini(&spin);
 721	if (err)
 722		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 723	return err;
 724}
 725
 726static int __cancel_completed(struct intel_engine_cs *engine)
 727{
 728	struct intel_context *ce;
 729	struct igt_spinner spin;
 730	struct i915_request *rq;
 731	int err = 0;
 732
 733	if (igt_spinner_init(&spin, engine->gt))
 734		return -ENOMEM;
 735
 736	ce = intel_context_create(engine);
 737	if (IS_ERR(ce)) {
 738		err = PTR_ERR(ce);
 739		goto out_spin;
 740	}
 741
 742	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
 743	if (IS_ERR(rq)) {
 744		err = PTR_ERR(rq);
 745		goto out_ce;
 746	}
 747	igt_spinner_end(&spin);
 748	i915_request_get(rq);
 749	i915_request_add(rq);
 750
 751	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 752		err = -ETIME;
 753		goto out_rq;
 754	}
 755
 756	pr_debug("%s: Cancelling completed request\n", engine->name);
 757	i915_request_cancel(rq, -EINTR);
 758	if (rq->fence.error) {
 759		pr_err("%s: fence not cancelled (%u)\n",
 760		       engine->name, rq->fence.error);
 761		err = -EINVAL;
 762	}
 763
 764out_rq:
 765	i915_request_put(rq);
 766out_ce:
 767	intel_context_put(ce);
 768out_spin:
 769	igt_spinner_fini(&spin);
 770	if (err)
 771		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 772	return err;
 773}
 774
 775static int live_cancel_request(void *arg)
 776{
 777	struct drm_i915_private *i915 = arg;
 778	struct intel_engine_cs *engine;
 779
 780	/*
 781	 * Check cancellation of requests. We expect to be able to immediately
 782	 * cancel active requests, even if they are currently on the GPU.
 783	 */
 784
 785	for_each_uabi_engine(engine, i915) {
 786		struct igt_live_test t;
 787		int err, err2;
 788
 789		if (!intel_engine_has_preemption(engine))
 790			continue;
 791
 792		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 793		if (err)
 794			return err;
 795
 796		err = __cancel_inactive(engine);
 797		if (err == 0)
 798			err = __cancel_active(engine);
 799		if (err == 0)
 800			err = __cancel_completed(engine);
 801
 802		err2 = igt_live_test_end(&t);
 803		if (err)
 804			return err;
 805		if (err2)
 806			return err2;
 807	}
 808
 809	return 0;
 810}
 811
 812static struct i915_vma *empty_batch(struct drm_i915_private *i915)
 813{
 814	struct drm_i915_gem_object *obj;
 815	struct i915_vma *vma;
 816	u32 *cmd;
 817	int err;
 818
 819	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
 820	if (IS_ERR(obj))
 821		return ERR_CAST(obj);
 822
 823	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
 824	if (IS_ERR(cmd)) {
 825		err = PTR_ERR(cmd);
 826		goto err;
 827	}
 828
 829	*cmd = MI_BATCH_BUFFER_END;
 830
 831	__i915_gem_object_flush_map(obj, 0, 64);
 832	i915_gem_object_unpin_map(obj);
 833
 834	intel_gt_chipset_flush(&i915->gt);
 835
 836	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
 837	if (IS_ERR(vma)) {
 838		err = PTR_ERR(vma);
 839		goto err;
 840	}
 841
 842	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
 843	if (err)
 844		goto err;
 845
  846	/* Force the wait now to avoid including it in the benchmark */
 847	err = i915_vma_sync(vma);
 848	if (err)
 849		goto err_pin;
 850
 851	return vma;
 852
 853err_pin:
 854	i915_vma_unpin(vma);
 855err:
 856	i915_gem_object_put(obj);
 857	return ERR_PTR(err);
 858}
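/*
 * The batch created above is a single MI_BATCH_BUFFER_END pinned into the
 * global GTT, so the same vma can be executed from any engine's kernel
 * context. The i915_vma_sync() keeps the cost of binding the vma out of
 * the latency numbers gathered by live_empty_request().
 */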
 859
 860static struct i915_request *
 861empty_request(struct intel_engine_cs *engine,
 862	      struct i915_vma *batch)
 863{
 864	struct i915_request *request;
 865	int err;
 866
 867	request = i915_request_create(engine->kernel_context);
 868	if (IS_ERR(request))
 869		return request;
 870
 871	err = engine->emit_bb_start(request,
 872				    batch->node.start,
 873				    batch->node.size,
 874				    I915_DISPATCH_SECURE);
 875	if (err)
 876		goto out_request;
 877
 878	i915_request_get(request);
 879out_request:
 880	i915_request_add(request);
 881	return err ? ERR_PTR(err) : request;
 882}
 883
 884static int live_empty_request(void *arg)
 885{
 886	struct drm_i915_private *i915 = arg;
 887	struct intel_engine_cs *engine;
 888	struct igt_live_test t;
 889	struct i915_vma *batch;
 890	int err = 0;
 891
 892	/*
 893	 * Submit various sized batches of empty requests, to each engine
 894	 * (individually), and wait for the batch to complete. We can check
 895	 * the overhead of submitting requests to the hardware.
 896	 */
 897
 898	batch = empty_batch(i915);
 899	if (IS_ERR(batch))
 900		return PTR_ERR(batch);
 901
 902	for_each_uabi_engine(engine, i915) {
 903		IGT_TIMEOUT(end_time);
 904		struct i915_request *request;
 905		unsigned long n, prime;
 906		ktime_t times[2] = {};
 907
 908		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 909		if (err)
 910			goto out_batch;
 911
 912		intel_engine_pm_get(engine);
 913
 914		/* Warmup / preload */
 915		request = empty_request(engine, batch);
 916		if (IS_ERR(request)) {
 917			err = PTR_ERR(request);
 918			intel_engine_pm_put(engine);
 919			goto out_batch;
 920		}
 921		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
 922
 923		for_each_prime_number_from(prime, 1, 8192) {
 924			times[1] = ktime_get_raw();
 925
 926			for (n = 0; n < prime; n++) {
 927				i915_request_put(request);
 928				request = empty_request(engine, batch);
 929				if (IS_ERR(request)) {
 930					err = PTR_ERR(request);
 931					intel_engine_pm_put(engine);
 932					goto out_batch;
 933				}
 934			}
 935			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
 936
 937			times[1] = ktime_sub(ktime_get_raw(), times[1]);
 938			if (prime == 1)
 939				times[0] = times[1];
 940
 941			if (__igt_timeout(end_time, NULL))
 942				break;
 943		}
 944		i915_request_put(request);
 945		intel_engine_pm_put(engine);
 946
 947		err = igt_live_test_end(&t);
 948		if (err)
 949			goto out_batch;
 950
 951		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
 952			engine->name,
 953			ktime_to_ns(times[0]),
 954			prime, div64_u64(ktime_to_ns(times[1]), prime));
 955	}
 956
 957out_batch:
 958	i915_vma_unpin(batch);
 959	i915_vma_put(batch);
 960	return err;
 961}
 962
 963static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
 964{
 965	struct drm_i915_gem_object *obj;
 966	const int ver = GRAPHICS_VER(i915);
 967	struct i915_vma *vma;
 968	u32 *cmd;
 969	int err;
 970
 971	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
 972	if (IS_ERR(obj))
 973		return ERR_CAST(obj);
 974
 975	vma = i915_vma_instance(obj, i915->gt.vm, NULL);
 976	if (IS_ERR(vma)) {
 977		err = PTR_ERR(vma);
 978		goto err;
 979	}
 980
 981	err = i915_vma_pin(vma, 0, 0, PIN_USER);
 982	if (err)
 983		goto err;
 984
 985	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
 986	if (IS_ERR(cmd)) {
 987		err = PTR_ERR(cmd);
 988		goto err;
 989	}
 990
 991	if (ver >= 8) {
 992		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
 993		*cmd++ = lower_32_bits(vma->node.start);
 994		*cmd++ = upper_32_bits(vma->node.start);
 995	} else if (ver >= 6) {
 996		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
 997		*cmd++ = lower_32_bits(vma->node.start);
 998	} else {
 999		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1000		*cmd++ = lower_32_bits(vma->node.start);
1001	}
1002	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1003
1004	__i915_gem_object_flush_map(obj, 0, 64);
1005	i915_gem_object_unpin_map(obj);
1006
1007	intel_gt_chipset_flush(&i915->gt);
1008
1009	return vma;
1010
1011err:
1012	i915_gem_object_put(obj);
1013	return ERR_PTR(err);
1014}
1015
1016static int recursive_batch_resolve(struct i915_vma *batch)
1017{
1018	u32 *cmd;
1019
1020	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1021	if (IS_ERR(cmd))
1022		return PTR_ERR(cmd);
1023
1024	*cmd = MI_BATCH_BUFFER_END;
1025
1026	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1027	i915_gem_object_unpin_map(batch->obj);
1028
1029	intel_gt_chipset_flush(batch->vm->gt);
1030
1031	return 0;
1032}
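/*
 * Together, recursive_batch() and recursive_batch_resolve() give the live
 * tests a CPU-controlled spinner: the batch starts with an
 * MI_BATCH_BUFFER_START that branches back to its own first dword, so any
 * request executing it keeps spinning until that dword is overwritten with
 * MI_BATCH_BUFFER_END. A sketch of the pattern used below (error handling
 * omitted):
 *
 *	batch = recursive_batch(i915);		(requests will spin here)
 *	...submit requests referencing batch on each engine...
 *	recursive_batch_resolve(batch);		(now let them complete)
 *	...i915_request_wait() on each request...
 */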
1033
1034static int live_all_engines(void *arg)
1035{
1036	struct drm_i915_private *i915 = arg;
1037	const unsigned int nengines = num_uabi_engines(i915);
1038	struct intel_engine_cs *engine;
1039	struct i915_request **request;
1040	struct igt_live_test t;
1041	struct i915_vma *batch;
1042	unsigned int idx;
1043	int err;
1044
1045	/*
1046	 * Check we can submit requests to all engines simultaneously. We
1047	 * send a recursive batch to each engine - checking that we don't
1048	 * block doing so, and that they don't complete too soon.
1049	 */
1050
1051	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1052	if (!request)
1053		return -ENOMEM;
1054
1055	err = igt_live_test_begin(&t, i915, __func__, "");
1056	if (err)
1057		goto out_free;
1058
1059	batch = recursive_batch(i915);
1060	if (IS_ERR(batch)) {
1061		err = PTR_ERR(batch);
1062		pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1063		goto out_free;
1064	}
1065
1066	i915_vma_lock(batch);
1067
1068	idx = 0;
1069	for_each_uabi_engine(engine, i915) {
1070		request[idx] = intel_engine_create_kernel_request(engine);
1071		if (IS_ERR(request[idx])) {
1072			err = PTR_ERR(request[idx]);
1073			pr_err("%s: Request allocation failed with err=%d\n",
1074			       __func__, err);
1075			goto out_request;
1076		}
1077
1078		err = i915_request_await_object(request[idx], batch->obj, 0);
1079		if (err == 0)
1080			err = i915_vma_move_to_active(batch, request[idx], 0);
1081		GEM_BUG_ON(err);
1082
1083		err = engine->emit_bb_start(request[idx],
1084					    batch->node.start,
1085					    batch->node.size,
1086					    0);
1087		GEM_BUG_ON(err);
1088		request[idx]->batch = batch;
1089
1090		i915_request_get(request[idx]);
1091		i915_request_add(request[idx]);
1092		idx++;
1093	}
1094
1095	i915_vma_unlock(batch);
1096
1097	idx = 0;
1098	for_each_uabi_engine(engine, i915) {
1099		if (i915_request_completed(request[idx])) {
1100			pr_err("%s(%s): request completed too early!\n",
1101			       __func__, engine->name);
1102			err = -EINVAL;
1103			goto out_request;
1104		}
1105		idx++;
1106	}
1107
1108	err = recursive_batch_resolve(batch);
1109	if (err) {
1110		pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1111		goto out_request;
1112	}
1113
1114	idx = 0;
1115	for_each_uabi_engine(engine, i915) {
1116		long timeout;
1117
1118		timeout = i915_request_wait(request[idx], 0,
1119					    MAX_SCHEDULE_TIMEOUT);
1120		if (timeout < 0) {
1121			err = timeout;
1122			pr_err("%s: error waiting for request on %s, err=%d\n",
1123			       __func__, engine->name, err);
1124			goto out_request;
1125		}
1126
1127		GEM_BUG_ON(!i915_request_completed(request[idx]));
1128		i915_request_put(request[idx]);
1129		request[idx] = NULL;
1130		idx++;
1131	}
1132
1133	err = igt_live_test_end(&t);
1134
1135out_request:
1136	idx = 0;
1137	for_each_uabi_engine(engine, i915) {
1138		if (request[idx])
1139			i915_request_put(request[idx]);
1140		idx++;
1141	}
1142	i915_vma_unpin(batch);
1143	i915_vma_put(batch);
1144out_free:
1145	kfree(request);
1146	return err;
1147}
1148
1149static int live_sequential_engines(void *arg)
1150{
1151	struct drm_i915_private *i915 = arg;
1152	const unsigned int nengines = num_uabi_engines(i915);
1153	struct i915_request **request;
1154	struct i915_request *prev = NULL;
1155	struct intel_engine_cs *engine;
1156	struct igt_live_test t;
1157	unsigned int idx;
1158	int err;
1159
1160	/*
1161	 * Check we can submit requests to all engines sequentially, such
1162	 * that each successive request waits for the earlier ones. This
1163	 * tests that we don't execute requests out of order, even though
1164	 * they are running on independent engines.
1165	 */
1166
1167	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1168	if (!request)
1169		return -ENOMEM;
1170
1171	err = igt_live_test_begin(&t, i915, __func__, "");
1172	if (err)
1173		goto out_free;
1174
1175	idx = 0;
1176	for_each_uabi_engine(engine, i915) {
1177		struct i915_vma *batch;
1178
1179		batch = recursive_batch(i915);
1180		if (IS_ERR(batch)) {
1181			err = PTR_ERR(batch);
1182			pr_err("%s: Unable to create batch for %s, err=%d\n",
1183			       __func__, engine->name, err);
1184			goto out_free;
1185		}
1186
1187		i915_vma_lock(batch);
1188		request[idx] = intel_engine_create_kernel_request(engine);
1189		if (IS_ERR(request[idx])) {
1190			err = PTR_ERR(request[idx]);
1191			pr_err("%s: Request allocation failed for %s with err=%d\n",
1192			       __func__, engine->name, err);
1193			goto out_unlock;
1194		}
1195
1196		if (prev) {
1197			err = i915_request_await_dma_fence(request[idx],
1198							   &prev->fence);
1199			if (err) {
1200				i915_request_add(request[idx]);
1201				pr_err("%s: Request await failed for %s with err=%d\n",
1202				       __func__, engine->name, err);
1203				goto out_unlock;
1204			}
1205		}
1206
1207		err = i915_request_await_object(request[idx],
1208						batch->obj, false);
1209		if (err == 0)
1210			err = i915_vma_move_to_active(batch, request[idx], 0);
1211		GEM_BUG_ON(err);
1212
1213		err = engine->emit_bb_start(request[idx],
1214					    batch->node.start,
1215					    batch->node.size,
1216					    0);
1217		GEM_BUG_ON(err);
1218		request[idx]->batch = batch;
1219
1220		i915_request_get(request[idx]);
1221		i915_request_add(request[idx]);
1222
1223		prev = request[idx];
1224		idx++;
1225
1226out_unlock:
1227		i915_vma_unlock(batch);
1228		if (err)
1229			goto out_request;
1230	}
1231
1232	idx = 0;
1233	for_each_uabi_engine(engine, i915) {
1234		long timeout;
1235
1236		if (i915_request_completed(request[idx])) {
1237			pr_err("%s(%s): request completed too early!\n",
1238			       __func__, engine->name);
1239			err = -EINVAL;
1240			goto out_request;
1241		}
1242
1243		err = recursive_batch_resolve(request[idx]->batch);
1244		if (err) {
1245			pr_err("%s: failed to resolve batch, err=%d\n",
1246			       __func__, err);
1247			goto out_request;
1248		}
1249
1250		timeout = i915_request_wait(request[idx], 0,
1251					    MAX_SCHEDULE_TIMEOUT);
1252		if (timeout < 0) {
1253			err = timeout;
1254			pr_err("%s: error waiting for request on %s, err=%d\n",
1255			       __func__, engine->name, err);
1256			goto out_request;
1257		}
1258
1259		GEM_BUG_ON(!i915_request_completed(request[idx]));
1260		idx++;
1261	}
1262
1263	err = igt_live_test_end(&t);
1264
1265out_request:
1266	idx = 0;
1267	for_each_uabi_engine(engine, i915) {
1268		u32 *cmd;
1269
1270		if (!request[idx])
1271			break;
1272
1273		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1274						       I915_MAP_WC);
1275		if (!IS_ERR(cmd)) {
1276			*cmd = MI_BATCH_BUFFER_END;
1277
1278			__i915_gem_object_flush_map(request[idx]->batch->obj,
1279						    0, sizeof(*cmd));
1280			i915_gem_object_unpin_map(request[idx]->batch->obj);
1281
1282			intel_gt_chipset_flush(engine->gt);
1283		}
1284
1285		i915_vma_put(request[idx]->batch);
1286		i915_request_put(request[idx]);
1287		idx++;
1288	}
1289out_free:
1290	kfree(request);
1291	return err;
1292}
1293
1294static int __live_parallel_engine1(void *arg)
1295{
1296	struct intel_engine_cs *engine = arg;
1297	IGT_TIMEOUT(end_time);
1298	unsigned long count;
1299	int err = 0;
1300
1301	count = 0;
1302	intel_engine_pm_get(engine);
1303	do {
1304		struct i915_request *rq;
1305
1306		rq = i915_request_create(engine->kernel_context);
1307		if (IS_ERR(rq)) {
1308			err = PTR_ERR(rq);
1309			break;
1310		}
1311
1312		i915_request_get(rq);
1313		i915_request_add(rq);
1314
1315		err = 0;
1316		if (i915_request_wait(rq, 0, HZ / 5) < 0)
1317			err = -ETIME;
1318		i915_request_put(rq);
1319		if (err)
1320			break;
1321
1322		count++;
1323	} while (!__igt_timeout(end_time, NULL));
1324	intel_engine_pm_put(engine);
1325
1326	pr_info("%s: %lu request + sync\n", engine->name, count);
1327	return err;
1328}
1329
1330static int __live_parallel_engineN(void *arg)
1331{
1332	struct intel_engine_cs *engine = arg;
1333	IGT_TIMEOUT(end_time);
1334	unsigned long count;
1335	int err = 0;
1336
1337	count = 0;
1338	intel_engine_pm_get(engine);
1339	do {
1340		struct i915_request *rq;
1341
1342		rq = i915_request_create(engine->kernel_context);
1343		if (IS_ERR(rq)) {
1344			err = PTR_ERR(rq);
1345			break;
1346		}
1347
1348		i915_request_add(rq);
1349		count++;
1350	} while (!__igt_timeout(end_time, NULL));
1351	intel_engine_pm_put(engine);
1352
1353	pr_info("%s: %lu requests\n", engine->name, count);
1354	return err;
1355}
1356
1357static bool wake_all(struct drm_i915_private *i915)
1358{
1359	if (atomic_dec_and_test(&i915->selftest.counter)) {
1360		wake_up_var(&i915->selftest.counter);
1361		return true;
1362	}
1363
1364	return false;
1365}
1366
1367static int wait_for_all(struct drm_i915_private *i915)
1368{
1369	if (wake_all(i915))
1370		return 0;
1371
1372	if (wait_var_event_timeout(&i915->selftest.counter,
1373				   !atomic_read(&i915->selftest.counter),
1374				   i915_selftest.timeout_jiffies))
1375		return 0;
1376
1377	return -ETIME;
1378}
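/*
 * wake_all() and wait_for_all() form a simple barrier over
 * i915->selftest.counter: live_parallel_engines() primes the counter with
 * the number of engines, each __live_parallel_spin() thread decrements it
 * once its spinner is running, and the final arrival wakes everyone. If
 * the counter does not reach zero within the selftest timeout, -ETIME is
 * returned and the spinners unwind.
 */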
1379
1380static int __live_parallel_spin(void *arg)
1381{
1382	struct intel_engine_cs *engine = arg;
1383	struct igt_spinner spin;
1384	struct i915_request *rq;
1385	int err = 0;
1386
1387	/*
1388	 * Create a spinner running for eternity on each engine. If a second
1389	 * spinner is incorrectly placed on the same engine, it will not be
1390	 * able to start in time.
1391	 */
1392
1393	if (igt_spinner_init(&spin, engine->gt)) {
1394		wake_all(engine->i915);
1395		return -ENOMEM;
1396	}
1397
1398	intel_engine_pm_get(engine);
1399	rq = igt_spinner_create_request(&spin,
1400					engine->kernel_context,
1401					MI_NOOP); /* no preemption */
1402	intel_engine_pm_put(engine);
1403	if (IS_ERR(rq)) {
1404		err = PTR_ERR(rq);
1405		if (err == -ENODEV)
1406			err = 0;
1407		wake_all(engine->i915);
1408		goto out_spin;
1409	}
1410
1411	i915_request_get(rq);
1412	i915_request_add(rq);
1413	if (igt_wait_for_spinner(&spin, rq)) {
1414		/* Occupy this engine for the whole test */
1415		err = wait_for_all(engine->i915);
1416	} else {
1417		pr_err("Failed to start spinner on %s\n", engine->name);
1418		err = -EINVAL;
1419	}
1420	igt_spinner_end(&spin);
1421
1422	if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1423		err = -EIO;
1424	i915_request_put(rq);
1425
1426out_spin:
1427	igt_spinner_fini(&spin);
1428	return err;
1429}
1430
1431static int live_parallel_engines(void *arg)
1432{
1433	struct drm_i915_private *i915 = arg;
1434	static int (* const func[])(void *arg) = {
1435		__live_parallel_engine1,
1436		__live_parallel_engineN,
1437		__live_parallel_spin,
1438		NULL,
1439	};
1440	const unsigned int nengines = num_uabi_engines(i915);
1441	struct intel_engine_cs *engine;
1442	int (* const *fn)(void *arg);
1443	struct task_struct **tsk;
1444	int err = 0;
1445
1446	/*
1447	 * Check we can submit requests to all engines concurrently. This
1448	 * tests that we load up the system maximally.
1449	 */
1450
1451	tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1452	if (!tsk)
1453		return -ENOMEM;
1454
1455	for (fn = func; !err && *fn; fn++) {
1456		char name[KSYM_NAME_LEN];
1457		struct igt_live_test t;
1458		unsigned int idx;
1459
1460		snprintf(name, sizeof(name), "%ps", *fn);
1461		err = igt_live_test_begin(&t, i915, __func__, name);
1462		if (err)
1463			break;
1464
1465		atomic_set(&i915->selftest.counter, nengines);
1466
1467		idx = 0;
1468		for_each_uabi_engine(engine, i915) {
1469			tsk[idx] = kthread_run(*fn, engine,
1470					       "igt/parallel:%s",
1471					       engine->name);
1472			if (IS_ERR(tsk[idx])) {
1473				err = PTR_ERR(tsk[idx]);
1474				break;
1475			}
1476			get_task_struct(tsk[idx++]);
1477		}
1478
1479		yield(); /* start all threads before we kthread_stop() */
1480
1481		idx = 0;
1482		for_each_uabi_engine(engine, i915) {
1483			int status;
1484
1485			if (IS_ERR(tsk[idx]))
1486				break;
1487
1488			status = kthread_stop(tsk[idx]);
1489			if (status && !err)
1490				err = status;
1491
1492			put_task_struct(tsk[idx++]);
1493		}
1494
1495		if (igt_live_test_end(&t))
1496			err = -EIO;
1497	}
1498
1499	kfree(tsk);
1500	return err;
1501}
1502
1503static int
1504max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1505{
1506	struct i915_request *rq;
1507	int ret;
1508
1509	/*
1510	 * Before execlists, all contexts share the same ringbuffer. With
1511	 * execlists, each context/engine has a separate ringbuffer and
1512	 * for the purposes of this test, inexhaustible.
1513	 *
1514	 * For the global ringbuffer though, we have to be very careful
1515	 * that we do not wrap while preventing the execution of requests
 1516	 * with an unsignaled fence.
1517	 */
1518	if (HAS_EXECLISTS(ctx->i915))
1519		return INT_MAX;
1520
1521	rq = igt_request_alloc(ctx, engine);
1522	if (IS_ERR(rq)) {
1523		ret = PTR_ERR(rq);
1524	} else {
1525		int sz;
1526
1527		ret = rq->ring->size - rq->reserved_space;
1528		i915_request_add(rq);
1529
1530		sz = rq->ring->emit - rq->head;
1531		if (sz < 0)
1532			sz += rq->ring->size;
1533		ret /= sz;
1534		ret /= 2; /* leave half spare, in case of emergency! */
1535	}
1536
1537	return ret;
1538}
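/*
 * Worked example with illustrative numbers: on a legacy (non-execlists)
 * ring with 16KiB usable after the reserved space, a request consuming
 * roughly 256 bytes (rq->ring->emit - rq->head) gives
 * 16384 / 256 / 2 = 32 batches, deliberately leaving half the global
 * ringbuffer spare while requests are held back by an unsignaled fence.
 */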
1539
1540static int live_breadcrumbs_smoketest(void *arg)
1541{
1542	struct drm_i915_private *i915 = arg;
1543	const unsigned int nengines = num_uabi_engines(i915);
1544	const unsigned int ncpus = num_online_cpus();
1545	unsigned long num_waits, num_fences;
1546	struct intel_engine_cs *engine;
1547	struct task_struct **threads;
1548	struct igt_live_test live;
1549	intel_wakeref_t wakeref;
1550	struct smoketest *smoke;
1551	unsigned int n, idx;
1552	struct file *file;
1553	int ret = 0;
1554
1555	/*
1556	 * Smoketest our breadcrumb/signal handling for requests across multiple
1557	 * threads. A very simple test to only catch the most egregious of bugs.
1558	 * See __igt_breadcrumbs_smoketest();
1559	 *
1560	 * On real hardware this time.
1561	 */
1562
1563	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1564
1565	file = mock_file(i915);
1566	if (IS_ERR(file)) {
1567		ret = PTR_ERR(file);
1568		goto out_rpm;
1569	}
1570
1571	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1572	if (!smoke) {
1573		ret = -ENOMEM;
1574		goto out_file;
1575	}
1576
1577	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1578	if (!threads) {
1579		ret = -ENOMEM;
1580		goto out_smoke;
1581	}
1582
1583	smoke[0].request_alloc = __live_request_alloc;
1584	smoke[0].ncontexts = 64;
1585	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1586				    sizeof(*smoke[0].contexts),
1587				    GFP_KERNEL);
1588	if (!smoke[0].contexts) {
1589		ret = -ENOMEM;
1590		goto out_threads;
1591	}
1592
1593	for (n = 0; n < smoke[0].ncontexts; n++) {
1594		smoke[0].contexts[n] = live_context(i915, file);
1595		if (IS_ERR(smoke[0].contexts[n])) {
1596			ret = PTR_ERR(smoke[0].contexts[n]);
1597			goto out_contexts;
1598		}
1599	}
1600
1601	ret = igt_live_test_begin(&live, i915, __func__, "");
1602	if (ret)
1603		goto out_contexts;
1604
1605	idx = 0;
1606	for_each_uabi_engine(engine, i915) {
1607		smoke[idx] = smoke[0];
1608		smoke[idx].engine = engine;
1609		smoke[idx].max_batch =
1610			max_batches(smoke[0].contexts[0], engine);
1611		if (smoke[idx].max_batch < 0) {
1612			ret = smoke[idx].max_batch;
1613			goto out_flush;
1614		}
1615		/* One ring interleaved between requests from all cpus */
1616		smoke[idx].max_batch /= num_online_cpus() + 1;
1617		pr_debug("Limiting batches to %d requests on %s\n",
1618			 smoke[idx].max_batch, engine->name);
1619
1620		for (n = 0; n < ncpus; n++) {
1621			struct task_struct *tsk;
1622
1623			tsk = kthread_run(__igt_breadcrumbs_smoketest,
1624					  &smoke[idx], "igt/%d.%d", idx, n);
1625			if (IS_ERR(tsk)) {
1626				ret = PTR_ERR(tsk);
1627				goto out_flush;
1628			}
1629
1630			get_task_struct(tsk);
1631			threads[idx * ncpus + n] = tsk;
1632		}
1633
1634		idx++;
1635	}
1636
1637	yield(); /* start all threads before we begin */
1638	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1639
1640out_flush:
1641	idx = 0;
1642	num_waits = 0;
1643	num_fences = 0;
1644	for_each_uabi_engine(engine, i915) {
1645		for (n = 0; n < ncpus; n++) {
1646			struct task_struct *tsk = threads[idx * ncpus + n];
1647			int err;
1648
1649			if (!tsk)
1650				continue;
1651
1652			err = kthread_stop(tsk);
1653			if (err < 0 && !ret)
1654				ret = err;
1655
1656			put_task_struct(tsk);
1657		}
1658
1659		num_waits += atomic_long_read(&smoke[idx].num_waits);
1660		num_fences += atomic_long_read(&smoke[idx].num_fences);
1661		idx++;
1662	}
1663	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1664		num_waits, num_fences, idx, ncpus);
1665
1666	ret = igt_live_test_end(&live) ?: ret;
1667out_contexts:
1668	kfree(smoke[0].contexts);
1669out_threads:
1670	kfree(threads);
1671out_smoke:
1672	kfree(smoke);
1673out_file:
1674	fput(file);
1675out_rpm:
1676	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1677
1678	return ret;
1679}
1680
1681int i915_request_live_selftests(struct drm_i915_private *i915)
1682{
1683	static const struct i915_subtest tests[] = {
1684		SUBTEST(live_nop_request),
1685		SUBTEST(live_all_engines),
1686		SUBTEST(live_sequential_engines),
1687		SUBTEST(live_parallel_engines),
1688		SUBTEST(live_empty_request),
1689		SUBTEST(live_cancel_request),
1690		SUBTEST(live_breadcrumbs_smoketest),
1691	};
1692
1693	if (intel_gt_is_wedged(&i915->gt))
1694		return 0;
1695
1696	return i915_subtests(tests, i915);
1697}
1698
1699static int switch_to_kernel_sync(struct intel_context *ce, int err)
1700{
1701	struct i915_request *rq;
1702	struct dma_fence *fence;
1703
1704	rq = intel_engine_create_kernel_request(ce->engine);
1705	if (IS_ERR(rq))
1706		return PTR_ERR(rq);
1707
1708	fence = i915_active_fence_get(&ce->timeline->last_request);
1709	if (fence) {
1710		i915_request_await_dma_fence(rq, fence);
1711		dma_fence_put(fence);
1712	}
1713
1714	rq = i915_request_get(rq);
1715	i915_request_add(rq);
1716	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1717		err = -ETIME;
1718	i915_request_put(rq);
1719
1720	while (!err && !intel_engine_is_idle(ce->engine))
1721		intel_engine_flush_submission(ce->engine);
1722
1723	return err;
1724}
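/*
 * switch_to_kernel_sync() drains @ce: it queues a request on the engine's
 * kernel context behind whatever @ce last submitted, waits up to half a
 * second for it, then flushes submission until the engine reports idle.
 * The incoming @err is preserved so callers can fold the drain into their
 * existing error handling.
 */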
1725
1726struct perf_stats {
1727	struct intel_engine_cs *engine;
1728	unsigned long count;
1729	ktime_t time;
1730	ktime_t busy;
1731	u64 runtime;
1732};
1733
1734struct perf_series {
1735	struct drm_i915_private *i915;
1736	unsigned int nengines;
1737	struct intel_context *ce[];
1738};
1739
1740static int cmp_u32(const void *A, const void *B)
1741{
1742	const u32 *a = A, *b = B;
1743
1744	return *a - *b;
1745}
1746
1747static u32 trifilter(u32 *a)
1748{
1749	u64 sum;
1750
1751#define TF_COUNT 5
1752	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1753
1754	sum = mul_u32_u32(a[2], 2);
1755	sum += a[1];
1756	sum += a[3];
1757
1758	GEM_BUG_ON(sum > U32_MAX);
1759	return sum;
1760#define TF_BIAS 2
1761}
1762
1763static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1764{
1765	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1766
1767	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1768}
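/*
 * Example with made-up samples: trifilter({9, 50, 11, 10, 12}) sorts to
 * {9, 10, 11, 12, 50} and returns 10 + 2 * 11 + 12 = 44, i.e. four times a
 * 1-2-1 weighted median that discards the two outliers. The TF_BIAS shift
 * in cycles_to_ns() (and in the pr_info() reports below) divides that sum
 * by 4 to recover the filtered value.
 */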
1769
1770static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1771{
1772	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1773	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1774	*cs++ = offset;
1775	*cs++ = 0;
1776
1777	return cs;
1778}
1779
1780static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1781{
1782	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1783	*cs++ = offset;
1784	*cs++ = 0;
1785	*cs++ = value;
1786
1787	return cs;
1788}
1789
1790static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1791{
1792	*cs++ = MI_SEMAPHORE_WAIT |
1793		MI_SEMAPHORE_GLOBAL_GTT |
1794		MI_SEMAPHORE_POLL |
1795		mode;
1796	*cs++ = value;
1797	*cs++ = offset;
1798	*cs++ = 0;
1799
1800	return cs;
1801}
1802
1803static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1804{
1805	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1806}
1807
1808static void semaphore_set(u32 *sema, u32 value)
1809{
1810	WRITE_ONCE(*sema, value);
1811	wmb(); /* flush the update to the cache, and beyond */
1812}
1813
1814static u32 *hwsp_scratch(const struct intel_context *ce)
1815{
1816	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1817}
1818
1819static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1820{
1821	return (i915_ggtt_offset(ce->engine->status_page.vma) +
1822		offset_in_page(dw));
1823}
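/*
 * hwsp_scratch() hands each measurement a small zeroed scratch array in
 * the engine's status page, and hwsp_offset() converts a CPU pointer into
 * that array into the global GTT address used by the MI_STORE and
 * MI_SEMAPHORE_WAIT commands emitted below. Most of the measure_*()
 * helpers treat sema[0] as the semaphore itself and the following slots
 * as per-iteration timestamp stores.
 */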
1824
1825static int measure_semaphore_response(struct intel_context *ce)
1826{
1827	u32 *sema = hwsp_scratch(ce);
1828	const u32 offset = hwsp_offset(ce, sema);
1829	u32 elapsed[TF_COUNT], cycles;
1830	struct i915_request *rq;
1831	u32 *cs;
1832	int err;
1833	int i;
1834
1835	/*
1836	 * Measure how many cycles it takes for the HW to detect the change
1837	 * in a semaphore value.
1838	 *
1839	 *    A: read CS_TIMESTAMP from CPU
1840	 *    poke semaphore
1841	 *    B: read CS_TIMESTAMP on GPU
1842	 *
1843	 * Semaphore latency: B - A
1844	 */
1845
1846	semaphore_set(sema, -1);
1847
1848	rq = i915_request_create(ce);
1849	if (IS_ERR(rq))
1850		return PTR_ERR(rq);
1851
1852	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1853	if (IS_ERR(cs)) {
1854		i915_request_add(rq);
1855		err = PTR_ERR(cs);
1856		goto err;
1857	}
1858
1859	cs = emit_store_dw(cs, offset, 0);
1860	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1861		cs = emit_semaphore_poll_until(cs, offset, i);
1862		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1863		cs = emit_store_dw(cs, offset, 0);
1864	}
1865
1866	intel_ring_advance(rq, cs);
1867	i915_request_add(rq);
1868
1869	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1870		err = -EIO;
1871		goto err;
1872	}
1873
1874	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1875		preempt_disable();
1876		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1877		semaphore_set(sema, i);
1878		preempt_enable();
1879
1880		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1881			err = -EIO;
1882			goto err;
1883		}
1884
1885		elapsed[i - 1] = sema[i] - cycles;
1886	}
1887
1888	cycles = trifilter(elapsed);
1889	pr_info("%s: semaphore response %d cycles, %lluns\n",
1890		ce->engine->name, cycles >> TF_BIAS,
1891		cycles_to_ns(ce->engine, cycles));
1892
1893	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1894
1895err:
1896	intel_gt_set_wedged(ce->engine->gt);
1897	return err;
1898}
1899
1900static int measure_idle_dispatch(struct intel_context *ce)
1901{
1902	u32 *sema = hwsp_scratch(ce);
1903	const u32 offset = hwsp_offset(ce, sema);
1904	u32 elapsed[TF_COUNT], cycles;
1905	u32 *cs;
1906	int err;
1907	int i;
1908
1909	/*
1910	 * Measure how long it takes for us to submit a request while the
1911	 * engine is idle, but is resting in our context.
1912	 *
1913	 *    A: read CS_TIMESTAMP from CPU
1914	 *    submit request
1915	 *    B: read CS_TIMESTAMP on GPU
1916	 *
1917	 * Submission latency: B - A
1918	 */
1919
1920	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1921		struct i915_request *rq;
1922
1923		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1924		if (err)
1925			return err;
1926
1927		rq = i915_request_create(ce);
1928		if (IS_ERR(rq)) {
1929			err = PTR_ERR(rq);
1930			goto err;
1931		}
1932
1933		cs = intel_ring_begin(rq, 4);
1934		if (IS_ERR(cs)) {
1935			i915_request_add(rq);
1936			err = PTR_ERR(cs);
1937			goto err;
1938		}
1939
1940		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1941
1942		intel_ring_advance(rq, cs);
1943
1944		preempt_disable();
1945		local_bh_disable();
1946		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1947		i915_request_add(rq);
1948		local_bh_enable();
1949		preempt_enable();
1950	}
1951
1952	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1953	if (err)
1954		goto err;
1955
1956	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1957		elapsed[i] = sema[i] - elapsed[i];
1958
1959	cycles = trifilter(elapsed);
1960	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1961		ce->engine->name, cycles >> TF_BIAS,
1962		cycles_to_ns(ce->engine, cycles));
1963
1964	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1965
1966err:
1967	intel_gt_set_wedged(ce->engine->gt);
1968	return err;
1969}
1970
1971static int measure_busy_dispatch(struct intel_context *ce)
1972{
1973	u32 *sema = hwsp_scratch(ce);
1974	const u32 offset = hwsp_offset(ce, sema);
1975	u32 elapsed[TF_COUNT + 1], cycles;
1976	u32 *cs;
1977	int err;
1978	int i;
1979
1980	/*
1981	 * Measure how long it takes for us to submit a request while the
1982	 * engine is busy, polling on a semaphore in our context. With
1983	 * direct submission, this will include the cost of a lite restore.
1984	 *
1985	 *    A: read CS_TIMESTAMP from CPU
1986	 *    submit request
1987	 *    B: read CS_TIMESTAMP on GPU
1988	 *
1989	 * Submission latency: B - A
1990	 */
1991
1992	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1993		struct i915_request *rq;
1994
1995		rq = i915_request_create(ce);
1996		if (IS_ERR(rq)) {
1997			err = PTR_ERR(rq);
1998			goto err;
1999		}
2000
2001		cs = intel_ring_begin(rq, 12);
2002		if (IS_ERR(cs)) {
2003			i915_request_add(rq);
2004			err = PTR_ERR(cs);
2005			goto err;
2006		}
2007
2008		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2009		cs = emit_semaphore_poll_until(cs, offset, i);
2010		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2011
2012		intel_ring_advance(rq, cs);
2013
2014		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2015			err = -EIO;
2016			goto err;
2017		}
2018
2019		preempt_disable();
2020		local_bh_disable();
2021		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2022		i915_request_add(rq);
2023		local_bh_enable();
2024		semaphore_set(sema, i - 1);
2025		preempt_enable();
2026	}
2027
2028	wait_for(READ_ONCE(sema[i - 1]), 500);
2029	semaphore_set(sema, i - 1);
2030
2031	for (i = 1; i <= TF_COUNT; i++) {
2032		GEM_BUG_ON(sema[i] == -1);
2033		elapsed[i - 1] = sema[i] - elapsed[i];
2034	}
2035
2036	cycles = trifilter(elapsed);
2037	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2038		ce->engine->name, cycles >> TF_BIAS,
2039		cycles_to_ns(ce->engine, cycles));
2040
2041	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2042
2043err:
2044	intel_gt_set_wedged(ce->engine->gt);
2045	return err;
2046}
2047
2048static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2049{
2050	const u32 offset =
2051		i915_ggtt_offset(engine->status_page.vma) +
2052		offset_in_page(sema);
2053	struct i915_request *rq;
2054	u32 *cs;
2055
2056	rq = i915_request_create(engine->kernel_context);
2057	if (IS_ERR(rq))
2058		return PTR_ERR(rq);
2059
2060	cs = intel_ring_begin(rq, 4);
2061	if (IS_ERR(cs)) {
2062		i915_request_add(rq);
2063		return PTR_ERR(cs);
2064	}
2065
2066	cs = emit_semaphore_poll(cs, mode, value, offset);
2067
2068	intel_ring_advance(rq, cs);
2069	i915_request_add(rq);
2070
2071	return 0;
2072}
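/*
 * plug() parks the engine's kernel context on a semaphore poll, so
 * everything queued afterwards backs up behind it until the CPU releases
 * the semaphore with semaphore_set(). A sketch of the pattern used by
 * measure_inter_request() and measure_context_switch() below:
 *
 *	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
 *	...build and submit the requests to be timed...
 *	semaphore_set(sema, 1);	(release; the backlog then runs back-to-back)
 *	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
 */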
2073
2074static int measure_inter_request(struct intel_context *ce)
2075{
2076	u32 *sema = hwsp_scratch(ce);
2077	const u32 offset = hwsp_offset(ce, sema);
2078	u32 elapsed[TF_COUNT + 1], cycles;
2079	struct i915_sw_fence *submit;
2080	int i, err;
2081
2082	/*
2083	 * Measure how long it takes to advance from one request into the
2084	 * next. Between each request we flush the GPU caches to memory,
2085	 * update the breadcrumbs, and then invalidate those caches.
2086	 * We queue up all the requests to be submitted in one batch so
2087	 * it should be one set of contiguous measurements.
2088	 *
2089	 *    A: read CS_TIMESTAMP on GPU
2090	 *    advance request
2091	 *    B: read CS_TIMESTAMP on GPU
2092	 *
2093	 * Request latency: B - A
2094	 */
2095
2096	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2097	if (err)
2098		return err;
2099
2100	submit = heap_fence_create(GFP_KERNEL);
2101	if (!submit) {
2102		semaphore_set(sema, 1);
2103		return -ENOMEM;
2104	}
2105
2106	intel_engine_flush_submission(ce->engine);
2107	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2108		struct i915_request *rq;
2109		u32 *cs;
2110
2111		rq = i915_request_create(ce);
2112		if (IS_ERR(rq)) {
2113			err = PTR_ERR(rq);
2114			goto err_submit;
2115		}
2116
2117		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2118						       submit,
2119						       GFP_KERNEL);
2120		if (err < 0) {
2121			i915_request_add(rq);
2122			goto err_submit;
2123		}
2124
2125		cs = intel_ring_begin(rq, 4);
2126		if (IS_ERR(cs)) {
2127			i915_request_add(rq);
2128			err = PTR_ERR(cs);
2129			goto err_submit;
2130		}
2131
2132		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2133
2134		intel_ring_advance(rq, cs);
2135		i915_request_add(rq);
2136	}
2137	i915_sw_fence_commit(submit);
2138	intel_engine_flush_submission(ce->engine);
2139	heap_fence_put(submit);
2140
2141	semaphore_set(sema, 1);
2142	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2143	if (err)
2144		goto err;
2145
2146	for (i = 1; i <= TF_COUNT; i++)
2147		elapsed[i - 1] = sema[i + 1] - sema[i];
2148
2149	cycles = trifilter(elapsed);
2150	pr_info("%s: inter-request latency %d cycles, %lluns\n",
2151		ce->engine->name, cycles >> TF_BIAS,
2152		cycles_to_ns(ce->engine, cycles));
2153
2154	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2155
2156err_submit:
2157	i915_sw_fence_commit(submit);
2158	heap_fence_put(submit);
2159	semaphore_set(sema, 1);
2160err:
2161	intel_gt_set_wedged(ce->engine->gt);
2162	return err;
2163}
2164
2165static int measure_context_switch(struct intel_context *ce)
2166{
2167	u32 *sema = hwsp_scratch(ce);
2168	const u32 offset = hwsp_offset(ce, sema);
2169	struct i915_request *fence = NULL;
2170	u32 elapsed[TF_COUNT + 1], cycles;
2171	int i, j, err;
2172	u32 *cs;
2173
2174	/*
2175	 * Measure how long it takes to advance from one request in one
2176	 * context to a request in another context. This allows us to
2177	 * measure how long the context save and restore take, along with all
2178	 * the inter-context setup we require.
2179	 *
2180	 *    A: read CS_TIMESTAMP on GPU
2181	 *    switch context
2182	 *    B: read CS_TIMESTAMP on GPU
2183	 *
2184	 * Context switch latency: B - A
2185	 */
2186
2187	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2188	if (err)
2189		return err;
2190
2191	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2192		struct intel_context *arr[] = {
2193			ce, ce->engine->kernel_context
2194		};
2195		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2196
2197		for (j = 0; j < ARRAY_SIZE(arr); j++) {
2198			struct i915_request *rq;
2199
2200			rq = i915_request_create(arr[j]);
2201			if (IS_ERR(rq)) {
2202				err = PTR_ERR(rq);
2203				goto err_fence;
2204			}
2205
2206			if (fence) {
2207				err = i915_request_await_dma_fence(rq,
2208								   &fence->fence);
2209				if (err) {
2210					i915_request_add(rq);
2211					goto err_fence;
2212				}
2213			}
2214
2215			cs = intel_ring_begin(rq, 4);
2216			if (IS_ERR(cs)) {
2217				i915_request_add(rq);
2218				err = PTR_ERR(cs);
2219				goto err_fence;
2220			}
2221
2222			cs = emit_timestamp_store(cs, ce, addr);
2223			addr += sizeof(u32);
2224
2225			intel_ring_advance(rq, cs);
2226
2227			i915_request_put(fence);
2228			fence = i915_request_get(rq);
2229
2230			i915_request_add(rq);
2231		}
2232	}
2233	i915_request_put(fence);
2234	intel_engine_flush_submission(ce->engine);
2235
2236	semaphore_set(sema, 1);
2237	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2238	if (err)
2239		goto err;
2240
2241	for (i = 1; i <= TF_COUNT; i++)
2242		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2243
2244	cycles = trifilter(elapsed);
2245	pr_info("%s: context switch latency %d cycles, %lluns\n",
2246		ce->engine->name, cycles >> TF_BIAS,
2247		cycles_to_ns(ce->engine, cycles));
2248
2249	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2250
2251err_fence:
2252	i915_request_put(fence);
2253	semaphore_set(sema, 1);
2254err:
2255	intel_gt_set_wedged(ce->engine->gt);
2256	return err;
2257}
2258
2259static int measure_preemption(struct intel_context *ce)
2260{
2261	u32 *sema = hwsp_scratch(ce);
2262	const u32 offset = hwsp_offset(ce, sema);
2263	u32 elapsed[TF_COUNT], cycles;
2264	u32 *cs;
2265	int err;
2266	int i;
2267
2268	/*
2269	 * We measure two latencies while triggering preemption. The first
2270	 * latency is how long it takes for us to submit a preempting request.
2271	 * The second latency is how long it takes for us to return from the
2272	 * preemption back to the original context.
2273	 *
2274	 *    A: read CS_TIMESTAMP from CPU
2275	 *    submit preemption
2276	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
2277	 *    context switch
2278	 *    C: read CS_TIMESTAMP on GPU (in original context)
2279	 *
2280	 * Preemption dispatch latency: B - A
2281	 * Preemption switch latency: C - B
2282	 */
2283
2284	if (!intel_engine_has_preemption(ce->engine))
2285		return 0;
2286
2287	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2288		u32 addr = offset + 2 * i * sizeof(u32);
2289		struct i915_request *rq;
2290
2291		rq = i915_request_create(ce);
2292		if (IS_ERR(rq)) {
2293			err = PTR_ERR(rq);
2294			goto err;
2295		}
2296
2297		cs = intel_ring_begin(rq, 12);
2298		if (IS_ERR(cs)) {
2299			i915_request_add(rq);
2300			err = PTR_ERR(cs);
2301			goto err;
2302		}
2303
2304		cs = emit_store_dw(cs, addr, -1);
2305		cs = emit_semaphore_poll_until(cs, offset, i);
2306		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2307
2308		intel_ring_advance(rq, cs);
2309		i915_request_add(rq);
2310
2311		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2312			err = -EIO;
2313			goto err;
2314		}
2315
2316		rq = i915_request_create(ce->engine->kernel_context);
2317		if (IS_ERR(rq)) {
2318			err = PTR_ERR(rq);
2319			goto err;
2320		}
2321
2322		cs = intel_ring_begin(rq, 8);
2323		if (IS_ERR(cs)) {
2324			i915_request_add(rq);
2325			err = PTR_ERR(cs);
2326			goto err;
2327		}
2328
2329		cs = emit_timestamp_store(cs, ce, addr);
2330		cs = emit_store_dw(cs, offset, i);
2331
2332		intel_ring_advance(rq, cs);
2333		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2334
2335		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2336		i915_request_add(rq);
2337	}
2338
2339	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2340		err = -EIO;
2341		goto err;
2342	}
2343
2344	for (i = 1; i <= TF_COUNT; i++)
2345		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2346
2347	cycles = trifilter(elapsed);
2348	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2349		ce->engine->name, cycles >> TF_BIAS,
2350		cycles_to_ns(ce->engine, cycles));
2351
2352	for (i = 1; i <= TF_COUNT; i++)
2353		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2354
2355	cycles = trifilter(elapsed);
2356	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2357		ce->engine->name, cycles >> TF_BIAS,
2358		cycles_to_ns(ce->engine, cycles));
2359
2360	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2361
2362err:
2363	intel_gt_set_wedged(ce->engine->gt);
2364	return err;
2365}
2366
2367struct signal_cb {
2368	struct dma_fence_cb base;
2369	bool seen;
2370};
2371
2372static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2373{
2374	struct signal_cb *s = container_of(cb, typeof(*s), base);
2375
2376	smp_store_mb(s->seen, true); /* be safe, be strong */
2377}
2378
2379static int measure_completion(struct intel_context *ce)
2380{
2381	u32 *sema = hwsp_scratch(ce);
2382	const u32 offset = hwsp_offset(ce, sema);
2383	u32 elapsed[TF_COUNT], cycles;
2384	u32 *cs;
2385	int err;
2386	int i;
2387
2388	/*
2389	 * Measure how long it takes for the signal (interrupt) sent by
2390	 * the GPU to be processed by the CPU.
2391	 *
2392	 *    A: read CS_TIMESTAMP on GPU
2393	 *    signal
2394	 *    B: read CS_TIMESTAMP from CPU
2395	 *
2396	 * Completion latency: B - A
2397	 */
2398
2399	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2400		struct signal_cb cb = { .seen = false };
2401		struct i915_request *rq;
2402
2403		rq = i915_request_create(ce);
2404		if (IS_ERR(rq)) {
2405			err = PTR_ERR(rq);
2406			goto err;
2407		}
2408
2409		cs = intel_ring_begin(rq, 12);
2410		if (IS_ERR(cs)) {
2411			i915_request_add(rq);
2412			err = PTR_ERR(cs);
2413			goto err;
2414		}
2415
2416		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2417		cs = emit_semaphore_poll_until(cs, offset, i);
2418		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2419
2420		intel_ring_advance(rq, cs);
2421
2422		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2423		i915_request_add(rq);
2424
2425		intel_engine_flush_submission(ce->engine);
2426		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2427			err = -EIO;
2428			goto err;
2429		}
2430
2431		preempt_disable();
2432		semaphore_set(sema, i);
2433		while (!READ_ONCE(cb.seen))
2434			cpu_relax();
2435
2436		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2437		preempt_enable();
2438	}
2439
2440	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2441	if (err)
2442		goto err;
2443
2444	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2445		GEM_BUG_ON(sema[i + 1] == -1);
2446		elapsed[i] = elapsed[i] - sema[i + 1];
2447	}
2448
2449	cycles = trifilter(elapsed);
2450	pr_info("%s: completion latency %d cycles, %lluns\n",
2451		ce->engine->name, cycles >> TF_BIAS,
2452		cycles_to_ns(ce->engine, cycles));
2453
2454	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2455
2456err:
2457	intel_gt_set_wedged(ce->engine->gt);
2458	return err;
2459}
2460
2461static void rps_pin(struct intel_gt *gt)
2462{
2463	/* Pin the frequency to max */
2464	atomic_inc(&gt->rps.num_waiters);
2465	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2466
2467	mutex_lock(&gt->rps.lock);
2468	intel_rps_set(&gt->rps, gt->rps.max_freq);
2469	mutex_unlock(&gt->rps.lock);
2470}
2471
2472static void rps_unpin(struct intel_gt *gt)
2473{
2474	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2475	atomic_dec(&gt->rps.num_waiters);
2476}
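/*
 * The latency measurements above are only stable if power management stays
 * out of the way: perf_request_latency() below brackets each run with
 * rps_pin(), holding the GPU at its maximum RPS frequency, and with a
 * cpu_latency_qos request that keeps the CPU out of deep C-states, so the
 * trifiltered cycle counts are not skewed by frequency ramps or wakeup
 * latency.
 */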
2477
2478static int perf_request_latency(void *arg)
2479{
2480	struct drm_i915_private *i915 = arg;
2481	struct intel_engine_cs *engine;
2482	struct pm_qos_request qos;
2483	int err = 0;
2484
2485	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2486		return 0;
2487
2488	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2489
2490	for_each_uabi_engine(engine, i915) {
2491		struct intel_context *ce;
2492
2493		ce = intel_context_create(engine);
2494		if (IS_ERR(ce)) {
2495			err = PTR_ERR(ce);
2496			goto out;
2497		}
2498
2499		err = intel_context_pin(ce);
2500		if (err) {
2501			intel_context_put(ce);
2502			goto out;
2503		}
2504
2505		st_engine_heartbeat_disable(engine);
2506		rps_pin(engine->gt);
2507
2508		if (err == 0)
2509			err = measure_semaphore_response(ce);
2510		if (err == 0)
2511			err = measure_idle_dispatch(ce);
2512		if (err == 0)
2513			err = measure_busy_dispatch(ce);
2514		if (err == 0)
2515			err = measure_inter_request(ce);
2516		if (err == 0)
2517			err = measure_context_switch(ce);
2518		if (err == 0)
2519			err = measure_preemption(ce);
2520		if (err == 0)
2521			err = measure_completion(ce);
2522
2523		rps_unpin(engine->gt);
2524		st_engine_heartbeat_enable(engine);
2525
2526		intel_context_unpin(ce);
2527		intel_context_put(ce);
2528		if (err)
2529			goto out;
2530	}
2531
2532out:
2533	if (igt_flush_test(i915))
2534		err = -EIO;
2535
2536	cpu_latency_qos_remove_request(&qos);
2537	return err;
2538}
2539
2540static int s_sync0(void *arg)
2541{
2542	struct perf_series *ps = arg;
2543	IGT_TIMEOUT(end_time);
2544	unsigned int idx = 0;
2545	int err = 0;
2546
2547	GEM_BUG_ON(!ps->nengines);
2548	do {
2549		struct i915_request *rq;
2550
2551		rq = i915_request_create(ps->ce[idx]);
2552		if (IS_ERR(rq)) {
2553			err = PTR_ERR(rq);
2554			break;
2555		}
2556
2557		i915_request_get(rq);
2558		i915_request_add(rq);
2559
2560		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2561			err = -ETIME;
2562		i915_request_put(rq);
2563		if (err)
2564			break;
2565
2566		if (++idx == ps->nengines)
2567			idx = 0;
2568	} while (!__igt_timeout(end_time, NULL));
2569
2570	return err;
2571}
2572
2573static int s_sync1(void *arg)
2574{
2575	struct perf_series *ps = arg;
2576	struct i915_request *prev = NULL;
2577	IGT_TIMEOUT(end_time);
2578	unsigned int idx = 0;
2579	int err = 0;
2580
2581	GEM_BUG_ON(!ps->nengines);
2582	do {
2583		struct i915_request *rq;
2584
2585		rq = i915_request_create(ps->ce[idx]);
2586		if (IS_ERR(rq)) {
2587			err = PTR_ERR(rq);
2588			break;
2589		}
2590
2591		i915_request_get(rq);
2592		i915_request_add(rq);
2593
2594		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2595			err = -ETIME;
2596		i915_request_put(prev);
2597		prev = rq;
2598		if (err)
2599			break;
2600
2601		if (++idx == ps->nengines)
2602			idx = 0;
2603	} while (!__igt_timeout(end_time, NULL));
2604	i915_request_put(prev);
2605
2606	return err;
2607}
2608
2609static int s_many(void *arg)
2610{
2611	struct perf_series *ps = arg;
2612	IGT_TIMEOUT(end_time);
2613	unsigned int idx = 0;
2614
2615	GEM_BUG_ON(!ps->nengines);
2616	do {
2617		struct i915_request *rq;
2618
2619		rq = i915_request_create(ps->ce[idx]);
2620		if (IS_ERR(rq))
2621			return PTR_ERR(rq);
2622
2623		i915_request_add(rq);
2624
2625		if (++idx == ps->nengines)
2626			idx = 0;
2627	} while (!__igt_timeout(end_time, NULL));
2628
2629	return 0;
2630}
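/*
 * The three series callbacks above apply progressively looser
 * synchronisation to the same set of contexts: s_sync0() waits for each
 * request before issuing the next, s_sync1() keeps one request in flight by
 * waiting on the previous submission, and s_many() simply fires requests
 * until the timeout without waiting at all. perf_series_engines() runs each
 * strategy in turn and reports per-engine busyness and runtime.
 */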
2631
2632static int perf_series_engines(void *arg)
2633{
2634	struct drm_i915_private *i915 = arg;
2635	static int (* const func[])(void *arg) = {
2636		s_sync0,
2637		s_sync1,
2638		s_many,
2639		NULL,
2640	};
2641	const unsigned int nengines = num_uabi_engines(i915);
2642	struct intel_engine_cs *engine;
2643	int (* const *fn)(void *arg);
2644	struct pm_qos_request qos;
2645	struct perf_stats *stats;
2646	struct perf_series *ps;
2647	unsigned int idx;
2648	int err = 0;
2649
2650	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2651	if (!stats)
2652		return -ENOMEM;
2653
2654	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2655	if (!ps) {
2656		kfree(stats);
2657		return -ENOMEM;
2658	}
2659
2660	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2661
2662	ps->i915 = i915;
2663	ps->nengines = nengines;
2664
2665	idx = 0;
2666	for_each_uabi_engine(engine, i915) {
2667		struct intel_context *ce;
2668
2669		ce = intel_context_create(engine);
2670		if (IS_ERR(ce)) {
2671			err = PTR_ERR(ce);
2672			goto out;
2673		}
2674
2675		err = intel_context_pin(ce);
2676		if (err) {
2677			intel_context_put(ce);
2678			goto out;
2679		}
2680
2681		ps->ce[idx++] = ce;
2682	}
2683	GEM_BUG_ON(idx != ps->nengines);
2684
2685	for (fn = func; *fn && !err; fn++) {
2686		char name[KSYM_NAME_LEN];
2687		struct igt_live_test t;
2688
2689		snprintf(name, sizeof(name), "%ps", *fn);
2690		err = igt_live_test_begin(&t, i915, __func__, name);
2691		if (err)
2692			break;
2693
2694		for (idx = 0; idx < nengines; idx++) {
2695			struct perf_stats *p =
2696				memset(&stats[idx], 0, sizeof(stats[idx]));
2697			struct intel_context *ce = ps->ce[idx];
2698
2699			p->engine = ps->ce[idx]->engine;
2700			intel_engine_pm_get(p->engine);
2701
2702			if (intel_engine_supports_stats(p->engine))
2703				p->busy = intel_engine_get_busy_time(p->engine,
2704								     &p->time) + 1;
2705			else
2706				p->time = ktime_get();
2707			p->runtime = -intel_context_get_total_runtime_ns(ce);
2708		}
2709
2710		err = (*fn)(ps);
2711		if (igt_live_test_end(&t))
2712			err = -EIO;
2713
2714		for (idx = 0; idx < nengines; idx++) {
2715			struct perf_stats *p = &stats[idx];
2716			struct intel_context *ce = ps->ce[idx];
2717			int integer, decimal;
2718			u64 busy, dt, now;
2719
2720			if (p->busy)
2721				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2722									       &now),
2723						    p->busy - 1);
2724			else
2725				now = ktime_get();
2726			p->time = ktime_sub(now, p->time);
2727
2728			err = switch_to_kernel_sync(ce, err);
2729			p->runtime += intel_context_get_total_runtime_ns(ce);
2730			intel_engine_pm_put(p->engine);
2731
2732			busy = 100 * ktime_to_ns(p->busy);
2733			dt = ktime_to_ns(p->time);
2734			if (dt) {
2735				integer = div64_u64(busy, dt);
2736				busy -= integer * dt;
2737				decimal = div64_u64(100 * busy, dt);
2738			} else {
2739				integer = 0;
2740				decimal = 0;
2741			}
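			/*
			 * A worked example of the fixed-point busyness above,
			 * with illustrative numbers: p->busy of 333,333,333ns
			 * over dt of 1,000,000,000ns gives busy =
			 * 33,333,333,300, integer = 33, a remainder of
			 * 333,333,300 and decimal = 100 * 333,333,300 / dt =
			 * 33, which is printed as "33.33%".
			 */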
2742
2743			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2744				name, p->engine->name, ce->timeline->seqno,
2745				integer, decimal,
2746				div_u64(p->runtime, 1000 * 1000),
2747				div_u64(ktime_to_ns(p->time), 1000 * 1000));
2748		}
2749	}
2750
2751out:
2752	for (idx = 0; idx < nengines; idx++) {
2753		if (IS_ERR_OR_NULL(ps->ce[idx]))
2754			break;
2755
2756		intel_context_unpin(ps->ce[idx]);
2757		intel_context_put(ps->ce[idx]);
2758	}
2759	kfree(ps);
2760
2761	cpu_latency_qos_remove_request(&qos);
2762	kfree(stats);
2763	return err;
2764}
2765
2766static int p_sync0(void *arg)
2767{
2768	struct perf_stats *p = arg;
2769	struct intel_engine_cs *engine = p->engine;
2770	struct intel_context *ce;
2771	IGT_TIMEOUT(end_time);
2772	unsigned long count;
2773	bool busy;
2774	int err = 0;
2775
2776	ce = intel_context_create(engine);
2777	if (IS_ERR(ce))
2778		return PTR_ERR(ce);
2779
2780	err = intel_context_pin(ce);
2781	if (err) {
2782		intel_context_put(ce);
2783		return err;
2784	}
2785
2786	if (intel_engine_supports_stats(engine)) {
2787		p->busy = intel_engine_get_busy_time(engine, &p->time);
2788		busy = true;
2789	} else {
2790		p->time = ktime_get();
2791		busy = false;
2792	}
2793
2794	count = 0;
2795	do {
2796		struct i915_request *rq;
2797
2798		rq = i915_request_create(ce);
2799		if (IS_ERR(rq)) {
2800			err = PTR_ERR(rq);
2801			break;
2802		}
2803
2804		i915_request_get(rq);
2805		i915_request_add(rq);
2806
2807		err = 0;
2808		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2809			err = -ETIME;
2810		i915_request_put(rq);
2811		if (err)
2812			break;
2813
2814		count++;
2815	} while (!__igt_timeout(end_time, NULL));
2816
2817	if (busy) {
2818		ktime_t now;
2819
2820		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2821				    p->busy);
2822		p->time = ktime_sub(now, p->time);
2823	} else {
2824		p->time = ktime_sub(ktime_get(), p->time);
2825	}
2826
2827	err = switch_to_kernel_sync(ce, err);
2828	p->runtime = intel_context_get_total_runtime_ns(ce);
2829	p->count = count;
2830
2831	intel_context_unpin(ce);
2832	intel_context_put(ce);
2833	return err;
2834}
2835
2836static int p_sync1(void *arg)
2837{
2838	struct perf_stats *p = arg;
2839	struct intel_engine_cs *engine = p->engine;
2840	struct i915_request *prev = NULL;
2841	struct intel_context *ce;
2842	IGT_TIMEOUT(end_time);
2843	unsigned long count;
2844	bool busy;
2845	int err = 0;
2846
2847	ce = intel_context_create(engine);
2848	if (IS_ERR(ce))
2849		return PTR_ERR(ce);
2850
2851	err = intel_context_pin(ce);
2852	if (err) {
2853		intel_context_put(ce);
2854		return err;
2855	}
2856
2857	if (intel_engine_supports_stats(engine)) {
2858		p->busy = intel_engine_get_busy_time(engine, &p->time);
2859		busy = true;
2860	} else {
2861		p->time = ktime_get();
2862		busy = false;
2863	}
2864
2865	count = 0;
2866	do {
2867		struct i915_request *rq;
2868
2869		rq = i915_request_create(ce);
2870		if (IS_ERR(rq)) {
2871			err = PTR_ERR(rq);
2872			break;
2873		}
2874
2875		i915_request_get(rq);
2876		i915_request_add(rq);
2877
2878		err = 0;
2879		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2880			err = -ETIME;
2881		i915_request_put(prev);
2882		prev = rq;
2883		if (err)
2884			break;
2885
2886		count++;
2887	} while (!__igt_timeout(end_time, NULL));
2888	i915_request_put(prev);
2889
2890	if (busy) {
2891		ktime_t now;
2892
2893		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2894				    p->busy);
2895		p->time = ktime_sub(now, p->time);
2896	} else {
2897		p->time = ktime_sub(ktime_get(), p->time);
2898	}
2899
2900	err = switch_to_kernel_sync(ce, err);
2901	p->runtime = intel_context_get_total_runtime_ns(ce);
2902	p->count = count;
2903
2904	intel_context_unpin(ce);
2905	intel_context_put(ce);
2906	return err;
2907}
2908
2909static int p_many(void *arg)
2910{
2911	struct perf_stats *p = arg;
2912	struct intel_engine_cs *engine = p->engine;
2913	struct intel_context *ce;
2914	IGT_TIMEOUT(end_time);
2915	unsigned long count;
2916	int err = 0;
2917	bool busy;
2918
2919	ce = intel_context_create(engine);
2920	if (IS_ERR(ce))
2921		return PTR_ERR(ce);
2922
2923	err = intel_context_pin(ce);
2924	if (err) {
2925		intel_context_put(ce);
2926		return err;
2927	}
2928
2929	if (intel_engine_supports_stats(engine)) {
2930		p->busy = intel_engine_get_busy_time(engine, &p->time);
2931		busy = true;
2932	} else {
2933		p->time = ktime_get();
2934		busy = false;
2935	}
2936
2937	count = 0;
2938	do {
2939		struct i915_request *rq;
2940
2941		rq = i915_request_create(ce);
2942		if (IS_ERR(rq)) {
2943			err = PTR_ERR(rq);
2944			break;
2945		}
2946
2947		i915_request_add(rq);
2948		count++;
2949	} while (!__igt_timeout(end_time, NULL));
2950
2951	if (busy) {
2952		ktime_t now;
2953
2954		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2955				    p->busy);
2956		p->time = ktime_sub(now, p->time);
2957	} else {
2958		p->time = ktime_sub(ktime_get(), p->time);
2959	}
2960
2961	err = switch_to_kernel_sync(ce, err);
2962	p->runtime = intel_context_get_total_runtime_ns(ce);
2963	p->count = count;
2964
2965	intel_context_unpin(ce);
2966	intel_context_put(ce);
2967	return err;
2968}
2969
2970static int perf_parallel_engines(void *arg)
2971{
2972	struct drm_i915_private *i915 = arg;
2973	static int (* const func[])(void *arg) = {
2974		p_sync0,
2975		p_sync1,
2976		p_many,
2977		NULL,
2978	};
2979	const unsigned int nengines = num_uabi_engines(i915);
2980	struct intel_engine_cs *engine;
2981	int (* const *fn)(void *arg);
2982	struct pm_qos_request qos;
2983	struct {
2984		struct perf_stats p;
2985		struct task_struct *tsk;
2986	} *engines;
2987	int err = 0;
2988
2989	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2990	if (!engines)
2991		return -ENOMEM;
2992
2993	cpu_latency_qos_add_request(&qos, 0);
2994
2995	for (fn = func; *fn; fn++) {
2996		char name[KSYM_NAME_LEN];
2997		struct igt_live_test t;
2998		unsigned int idx;
2999
3000		snprintf(name, sizeof(name), "%ps", *fn);
3001		err = igt_live_test_begin(&t, i915, __func__, name);
3002		if (err)
3003			break;
3004
3005		atomic_set(&i915->selftest.counter, nengines);
3006
3007		idx = 0;
3008		for_each_uabi_engine(engine, i915) {
3009			intel_engine_pm_get(engine);
3010
3011			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3012			engines[idx].p.engine = engine;
3013
3014			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
3015						       "igt:%s", engine->name);
3016			if (IS_ERR(engines[idx].tsk)) {
3017				err = PTR_ERR(engines[idx].tsk);
3018				intel_engine_pm_put(engine);
3019				break;
3020			}
3021			get_task_struct(engines[idx++].tsk);
3022		}
3023
3024		yield(); /* start all threads before we kthread_stop() */
3025
3026		idx = 0;
3027		for_each_uabi_engine(engine, i915) {
3028			int status;
3029
3030			if (IS_ERR(engines[idx].tsk))
3031				break;
3032
3033			status = kthread_stop(engines[idx].tsk);
3034			if (status && !err)
3035				err = status;
3036
3037			intel_engine_pm_put(engine);
3038			put_task_struct(engines[idx++].tsk);
3039		}
3040
3041		if (igt_live_test_end(&t))
3042			err = -EIO;
3043		if (err)
3044			break;
3045
3046		idx = 0;
3047		for_each_uabi_engine(engine, i915) {
3048			struct perf_stats *p = &engines[idx].p;
3049			u64 busy = 100 * ktime_to_ns(p->busy);
3050			u64 dt = ktime_to_ns(p->time);
3051			int integer, decimal;
3052
3053			if (dt) {
3054				integer = div64_u64(busy, dt);
3055				busy -= integer * dt;
3056				decimal = div64_u64(100 * busy, dt);
3057			} else {
3058				integer = 0;
3059				decimal = 0;
3060			}
3061
3062			GEM_BUG_ON(engine != p->engine);
3063			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3064				name, engine->name, p->count, integer, decimal,
3065				div_u64(p->runtime, 1000 * 1000),
3066				div_u64(ktime_to_ns(p->time), 1000 * 1000));
3067			idx++;
3068		}
3069	}
3070
3071	cpu_latency_qos_remove_request(&qos);
3072	kfree(engines);
3073	return err;
3074}
3075
3076int i915_request_perf_selftests(struct drm_i915_private *i915)
3077{
3078	static const struct i915_subtest tests[] = {
3079		SUBTEST(perf_request_latency),
3080		SUBTEST(perf_series_engines),
3081		SUBTEST(perf_parallel_engines),
3082	};
3083
3084	if (intel_gt_is_wedged(&i915->gt))
3085		return 0;
3086
3087	return i915_subtests(tests, i915);
3088}
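/*
 * These perf selftests are reached through the common i915 selftest
 * machinery on a CONFIG_DRM_I915_SELFTEST build, typically by loading the
 * module with the (assumed) "i915.perf_selftests=-1" parameter or via IGT;
 * check i915_selftest.c for the exact parameter name on the kernel version
 * in use.
 */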
v6.9.4
   1/*
   2 * Copyright © 2016 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 */
  24
  25#include <linux/prime_numbers.h>
  26#include <linux/pm_qos.h>
  27#include <linux/sort.h>
  28
  29#include "gem/i915_gem_internal.h"
  30#include "gem/i915_gem_pm.h"
  31#include "gem/selftests/mock_context.h"
  32
  33#include "gt/intel_engine_heartbeat.h"
  34#include "gt/intel_engine_pm.h"
  35#include "gt/intel_engine_user.h"
  36#include "gt/intel_gt.h"
  37#include "gt/intel_gt_clock_utils.h"
  38#include "gt/intel_gt_requests.h"
  39#include "gt/selftest_engine_heartbeat.h"
  40
  41#include "i915_random.h"
  42#include "i915_selftest.h"
  43#include "igt_flush_test.h"
  44#include "igt_live_test.h"
  45#include "igt_spinner.h"
  46#include "lib_sw_fence.h"
  47
  48#include "mock_drm.h"
  49#include "mock_gem_device.h"
  50
  51static unsigned int num_uabi_engines(struct drm_i915_private *i915)
  52{
  53	struct intel_engine_cs *engine;
  54	unsigned int count;
  55
  56	count = 0;
  57	for_each_uabi_engine(engine, i915)
  58		count++;
  59
  60	return count;
  61}
  62
  63static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
  64{
  65	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
  66}
  67
  68static int igt_add_request(void *arg)
  69{
  70	struct drm_i915_private *i915 = arg;
  71	struct i915_request *request;
  72
  73	/* Basic preliminary test to create a request and let it loose! */
  74
  75	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
  76	if (!request)
  77		return -ENOMEM;
  78
  79	i915_request_add(request);
  80
  81	return 0;
  82}
  83
  84static int igt_wait_request(void *arg)
  85{
  86	const long T = HZ / 4;
  87	struct drm_i915_private *i915 = arg;
  88	struct i915_request *request;
  89	int err = -EINVAL;
  90
  91	/* Submit a request, then wait upon it */
  92
  93	request = mock_request(rcs0(i915)->kernel_context, T);
  94	if (!request)
  95		return -ENOMEM;
  96
  97	i915_request_get(request);
  98
  99	if (i915_request_wait(request, 0, 0) != -ETIME) {
 100		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
 101		goto out_request;
 102	}
 103
 104	if (i915_request_wait(request, 0, T) != -ETIME) {
 105		pr_err("request wait succeeded (expected timeout before submit!)\n");
 106		goto out_request;
 107	}
 108
 109	if (i915_request_completed(request)) {
 110		pr_err("request completed before submit!!\n");
 111		goto out_request;
 112	}
 113
 114	i915_request_add(request);
 115
 116	if (i915_request_wait(request, 0, 0) != -ETIME) {
 117		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
 118		goto out_request;
 119	}
 120
 121	if (i915_request_completed(request)) {
 122		pr_err("request completed immediately!\n");
 123		goto out_request;
 124	}
 125
 126	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
 127		pr_err("request wait succeeded (expected timeout!)\n");
 128		goto out_request;
 129	}
 130
 131	if (i915_request_wait(request, 0, T) == -ETIME) {
 132		pr_err("request wait timed out!\n");
 133		goto out_request;
 134	}
 135
 136	if (!i915_request_completed(request)) {
 137		pr_err("request not complete after waiting!\n");
 138		goto out_request;
 139	}
 140
 141	if (i915_request_wait(request, 0, T) == -ETIME) {
 142		pr_err("request wait timed out when already complete!\n");
 143		goto out_request;
 144	}
 145
 146	err = 0;
 147out_request:
 148	i915_request_put(request);
 149	mock_device_flush(i915);
 150	return err;
 151}
 152
 153static int igt_fence_wait(void *arg)
 154{
 155	const long T = HZ / 4;
 156	struct drm_i915_private *i915 = arg;
 157	struct i915_request *request;
 158	int err = -EINVAL;
 159
 160	/* Submit a request, treat it as a fence and wait upon it */
 161
 162	request = mock_request(rcs0(i915)->kernel_context, T);
 163	if (!request)
 164		return -ENOMEM;
 165
 166	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
 167		pr_err("fence wait success before submit (expected timeout)!\n");
 168		goto out;
 169	}
 170
 171	i915_request_add(request);
 172
 173	if (dma_fence_is_signaled(&request->fence)) {
 174		pr_err("fence signaled immediately!\n");
 175		goto out;
 176	}
 177
 178	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
 179		pr_err("fence wait success after submit (expected timeout)!\n");
 180		goto out;
 181	}
 182
 183	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
 184		pr_err("fence wait timed out (expected success)!\n");
 185		goto out;
 186	}
 187
 188	if (!dma_fence_is_signaled(&request->fence)) {
 189		pr_err("fence unsignaled after waiting!\n");
 190		goto out;
 191	}
 192
 193	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
 194		pr_err("fence wait timed out when complete (expected success)!\n");
 195		goto out;
 196	}
 197
 198	err = 0;
 199out:
 200	mock_device_flush(i915);
 201	return err;
 202}
 203
 204static int igt_request_rewind(void *arg)
 205{
 206	struct drm_i915_private *i915 = arg;
 207	struct i915_request *request, *vip;
 208	struct i915_gem_context *ctx[2];
 209	struct intel_context *ce;
 210	int err = -EINVAL;
 211
 212	ctx[0] = mock_context(i915, "A");
 213	if (!ctx[0]) {
 214		err = -ENOMEM;
 215		goto err_ctx_0;
 216	}
 217
 218	ce = i915_gem_context_get_engine(ctx[0], RCS0);
 219	GEM_BUG_ON(IS_ERR(ce));
 220	request = mock_request(ce, 2 * HZ);
 221	intel_context_put(ce);
 222	if (!request) {
 223		err = -ENOMEM;
 224		goto err_context_0;
 225	}
 226
 227	i915_request_get(request);
 228	i915_request_add(request);
 229
 230	ctx[1] = mock_context(i915, "B");
 231	if (!ctx[1]) {
 232		err = -ENOMEM;
 233		goto err_ctx_1;
 234	}
 235
 236	ce = i915_gem_context_get_engine(ctx[1], RCS0);
 237	GEM_BUG_ON(IS_ERR(ce));
 238	vip = mock_request(ce, 0);
 239	intel_context_put(ce);
 240	if (!vip) {
 241		err = -ENOMEM;
 242		goto err_context_1;
 243	}
 244
 245	/* Simulate preemption by manual reordering */
 246	if (!mock_cancel_request(request)) {
 247		pr_err("failed to cancel request (already executed)!\n");
 248		i915_request_add(vip);
 249		goto err_context_1;
 250	}
 251	i915_request_get(vip);
 252	i915_request_add(vip);
 253	rcu_read_lock();
 254	request->engine->submit_request(request);
 255	rcu_read_unlock();
 256
 257
 258	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
 259		pr_err("timed out waiting for high priority request\n");
 260		goto err;
 261	}
 262
 263	if (i915_request_completed(request)) {
 264		pr_err("low priority request already completed\n");
 265		goto err;
 266	}
 267
 268	err = 0;
 269err:
 270	i915_request_put(vip);
 271err_context_1:
 272	mock_context_close(ctx[1]);
 273err_ctx_1:
 274	i915_request_put(request);
 275err_context_0:
 276	mock_context_close(ctx[0]);
 277err_ctx_0:
 278	mock_device_flush(i915);
 279	return err;
 280}
 281
 282struct smoketest {
 283	struct intel_engine_cs *engine;
 284	struct i915_gem_context **contexts;
 285	atomic_long_t num_waits, num_fences;
 286	int ncontexts, max_batch;
 287	struct i915_request *(*request_alloc)(struct intel_context *ce);
 288};
 289
 290static struct i915_request *
 291__mock_request_alloc(struct intel_context *ce)
 292{
 293	return mock_request(ce, 0);
 294}
 295
 296static struct i915_request *
 297__live_request_alloc(struct intel_context *ce)
 298{
 299	return intel_context_create_request(ce);
 300}
 301
 302struct smoke_thread {
 303	struct kthread_worker *worker;
 304	struct kthread_work work;
 305	struct smoketest *t;
 306	bool stop;
 307	int result;
 308};
 309
 310static void __igt_breadcrumbs_smoketest(struct kthread_work *work)
 311{
 312	struct smoke_thread *thread = container_of(work, typeof(*thread), work);
 313	struct smoketest *t = thread->t;
 314	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
 315	const unsigned int total = 4 * t->ncontexts + 1;
 316	unsigned int num_waits = 0, num_fences = 0;
 317	struct i915_request **requests;
 318	I915_RND_STATE(prng);
 319	unsigned int *order;
 320	int err = 0;
 321
 322	/*
 323	 * A very simple test to catch the most egregious of list handling bugs.
 324	 *
 325	 * At its heart, we simply create oodles of requests running across
 326	 * multiple kthreads and enable signaling on them, for the sole purpose
 327	 * of stressing our breadcrumb handling. The only inspection we do is
 328	 * that the fences were marked as signaled.
 329	 */
 330
 331	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
 332	if (!requests) {
 333		thread->result = -ENOMEM;
 334		return;
 335	}
 336
 337	order = i915_random_order(total, &prng);
 338	if (!order) {
 339		err = -ENOMEM;
 340		goto out_requests;
 341	}
 342
 343	while (!READ_ONCE(thread->stop)) {
 344		struct i915_sw_fence *submit, *wait;
 345		unsigned int n, count;
 346
 347		submit = heap_fence_create(GFP_KERNEL);
 348		if (!submit) {
 349			err = -ENOMEM;
 350			break;
 351		}
 352
 353		wait = heap_fence_create(GFP_KERNEL);
 354		if (!wait) {
 355			i915_sw_fence_commit(submit);
 356			heap_fence_put(submit);
 357			err = -ENOMEM;
 358			break;
 359		}
 360
 361		i915_random_reorder(order, total, &prng);
 362		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
 363
 364		for (n = 0; n < count; n++) {
 365			struct i915_gem_context *ctx =
 366				t->contexts[order[n] % t->ncontexts];
 367			struct i915_request *rq;
 368			struct intel_context *ce;
 369
 370			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
 371			GEM_BUG_ON(IS_ERR(ce));
 372			rq = t->request_alloc(ce);
 373			intel_context_put(ce);
 374			if (IS_ERR(rq)) {
 375				err = PTR_ERR(rq);
 376				count = n;
 377				break;
 378			}
 379
 380			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
 381							       submit,
 382							       GFP_KERNEL);
 383
 384			requests[n] = i915_request_get(rq);
 385			i915_request_add(rq);
 386
 387			if (err >= 0)
 388				err = i915_sw_fence_await_dma_fence(wait,
 389								    &rq->fence,
 390								    0,
 391								    GFP_KERNEL);
 392
 393			if (err < 0) {
 394				i915_request_put(rq);
 395				count = n;
 396				break;
 397			}
 398		}
 399
 400		i915_sw_fence_commit(submit);
 401		i915_sw_fence_commit(wait);
 402
 403		if (!wait_event_timeout(wait->wait,
 404					i915_sw_fence_done(wait),
 405					5 * HZ)) {
 406			struct i915_request *rq = requests[count - 1];
 407
 408			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
 409			       atomic_read(&wait->pending), count,
 410			       rq->fence.context, rq->fence.seqno,
 411			       t->engine->name);
 412			GEM_TRACE_DUMP();
 413
 414			intel_gt_set_wedged(t->engine->gt);
 415			GEM_BUG_ON(!i915_request_completed(rq));
 416			i915_sw_fence_wait(wait);
 417			err = -EIO;
 418		}
 419
 420		for (n = 0; n < count; n++) {
 421			struct i915_request *rq = requests[n];
 422
 423			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 424				      &rq->fence.flags)) {
 425				pr_err("%llu:%llu was not signaled!\n",
 426				       rq->fence.context, rq->fence.seqno);
 427				err = -EINVAL;
 428			}
 429
 430			i915_request_put(rq);
 431		}
 432
 433		heap_fence_put(wait);
 434		heap_fence_put(submit);
 435
 436		if (err < 0)
 437			break;
 438
 439		num_fences += count;
 440		num_waits++;
 441
 442		cond_resched();
 443	}
 444
 445	atomic_long_add(num_fences, &t->num_fences);
 446	atomic_long_add(num_waits, &t->num_waits);
 447
 448	kfree(order);
 449out_requests:
 450	kfree(requests);
 451	thread->result = err;
 452}
 453
 454static int mock_breadcrumbs_smoketest(void *arg)
 455{
 456	struct drm_i915_private *i915 = arg;
 457	struct smoketest t = {
 458		.engine = rcs0(i915),
 459		.ncontexts = 1024,
 460		.max_batch = 1024,
 461		.request_alloc = __mock_request_alloc
 462	};
 463	unsigned int ncpus = num_online_cpus();
 464	struct smoke_thread *threads;
 465	unsigned int n;
 466	int ret = 0;
 467
 468	/*
 469	 * Smoketest our breadcrumb/signal handling for requests across multiple
 470	 * threads. A very simple test to only catch the most egregious of bugs.
 471	 * See __igt_breadcrumbs_smoketest();
 472	 */
 473
 474	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
 475	if (!threads)
 476		return -ENOMEM;
 477
 478	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
 479	if (!t.contexts) {
 480		ret = -ENOMEM;
 481		goto out_threads;
 482	}
 483
 484	for (n = 0; n < t.ncontexts; n++) {
 485		t.contexts[n] = mock_context(t.engine->i915, "mock");
 486		if (!t.contexts[n]) {
 487			ret = -ENOMEM;
 488			goto out_contexts;
 489		}
 490	}
 491
 492	for (n = 0; n < ncpus; n++) {
 493		struct kthread_worker *worker;
 494
 495		worker = kthread_create_worker(0, "igt/%d", n);
 496		if (IS_ERR(worker)) {
 497			ret = PTR_ERR(worker);
 498			ncpus = n;
 499			break;
 500		}
 501
 502		threads[n].worker = worker;
 503		threads[n].t = &t;
 504		threads[n].stop = false;
 505		threads[n].result = 0;
 506
 507		kthread_init_work(&threads[n].work,
 508				  __igt_breadcrumbs_smoketest);
 509		kthread_queue_work(worker, &threads[n].work);
 510	}
 511
 512	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
 513
 514	for (n = 0; n < ncpus; n++) {
 515		int err;
 516
 517		WRITE_ONCE(threads[n].stop, true);
 518		kthread_flush_work(&threads[n].work);
 519		err = READ_ONCE(threads[n].result);
 520		if (err < 0 && !ret)
 521			ret = err;
 522
 523		kthread_destroy_worker(threads[n].worker);
 524	}
 525	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
 526		atomic_long_read(&t.num_waits),
 527		atomic_long_read(&t.num_fences),
 528		ncpus);
 529
 530out_contexts:
 531	for (n = 0; n < t.ncontexts; n++) {
 532		if (!t.contexts[n])
 533			break;
 534		mock_context_close(t.contexts[n]);
 535	}
 536	kfree(t.contexts);
 537out_threads:
 538	kfree(threads);
 539	return ret;
 540}
 541
 542int i915_request_mock_selftests(void)
 543{
 544	static const struct i915_subtest tests[] = {
 545		SUBTEST(igt_add_request),
 546		SUBTEST(igt_wait_request),
 547		SUBTEST(igt_fence_wait),
 548		SUBTEST(igt_request_rewind),
 549		SUBTEST(mock_breadcrumbs_smoketest),
 550	};
 551	struct drm_i915_private *i915;
 552	intel_wakeref_t wakeref;
 553	int err = 0;
 554
 555	i915 = mock_gem_device();
 556	if (!i915)
 557		return -ENOMEM;
 558
 559	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
 560		err = i915_subtests(tests, i915);
 561
 562	mock_destroy_device(i915);
 563
 564	return err;
 565}
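/*
 * The mock selftests above run against mock_gem_device(), a software-only
 * stand-in with no hardware behind it, which is why the requests come from
 * mock_request() and are flushed with mock_device_flush(). The live_* tests
 * that follow exercise the same request machinery against a real GPU.
 */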
 566
 567static int live_nop_request(void *arg)
 568{
 569	struct drm_i915_private *i915 = arg;
 570	struct intel_engine_cs *engine;
 571	struct igt_live_test t;
 572	int err = -ENODEV;
 573
 574	/*
 575	 * Submit various sized batches of empty requests, to each engine
 576	 * (individually), and wait for the batch to complete. We can check
 577	 * the overhead of submitting requests to the hardware.
 578	 */
 579
 580	for_each_uabi_engine(engine, i915) {
 581		unsigned long n, prime;
 582		IGT_TIMEOUT(end_time);
 583		ktime_t times[2] = {};
 584
 585		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 586		if (err)
 587			return err;
 588
 589		intel_engine_pm_get(engine);
 590		for_each_prime_number_from(prime, 1, 8192) {
 591			struct i915_request *request = NULL;
 592
 593			times[1] = ktime_get_raw();
 594
 595			for (n = 0; n < prime; n++) {
 596				i915_request_put(request);
 597				request = i915_request_create(engine->kernel_context);
 598				if (IS_ERR(request))
 599					return PTR_ERR(request);
 600
 601				/*
 602				 * This space is left intentionally blank.
 603				 *
 604				 * We do not actually want to perform any
 605				 * action with this request, we just want
 606				 * to measure the latency in allocation
 607				 * and submission of our breadcrumbs -
 608				 * ensuring that the bare request is sufficient
 609				 * for the system to work (i.e. proper HEAD
 610				 * tracking of the rings, interrupt handling,
 611				 * etc). It also gives us the lowest bounds
 612				 * for latency.
 613				 */
 614
 615				i915_request_get(request);
 616				i915_request_add(request);
 617			}
 618			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
 619			i915_request_put(request);
 620
 621			times[1] = ktime_sub(ktime_get_raw(), times[1]);
 622			if (prime == 1)
 623				times[0] = times[1];
 624
 625			if (__igt_timeout(end_time, NULL))
 626				break;
 627		}
 628		intel_engine_pm_put(engine);
 629
 630		err = igt_live_test_end(&t);
 631		if (err)
 632			return err;
 633
 634		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
 635			engine->name,
 636			ktime_to_ns(times[0]),
 637			prime, div64_u64(ktime_to_ns(times[1]), prime));
 638	}
 639
 640	return err;
 641}
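/*
 * Reading the numbers printed above: times[0] is the wall-clock cost of a
 * single, isolated nop request, while times[1] / prime is the amortised
 * per-request cost once many requests are queued before waiting, so the gap
 * between the two figures is roughly the overhead that batching hides
 * (wakeups, interrupt handling and per-wait bookkeeping).
 */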
 642
 643static int __cancel_inactive(struct intel_engine_cs *engine)
 644{
 645	struct intel_context *ce;
 646	struct igt_spinner spin;
 647	struct i915_request *rq;
 648	int err = 0;
 649
 650	if (igt_spinner_init(&spin, engine->gt))
 651		return -ENOMEM;
 652
 653	ce = intel_context_create(engine);
 654	if (IS_ERR(ce)) {
 655		err = PTR_ERR(ce);
 656		goto out_spin;
 657	}
 658
 659	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
 660	if (IS_ERR(rq)) {
 661		err = PTR_ERR(rq);
 662		goto out_ce;
 663	}
 664
 665	pr_debug("%s: Cancelling inactive request\n", engine->name);
 666	i915_request_cancel(rq, -EINTR);
 667	i915_request_get(rq);
 668	i915_request_add(rq);
 669
 670	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 671		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 672
 673		pr_err("%s: Failed to cancel inactive request\n", engine->name);
 674		intel_engine_dump(engine, &p, "%s\n", engine->name);
 675		err = -ETIME;
 676		goto out_rq;
 677	}
 678
 679	if (rq->fence.error != -EINTR) {
 680		pr_err("%s: fence not cancelled (%u)\n",
 681		       engine->name, rq->fence.error);
 682		err = -EINVAL;
 683	}
 684
 685out_rq:
 686	i915_request_put(rq);
 687out_ce:
 688	intel_context_put(ce);
 689out_spin:
 690	igt_spinner_fini(&spin);
 691	if (err)
 692		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 693	return err;
 694}
 695
 696static int __cancel_active(struct intel_engine_cs *engine)
 697{
 698	struct intel_context *ce;
 699	struct igt_spinner spin;
 700	struct i915_request *rq;
 701	int err = 0;
 702
 703	if (igt_spinner_init(&spin, engine->gt))
 704		return -ENOMEM;
 705
 706	ce = intel_context_create(engine);
 707	if (IS_ERR(ce)) {
 708		err = PTR_ERR(ce);
 709		goto out_spin;
 710	}
 711
 712	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
 713	if (IS_ERR(rq)) {
 714		err = PTR_ERR(rq);
 715		goto out_ce;
 716	}
 717
 718	pr_debug("%s: Cancelling active request\n", engine->name);
 719	i915_request_get(rq);
 720	i915_request_add(rq);
 721	if (!igt_wait_for_spinner(&spin, rq)) {
 722		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 723
 724		pr_err("Failed to start spinner on %s\n", engine->name);
 725		intel_engine_dump(engine, &p, "%s\n", engine->name);
 726		err = -ETIME;
 727		goto out_rq;
 728	}
 729	i915_request_cancel(rq, -EINTR);
 730
 731	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 732		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 733
 734		pr_err("%s: Failed to cancel active request\n", engine->name);
 735		intel_engine_dump(engine, &p, "%s\n", engine->name);
 736		err = -ETIME;
 737		goto out_rq;
 738	}
 739
 740	if (rq->fence.error != -EINTR) {
 741		pr_err("%s: fence not cancelled (%u)\n",
 742		       engine->name, rq->fence.error);
 743		err = -EINVAL;
 744	}
 745
 746out_rq:
 747	i915_request_put(rq);
 748out_ce:
 749	intel_context_put(ce);
 750out_spin:
 751	igt_spinner_fini(&spin);
 752	if (err)
 753		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 754	return err;
 755}
 756
 757static int __cancel_completed(struct intel_engine_cs *engine)
 758{
 759	struct intel_context *ce;
 760	struct igt_spinner spin;
 761	struct i915_request *rq;
 762	int err = 0;
 763
 764	if (igt_spinner_init(&spin, engine->gt))
 765		return -ENOMEM;
 766
 767	ce = intel_context_create(engine);
 768	if (IS_ERR(ce)) {
 769		err = PTR_ERR(ce);
 770		goto out_spin;
 771	}
 772
 773	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
 774	if (IS_ERR(rq)) {
 775		err = PTR_ERR(rq);
 776		goto out_ce;
 777	}
 778	igt_spinner_end(&spin);
 779	i915_request_get(rq);
 780	i915_request_add(rq);
 781
 782	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 783		err = -ETIME;
 784		goto out_rq;
 785	}
 786
 787	pr_debug("%s: Cancelling completed request\n", engine->name);
 788	i915_request_cancel(rq, -EINTR);
 789	if (rq->fence.error) {
 790		pr_err("%s: fence not cancelled (%u)\n",
 791		       engine->name, rq->fence.error);
 792		err = -EINVAL;
 793	}
 794
 795out_rq:
 796	i915_request_put(rq);
 797out_ce:
 798	intel_context_put(ce);
 799out_spin:
 800	igt_spinner_fini(&spin);
 801	if (err)
 802		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 803	return err;
 804}
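/*
 * Together, __cancel_inactive(), __cancel_active() and __cancel_completed()
 * cover cancelling a request at each stage of its life: before it reaches
 * the hardware, while it is spinning on the hardware, and after it has
 * already finished (where cancellation must not set a fence error).
 * __cancel_reset() below adds the non-preemptible case, which can only be
 * resolved by resetting the engine.
 */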
 805
 806/*
 807 * Test to prove that a non-preemptible request can be cancelled and that a
 808 * subsequent request on the same context can complete after the cancellation.
 809 *
 810 * Testing methodology: create a non-preemptible request and submit it, wait
 811 * for the spinner to start, create a NOP request and submit it, cancel the
 812 * spinner, wait for the spinner to complete and verify it failed with an
 813 * error, and finally wait for the NOP request to complete and verify that it
 814 * succeeded without an error. The preemption timeout is reduced, and later
 815 * restored, so that the test runs in a timely manner.
 816 */
 817static int __cancel_reset(struct drm_i915_private *i915,
 818			  struct intel_engine_cs *engine)
 819{
 820	struct intel_context *ce;
 821	struct igt_spinner spin;
 822	struct i915_request *rq, *nop;
 823	unsigned long preempt_timeout_ms;
 824	int err = 0;
 825
 826	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
 827	    !intel_has_reset_engine(engine->gt))
 828		return 0;
 829
 830	preempt_timeout_ms = engine->props.preempt_timeout_ms;
 831	engine->props.preempt_timeout_ms = 100;
 832
 833	if (igt_spinner_init(&spin, engine->gt))
 834		goto out_restore;
 835
 836	ce = intel_context_create(engine);
 837	if (IS_ERR(ce)) {
 838		err = PTR_ERR(ce);
 839		goto out_spin;
 840	}
 841
 842	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
 843	if (IS_ERR(rq)) {
 844		err = PTR_ERR(rq);
 845		goto out_ce;
 846	}
 847
 848	pr_debug("%s: Cancelling active non-preemptable request\n",
 849		 engine->name);
 850	i915_request_get(rq);
 851	i915_request_add(rq);
 852	if (!igt_wait_for_spinner(&spin, rq)) {
 853		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 854
 855		pr_err("Failed to start spinner on %s\n", engine->name);
 856		intel_engine_dump(engine, &p, "%s\n", engine->name);
 857		err = -ETIME;
 858		goto out_rq;
 859	}
 860
 861	nop = intel_context_create_request(ce);
 862	if (IS_ERR(nop)) {
 863		err = PTR_ERR(nop);
		goto out_rq;
	}
 864	i915_request_get(nop);
 865	i915_request_add(nop);
 866
 867	i915_request_cancel(rq, -EINTR);
 868
 869	if (i915_request_wait(rq, 0, HZ) < 0) {
 870		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 871
 872		pr_err("%s: Failed to cancel hung request\n", engine->name);
 873		intel_engine_dump(engine, &p, "%s\n", engine->name);
 874		err = -ETIME;
 875		goto out_nop;
 876	}
 877
 878	if (rq->fence.error != -EINTR) {
 879		pr_err("%s: fence not cancelled (%u)\n",
 880		       engine->name, rq->fence.error);
 881		err = -EINVAL;
 882		goto out_nop;
 883	}
 884
 885	if (i915_request_wait(nop, 0, HZ) < 0) {
 886		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
 887
 888		pr_err("%s: Failed to complete nop request\n", engine->name);
 889		intel_engine_dump(engine, &p, "%s\n", engine->name);
 890		err = -ETIME;
 891		goto out_nop;
 892	}
 893
 894	if (nop->fence.error != 0) {
 895		pr_err("%s: Nop request errored (%u)\n",
 896		       engine->name, nop->fence.error);
 897		err = -EINVAL;
 898	}
 899
 900out_nop:
 901	i915_request_put(nop);
 902out_rq:
 903	i915_request_put(rq);
 904out_ce:
 905	intel_context_put(ce);
 906out_spin:
 907	igt_spinner_fini(&spin);
 908out_restore:
 909	engine->props.preempt_timeout_ms = preempt_timeout_ms;
 910	if (err)
 911		pr_err("%s: %s error %d\n", __func__, engine->name, err);
 912	return err;
 913}
 914
 915static int live_cancel_request(void *arg)
 916{
 917	struct drm_i915_private *i915 = arg;
 918	struct intel_engine_cs *engine;
 919
 920	/*
 921	 * Check cancellation of requests. We expect to be able to immediately
 922	 * cancel active requests, even if they are currently on the GPU.
 923	 */
 924
 925	for_each_uabi_engine(engine, i915) {
 926		struct igt_live_test t;
 927		int err, err2;
 928
 929		if (!intel_engine_has_preemption(engine))
 930			continue;
 931
 932		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 933		if (err)
 934			return err;
 935
 936		err = __cancel_inactive(engine);
 937		if (err == 0)
 938			err = __cancel_active(engine);
 939		if (err == 0)
 940			err = __cancel_completed(engine);
 941
 942		err2 = igt_live_test_end(&t);
 943		if (err)
 944			return err;
 945		if (err2)
 946			return err2;
 947
 948		/* Expects reset so call outside of igt_live_test_* */
 949		err = __cancel_reset(i915, engine);
 950		if (err)
 951			return err;
 952
 953		if (igt_flush_test(i915))
 954			return -EIO;
 955	}
 956
 957	return 0;
 958}
 959
 960static struct i915_vma *empty_batch(struct intel_gt *gt)
 961{
 962	struct drm_i915_gem_object *obj;
 963	struct i915_vma *vma;
 964	u32 *cmd;
 965	int err;
 966
 967	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
 968	if (IS_ERR(obj))
 969		return ERR_CAST(obj);
 970
 971	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
 972	if (IS_ERR(cmd)) {
 973		err = PTR_ERR(cmd);
 974		goto err;
 975	}
 976
 977	*cmd = MI_BATCH_BUFFER_END;
 978
 979	__i915_gem_object_flush_map(obj, 0, 64);
 980	i915_gem_object_unpin_map(obj);
 981
 982	intel_gt_chipset_flush(gt);
 983
 984	vma = i915_vma_instance(obj, gt->vm, NULL);
 985	if (IS_ERR(vma)) {
 986		err = PTR_ERR(vma);
 987		goto err;
 988	}
 989
 990	err = i915_vma_pin(vma, 0, 0, PIN_USER);
 991	if (err)
 992		goto err;
 993
 994	/* Force the wait now to avoid including it in the benchmark */
 995	err = i915_vma_sync(vma);
 996	if (err)
 997		goto err_pin;
 998
 999	return vma;
1000
1001err_pin:
1002	i915_vma_unpin(vma);
1003err:
1004	i915_gem_object_put(obj);
1005	return ERR_PTR(err);
1006}
1007
1008static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch)
1009{
1010	return rq->engine->emit_bb_start(rq,
1011					 i915_vma_offset(batch),
1012					 i915_vma_size(batch),
1013					 0);
1014}
1015
1016static struct i915_request *
1017empty_request(struct intel_engine_cs *engine,
1018	      struct i915_vma *batch)
1019{
1020	struct i915_request *request;
1021	int err;
1022
1023	request = i915_request_create(engine->kernel_context);
1024	if (IS_ERR(request))
1025		return request;
1026
1027	err = emit_bb_start(request, batch);
1028	if (err)
1029		goto out_request;
1030
1031	i915_request_get(request);
1032out_request:
1033	i915_request_add(request);
1034	return err ? ERR_PTR(err) : request;
1035}
1036
1037static int live_empty_request(void *arg)
1038{
1039	struct drm_i915_private *i915 = arg;
1040	struct intel_engine_cs *engine;
1041	struct igt_live_test t;
1042	int err;
1043
1044	/*
1045	 * Submit various numbers of requests, each with an empty batch buffer,
1046	 * to each engine (individually), and wait for them to complete. We can
1047	 * check the overhead of submitting and executing a minimal batch.
1048	 */
1049
1050	for_each_uabi_engine(engine, i915) {
1051		IGT_TIMEOUT(end_time);
1052		struct i915_request *request;
1053		struct i915_vma *batch;
1054		unsigned long n, prime;
1055		ktime_t times[2] = {};
1056
1057		batch = empty_batch(engine->gt);
1058		if (IS_ERR(batch))
1059			return PTR_ERR(batch);
1060
1061		err = igt_live_test_begin(&t, i915, __func__, engine->name);
1062		if (err)
1063			goto out_batch;
1064
1065		intel_engine_pm_get(engine);
1066
1067		/* Warmup / preload */
1068		request = empty_request(engine, batch);
1069		if (IS_ERR(request)) {
1070			err = PTR_ERR(request);
1071			intel_engine_pm_put(engine);
1072			goto out_batch;
1073		}
1074		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1075
1076		for_each_prime_number_from(prime, 1, 8192) {
1077			times[1] = ktime_get_raw();
1078
1079			for (n = 0; n < prime; n++) {
1080				i915_request_put(request);
1081				request = empty_request(engine, batch);
1082				if (IS_ERR(request)) {
1083					err = PTR_ERR(request);
1084					intel_engine_pm_put(engine);
1085					goto out_batch;
1086				}
1087			}
1088			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1089
1090			times[1] = ktime_sub(ktime_get_raw(), times[1]);
1091			if (prime == 1)
1092				times[0] = times[1];
1093
1094			if (__igt_timeout(end_time, NULL))
1095				break;
1096		}
1097		i915_request_put(request);
1098		intel_engine_pm_put(engine);
1099
1100		err = igt_live_test_end(&t);
1101		if (err)
1102			goto out_batch;
1103
1104		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
1105			engine->name,
1106			ktime_to_ns(times[0]),
1107			prime, div64_u64(ktime_to_ns(times[1]), prime));
1108out_batch:
1109		i915_vma_unpin(batch);
1110		i915_vma_put(batch);
1111		if (err)
1112			break;
1113	}
1114
1115	return err;
1116}
1117
1118static struct i915_vma *recursive_batch(struct intel_gt *gt)
1119{
1120	struct drm_i915_gem_object *obj;
1121	const int ver = GRAPHICS_VER(gt->i915);
1122	struct i915_vma *vma;
1123	u32 *cmd;
1124	int err;
1125
1126	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
1127	if (IS_ERR(obj))
1128		return ERR_CAST(obj);
1129
1130	vma = i915_vma_instance(obj, gt->vm, NULL);
1131	if (IS_ERR(vma)) {
1132		err = PTR_ERR(vma);
1133		goto err;
1134	}
1135
1136	err = i915_vma_pin(vma, 0, 0, PIN_USER);
1137	if (err)
1138		goto err;
1139
1140	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
1141	if (IS_ERR(cmd)) {
1142		err = PTR_ERR(cmd);
1143		goto err;
1144	}
1145
1146	if (ver >= 8) {
1147		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1148		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1149		*cmd++ = upper_32_bits(i915_vma_offset(vma));
1150	} else if (ver >= 6) {
1151		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1152		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1153	} else {
1154		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1155		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1156	}
1157	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1158
1159	__i915_gem_object_flush_map(obj, 0, 64);
1160	i915_gem_object_unpin_map(obj);
1161
1162	intel_gt_chipset_flush(gt);
1163
1164	return vma;
1165
1166err:
1167	i915_gem_object_put(obj);
1168	return ERR_PTR(err);
1169}
1170
1171static int recursive_batch_resolve(struct i915_vma *batch)
1172{
1173	u32 *cmd;
1174
1175	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1176	if (IS_ERR(cmd))
1177		return PTR_ERR(cmd);
1178
1179	*cmd = MI_BATCH_BUFFER_END;
1180
1181	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1182	i915_gem_object_unpin_map(batch->obj);
1183
1184	intel_gt_chipset_flush(batch->vm->gt);
1185
1186	return 0;
1187}
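/*
 * The two helpers above form a pair used by the live engine tests below:
 * recursive_batch() emits a batch whose first instruction is an
 * MI_BATCH_BUFFER_START jumping back to itself, so any request executing it
 * spins on the GPU indefinitely, while recursive_batch_resolve() later
 * overwrites that first dword with MI_BATCH_BUFFER_END so the looping batch
 * terminates and the request can complete.
 */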
1188
1189static int live_all_engines(void *arg)
1190{
1191	struct drm_i915_private *i915 = arg;
1192	const unsigned int nengines = num_uabi_engines(i915);
1193	struct intel_engine_cs *engine;
1194	struct i915_request **request;
1195	struct igt_live_test t;
1196	unsigned int idx;
1197	int err;
1198
1199	/*
1200	 * Check we can submit requests to all engines simultaneously. We
1201	 * send a recursive batch to each engine - checking that we don't
1202	 * block doing so, and that they don't complete too soon.
1203	 */
1204
1205	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1206	if (!request)
1207		return -ENOMEM;
1208
1209	err = igt_live_test_begin(&t, i915, __func__, "");
1210	if (err)
1211		goto out_free;
1212
1213	idx = 0;
1214	for_each_uabi_engine(engine, i915) {
1215		struct i915_vma *batch;
1216
1217		batch = recursive_batch(engine->gt);
1218		if (IS_ERR(batch)) {
1219			err = PTR_ERR(batch);
1220			pr_err("%s: Unable to create batch, err=%d\n",
1221			       __func__, err);
1222			goto out_free;
1223		}
1224
1225		i915_vma_lock(batch);
1226		request[idx] = intel_engine_create_kernel_request(engine);
1227		if (IS_ERR(request[idx])) {
1228			err = PTR_ERR(request[idx]);
1229			pr_err("%s: Request allocation failed with err=%d\n",
1230			       __func__, err);
1231			goto out_unlock;
1232		}
1233		GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1234
1235		err = i915_vma_move_to_active(batch, request[idx], 0);
1236		GEM_BUG_ON(err);
1237
1238		err = emit_bb_start(request[idx], batch);
1239		GEM_BUG_ON(err);
1240		request[idx]->batch = batch;
1241
1242		i915_request_get(request[idx]);
1243		i915_request_add(request[idx]);
1244		idx++;
1245out_unlock:
1246		i915_vma_unlock(batch);
1247		if (err)
1248			goto out_request;
1249	}
1250
1251	idx = 0;
1252	for_each_uabi_engine(engine, i915) {
1253		if (i915_request_completed(request[idx])) {
1254			pr_err("%s(%s): request completed too early!\n",
1255			       __func__, engine->name);
1256			err = -EINVAL;
1257			goto out_request;
1258		}
1259		idx++;
1260	}
1261
1262	idx = 0;
1263	for_each_uabi_engine(engine, i915) {
1264		err = recursive_batch_resolve(request[idx]->batch);
1265		if (err) {
1266			pr_err("%s: failed to resolve batch, err=%d\n",
1267			       __func__, err);
1268			goto out_request;
1269		}
1270		idx++;
1271	}
1272
1273	idx = 0;
1274	for_each_uabi_engine(engine, i915) {
1275		struct i915_request *rq = request[idx];
1276		long timeout;
1277
1278		timeout = i915_request_wait(rq, 0,
1279					    MAX_SCHEDULE_TIMEOUT);
1280		if (timeout < 0) {
1281			err = timeout;
1282			pr_err("%s: error waiting for request on %s, err=%d\n",
1283			       __func__, engine->name, err);
1284			goto out_request;
1285		}
1286
1287		GEM_BUG_ON(!i915_request_completed(rq));
1288		i915_vma_unpin(rq->batch);
1289		i915_vma_put(rq->batch);
1290		i915_request_put(rq);
1291		request[idx] = NULL;
1292		idx++;
1293	}
1294
1295	err = igt_live_test_end(&t);
1296
1297out_request:
1298	idx = 0;
1299	for_each_uabi_engine(engine, i915) {
1300		struct i915_request *rq = request[idx];
1301
1302		if (!rq)
1303			continue;
1304
1305		if (rq->batch) {
1306			i915_vma_unpin(rq->batch);
1307			i915_vma_put(rq->batch);
1308		}
1309		i915_request_put(rq);
1310		idx++;
1311	}
1312out_free:
1313	kfree(request);
1314	return err;
1315}
1316
1317static int live_sequential_engines(void *arg)
1318{
1319	struct drm_i915_private *i915 = arg;
1320	const unsigned int nengines = num_uabi_engines(i915);
1321	struct i915_request **request;
1322	struct i915_request *prev = NULL;
1323	struct intel_engine_cs *engine;
1324	struct igt_live_test t;
1325	unsigned int idx;
1326	int err;
1327
1328	/*
1329	 * Check we can submit requests to all engines sequentially, such
1330	 * that each successive request waits for the earlier ones. This
1331	 * tests that we don't execute requests out of order, even though
1332	 * they are running on independent engines.
1333	 */
1334
1335	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1336	if (!request)
1337		return -ENOMEM;
1338
1339	err = igt_live_test_begin(&t, i915, __func__, "");
1340	if (err)
1341		goto out_free;
1342
1343	idx = 0;
1344	for_each_uabi_engine(engine, i915) {
1345		struct i915_vma *batch;
1346
1347		batch = recursive_batch(engine->gt);
1348		if (IS_ERR(batch)) {
1349			err = PTR_ERR(batch);
1350			pr_err("%s: Unable to create batch for %s, err=%d\n",
1351			       __func__, engine->name, err);
1352			goto out_free;
1353		}
1354
1355		i915_vma_lock(batch);
1356		request[idx] = intel_engine_create_kernel_request(engine);
1357		if (IS_ERR(request[idx])) {
1358			err = PTR_ERR(request[idx]);
1359			pr_err("%s: Request allocation failed for %s with err=%d\n",
1360			       __func__, engine->name, err);
1361			goto out_unlock;
1362		}
1363		GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1364
1365		if (prev) {
1366			err = i915_request_await_dma_fence(request[idx],
1367							   &prev->fence);
1368			if (err) {
1369				i915_request_add(request[idx]);
1370				pr_err("%s: Request await failed for %s with err=%d\n",
1371				       __func__, engine->name, err);
1372				goto out_unlock;
1373			}
1374		}
1375
1376		err = i915_vma_move_to_active(batch, request[idx], 0);
1377		GEM_BUG_ON(err);
1378
1379		err = emit_bb_start(request[idx], batch);
1380		GEM_BUG_ON(err);
1381		request[idx]->batch = batch;
1382
1383		i915_request_get(request[idx]);
1384		i915_request_add(request[idx]);
1385
1386		prev = request[idx];
1387		idx++;
1388
1389out_unlock:
1390		i915_vma_unlock(batch);
1391		if (err)
1392			goto out_request;
1393	}
1394
1395	idx = 0;
1396	for_each_uabi_engine(engine, i915) {
1397		long timeout;
1398
1399		if (i915_request_completed(request[idx])) {
1400			pr_err("%s(%s): request completed too early!\n",
1401			       __func__, engine->name);
1402			err = -EINVAL;
1403			goto out_request;
1404		}
1405
1406		err = recursive_batch_resolve(request[idx]->batch);
1407		if (err) {
1408			pr_err("%s: failed to resolve batch, err=%d\n",
1409			       __func__, err);
1410			goto out_request;
1411		}
1412
1413		timeout = i915_request_wait(request[idx], 0,
1414					    MAX_SCHEDULE_TIMEOUT);
1415		if (timeout < 0) {
1416			err = timeout;
1417			pr_err("%s: error waiting for request on %s, err=%d\n",
1418			       __func__, engine->name, err);
1419			goto out_request;
1420		}
1421
1422		GEM_BUG_ON(!i915_request_completed(request[idx]));
1423		idx++;
1424	}
1425
1426	err = igt_live_test_end(&t);
1427
1428out_request:
1429	idx = 0;
1430	for_each_uabi_engine(engine, i915) {
1431		u32 *cmd;
1432
1433		if (!request[idx])
1434			break;
1435
1436		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1437						       I915_MAP_WC);
1438		if (!IS_ERR(cmd)) {
1439			*cmd = MI_BATCH_BUFFER_END;
1440
1441			__i915_gem_object_flush_map(request[idx]->batch->obj,
1442						    0, sizeof(*cmd));
1443			i915_gem_object_unpin_map(request[idx]->batch->obj);
1444
1445			intel_gt_chipset_flush(engine->gt);
1446		}
1447
1448		i915_vma_put(request[idx]->batch);
1449		i915_request_put(request[idx]);
1450		idx++;
1451	}
1452out_free:
1453	kfree(request);
1454	return err;
1455}
1456
1457struct parallel_thread {
1458	struct kthread_worker *worker;
1459	struct kthread_work work;
1460	struct intel_engine_cs *engine;
1461	int result;
1462};
1463
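/*
 * Per-engine workers for live_parallel_engines(): __live_parallel_engine1
 * submits a request and waits for it to complete before sending the next,
 * __live_parallel_engineN submits requests without waiting, and
 * __live_parallel_spin occupies each engine with a spinner to check that
 * the workers really are spread one per engine.
 */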
1464static void __live_parallel_engine1(struct kthread_work *work)
1465{
1466	struct parallel_thread *thread =
1467		container_of(work, typeof(*thread), work);
1468	struct intel_engine_cs *engine = thread->engine;
1469	IGT_TIMEOUT(end_time);
1470	unsigned long count;
1471	int err = 0;
1472
1473	count = 0;
1474	intel_engine_pm_get(engine);
1475	do {
1476		struct i915_request *rq;
1477
1478		rq = i915_request_create(engine->kernel_context);
1479		if (IS_ERR(rq)) {
1480			err = PTR_ERR(rq);
1481			break;
1482		}
1483
1484		i915_request_get(rq);
1485		i915_request_add(rq);
1486
1487		err = 0;
1488		if (i915_request_wait(rq, 0, HZ) < 0)
1489			err = -ETIME;
1490		i915_request_put(rq);
1491		if (err)
1492			break;
1493
1494		count++;
1495	} while (!__igt_timeout(end_time, NULL));
1496	intel_engine_pm_put(engine);
1497
1498	pr_info("%s: %lu request + sync\n", engine->name, count);
1499	thread->result = err;
1500}
1501
1502static void __live_parallel_engineN(struct kthread_work *work)
1503{
1504	struct parallel_thread *thread =
1505		container_of(work, typeof(*thread), work);
1506	struct intel_engine_cs *engine = thread->engine;
1507	IGT_TIMEOUT(end_time);
1508	unsigned long count;
1509	int err = 0;
1510
1511	count = 0;
1512	intel_engine_pm_get(engine);
1513	do {
1514		struct i915_request *rq;
1515
1516		rq = i915_request_create(engine->kernel_context);
1517		if (IS_ERR(rq)) {
1518			err = PTR_ERR(rq);
1519			break;
1520		}
1521
1522		i915_request_add(rq);
1523		count++;
1524	} while (!__igt_timeout(end_time, NULL));
1525	intel_engine_pm_put(engine);
1526
1527	pr_info("%s: %lu requests\n", engine->name, count);
1528	thread->result = err;
1529}
1530
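/*
 * A simple barrier built on i915->selftest.counter: each worker decrements
 * the counter on arrival, and wait_for_all() blocks until every worker has
 * checked in (or the selftest timeout expires).
 */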
1531static bool wake_all(struct drm_i915_private *i915)
1532{
1533	if (atomic_dec_and_test(&i915->selftest.counter)) {
1534		wake_up_var(&i915->selftest.counter);
1535		return true;
1536	}
1537
1538	return false;
1539}
1540
1541static int wait_for_all(struct drm_i915_private *i915)
1542{
1543	if (wake_all(i915))
1544		return 0;
1545
1546	if (wait_var_event_timeout(&i915->selftest.counter,
1547				   !atomic_read(&i915->selftest.counter),
1548				   i915_selftest.timeout_jiffies))
1549		return 0;
1550
1551	return -ETIME;
1552}
1553
1554static void __live_parallel_spin(struct kthread_work *work)
1555{
1556	struct parallel_thread *thread =
1557		container_of(work, typeof(*thread), work);
1558	struct intel_engine_cs *engine = thread->engine;
1559	struct igt_spinner spin;
1560	struct i915_request *rq;
1561	int err = 0;
1562
1563	/*
1564	 * Create a spinner running for eternity on each engine. If a second
1565	 * spinner is incorrectly placed on the same engine, it will not be
1566	 * able to start in time.
1567	 */
1568
1569	if (igt_spinner_init(&spin, engine->gt)) {
1570		wake_all(engine->i915);
1571		thread->result = -ENOMEM;
1572		return;
1573	}
1574
1575	intel_engine_pm_get(engine);
1576	rq = igt_spinner_create_request(&spin,
1577					engine->kernel_context,
1578					MI_NOOP); /* no preemption */
1579	intel_engine_pm_put(engine);
1580	if (IS_ERR(rq)) {
1581		err = PTR_ERR(rq);
1582		if (err == -ENODEV)
1583			err = 0;
1584		wake_all(engine->i915);
1585		goto out_spin;
1586	}
1587
1588	i915_request_get(rq);
1589	i915_request_add(rq);
1590	if (igt_wait_for_spinner(&spin, rq)) {
1591		/* Occupy this engine for the whole test */
1592		err = wait_for_all(engine->i915);
1593	} else {
1594		pr_err("Failed to start spinner on %s\n", engine->name);
1595		err = -EINVAL;
1596	}
1597	igt_spinner_end(&spin);
1598
1599	if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1600		err = -EIO;
1601	i915_request_put(rq);
1602
1603out_spin:
1604	igt_spinner_fini(&spin);
1605	thread->result = err;
1606}
1607
1608static int live_parallel_engines(void *arg)
1609{
1610	struct drm_i915_private *i915 = arg;
1611	static void (* const func[])(struct kthread_work *) = {
1612		__live_parallel_engine1,
1613		__live_parallel_engineN,
1614		__live_parallel_spin,
1615		NULL,
1616	};
1617	const unsigned int nengines = num_uabi_engines(i915);
1618	struct parallel_thread *threads;
1619	struct intel_engine_cs *engine;
1620	void (* const *fn)(struct kthread_work *);
1621	int err = 0;
1622
1623	/*
1624	 * Check we can submit requests to all engines concurrently. This
1625	 * tests that we load up the system maximally.
1626	 */
1627
1628	threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL);
1629	if (!threads)
1630		return -ENOMEM;
1631
1632	for (fn = func; !err && *fn; fn++) {
1633		char name[KSYM_NAME_LEN];
1634		struct igt_live_test t;
1635		unsigned int idx;
1636
1637		snprintf(name, sizeof(name), "%ps", *fn);
1638		err = igt_live_test_begin(&t, i915, __func__, name);
1639		if (err)
1640			break;
1641
1642		atomic_set(&i915->selftest.counter, nengines);
1643
1644		idx = 0;
1645		for_each_uabi_engine(engine, i915) {
1646			struct kthread_worker *worker;
1647
1648			worker = kthread_create_worker(0, "igt/parallel:%s",
1649						       engine->name);
1650			if (IS_ERR(worker)) {
1651				err = PTR_ERR(worker);
1652				break;
1653			}
1654
1655			threads[idx].worker = worker;
1656			threads[idx].result = 0;
1657			threads[idx].engine = engine;
1658
1659			kthread_init_work(&threads[idx].work, *fn);
1660			kthread_queue_work(worker, &threads[idx].work);
1661			idx++;
1662		}
1663
1664		idx = 0;
1665		for_each_uabi_engine(engine, i915) {
1666			int status;
1667
1668			if (!threads[idx].worker)
1669				break;
1670
1671			kthread_flush_work(&threads[idx].work);
1672			status = READ_ONCE(threads[idx].result);
1673			if (status && !err)
1674				err = status;
1675
1676			kthread_destroy_worker(threads[idx++].worker);
1677		}
1678
1679		if (igt_live_test_end(&t))
1680			err = -EIO;
1681	}
1682
1683	kfree(threads);
1684	return err;
1685}
1686
1687static int
1688max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1689{
1690	struct i915_request *rq;
1691	int ret;
1692
1693	/*
1694	 * Before execlists, all contexts share the same ringbuffer. With
1695	 * execlists, each context/engine has a separate ringbuffer and
1696	 * for the purposes of this test, inexhaustible.
1697	 *
1698	 * For the global ringbuffer though, we have to be very careful
1699	 * that we do not wrap while preventing the execution of requests
1700	 * with an unsignaled fence.
1701	 */
1702	if (HAS_EXECLISTS(ctx->i915))
1703		return INT_MAX;
1704
1705	rq = igt_request_alloc(ctx, engine);
1706	if (IS_ERR(rq)) {
1707		ret = PTR_ERR(rq);
1708	} else {
1709		int sz;
1710
1711		ret = rq->ring->size - rq->reserved_space;
1712		i915_request_add(rq);
1713
1714		sz = rq->ring->emit - rq->head;
1715		if (sz < 0)
1716			sz += rq->ring->size;
1717		ret /= sz;
1718		ret /= 2; /* leave half spare, in case of emergency! */
1719	}
1720
1721	return ret;
1722}
1723
1724static int live_breadcrumbs_smoketest(void *arg)
1725{
1726	struct drm_i915_private *i915 = arg;
1727	const unsigned int nengines = num_uabi_engines(i915);
1728	const unsigned int ncpus = /* saturate with nengines * ncpus */
1729		max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines));
1730	unsigned long num_waits, num_fences;
1731	struct intel_engine_cs *engine;
1732	struct smoke_thread *threads;
1733	struct igt_live_test live;
1734	intel_wakeref_t wakeref;
1735	struct smoketest *smoke;
1736	unsigned int n, idx;
1737	struct file *file;
1738	int ret = 0;
1739
1740	/*
1741	 * Smoketest our breadcrumb/signal handling for requests across multiple
1742	 * threads. A very simple test to only catch the most egregious of bugs.
1743	 * See __igt_breadcrumbs_smoketest();
1744	 *
1745	 * On real hardware this time.
1746	 */
1747
1748	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1749
1750	file = mock_file(i915);
1751	if (IS_ERR(file)) {
1752		ret = PTR_ERR(file);
1753		goto out_rpm;
1754	}
1755
1756	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1757	if (!smoke) {
1758		ret = -ENOMEM;
1759		goto out_file;
1760	}
1761
1762	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1763	if (!threads) {
1764		ret = -ENOMEM;
1765		goto out_smoke;
1766	}
1767
1768	smoke[0].request_alloc = __live_request_alloc;
1769	smoke[0].ncontexts = 64;
1770	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1771				    sizeof(*smoke[0].contexts),
1772				    GFP_KERNEL);
1773	if (!smoke[0].contexts) {
1774		ret = -ENOMEM;
1775		goto out_threads;
1776	}
1777
1778	for (n = 0; n < smoke[0].ncontexts; n++) {
1779		smoke[0].contexts[n] = live_context(i915, file);
1780		if (IS_ERR(smoke[0].contexts[n])) {
1781			ret = PTR_ERR(smoke[0].contexts[n]);
1782			goto out_contexts;
1783		}
1784	}
1785
1786	ret = igt_live_test_begin(&live, i915, __func__, "");
1787	if (ret)
1788		goto out_contexts;
1789
1790	idx = 0;
1791	for_each_uabi_engine(engine, i915) {
1792		smoke[idx] = smoke[0];
1793		smoke[idx].engine = engine;
1794		smoke[idx].max_batch =
1795			max_batches(smoke[0].contexts[0], engine);
1796		if (smoke[idx].max_batch < 0) {
1797			ret = smoke[idx].max_batch;
1798			goto out_flush;
1799		}
1800		/* One ring interleaved between requests from all cpus */
1801		smoke[idx].max_batch /= ncpus + 1;
1802		pr_debug("Limiting batches to %d requests on %s\n",
1803			 smoke[idx].max_batch, engine->name);
1804
1805		for (n = 0; n < ncpus; n++) {
1806			unsigned int i = idx * ncpus + n;
1807			struct kthread_worker *worker;
1808
1809			worker = kthread_create_worker(0, "igt/%d.%d", idx, n);
1810			if (IS_ERR(worker)) {
1811				ret = PTR_ERR(worker);
1812				goto out_flush;
1813			}
1814
1815			threads[i].worker = worker;
1816			threads[i].t = &smoke[idx];
1817
1818			kthread_init_work(&threads[i].work,
1819					  __igt_breadcrumbs_smoketest);
1820			kthread_queue_work(worker, &threads[i].work);
1821		}
1822
1823		idx++;
1824	}
1825
1826	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1827
1828out_flush:
1829	idx = 0;
1830	num_waits = 0;
1831	num_fences = 0;
1832	for_each_uabi_engine(engine, i915) {
1833		for (n = 0; n < ncpus; n++) {
1834			unsigned int i = idx * ncpus + n;
1835			int err;
1836
1837			if (!threads[i].worker)
1838				continue;
1839
1840			WRITE_ONCE(threads[i].stop, true);
1841			kthread_flush_work(&threads[i].work);
1842			err = READ_ONCE(threads[i].result);
1843			if (err < 0 && !ret)
1844				ret = err;
1845
1846			kthread_destroy_worker(threads[i].worker);
1847		}
1848
1849		num_waits += atomic_long_read(&smoke[idx].num_waits);
1850		num_fences += atomic_long_read(&smoke[idx].num_fences);
1851		idx++;
1852	}
1853	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1854		num_waits, num_fences, idx, ncpus);
1855
1856	ret = igt_live_test_end(&live) ?: ret;
1857out_contexts:
1858	kfree(smoke[0].contexts);
1859out_threads:
1860	kfree(threads);
1861out_smoke:
1862	kfree(smoke);
1863out_file:
1864	fput(file);
1865out_rpm:
1866	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1867
1868	return ret;
1869}
1870
1871int i915_request_live_selftests(struct drm_i915_private *i915)
1872{
1873	static const struct i915_subtest tests[] = {
1874		SUBTEST(live_nop_request),
1875		SUBTEST(live_all_engines),
1876		SUBTEST(live_sequential_engines),
1877		SUBTEST(live_parallel_engines),
1878		SUBTEST(live_empty_request),
1879		SUBTEST(live_cancel_request),
1880		SUBTEST(live_breadcrumbs_smoketest),
1881	};
1882
1883	if (intel_gt_is_wedged(to_gt(i915)))
1884		return 0;
1885
1886	return i915_live_subtests(tests, i915);
1887}
1888
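/*
 * Queue a kernel-context request behind the context's last request and wait
 * for it, then flush submission until the engine idles. The perf tests use
 * this to make sure the measured context has been switched out before its
 * total runtime is sampled.
 */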
1889static int switch_to_kernel_sync(struct intel_context *ce, int err)
1890{
1891	struct i915_request *rq;
1892	struct dma_fence *fence;
1893
1894	rq = intel_engine_create_kernel_request(ce->engine);
1895	if (IS_ERR(rq))
1896		return PTR_ERR(rq);
1897
1898	fence = i915_active_fence_get(&ce->timeline->last_request);
1899	if (fence) {
1900		i915_request_await_dma_fence(rq, fence);
1901		dma_fence_put(fence);
1902	}
1903
1904	rq = i915_request_get(rq);
1905	i915_request_add(rq);
1906	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1907		err = -ETIME;
1908	i915_request_put(rq);
1909
1910	while (!err && !intel_engine_is_idle(ce->engine))
1911		intel_engine_flush_submission(ce->engine);
1912
1913	return err;
1914}
1915
1916struct perf_stats {
1917	struct intel_engine_cs *engine;
1918	unsigned long count;
1919	ktime_t time;
1920	ktime_t busy;
1921	u64 runtime;
1922};
1923
1924struct perf_series {
1925	struct drm_i915_private *i915;
1926	unsigned int nengines;
1927	struct intel_context *ce[] __counted_by(nengines);
1928};
1929
1930static int cmp_u32(const void *A, const void *B)
1931{
1932	const u32 *a = A, *b = B;
1933
1934	return *a - *b;
1935}
1936
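/*
 * A crude trifilter over the five samples (TF_COUNT): sort the values and
 * sum twice the median plus its two neighbours. The result is scaled by
 * four, which the callers undo with a >> TF_BIAS shift and cycles_to_ns()
 * with its DIV_ROUND_CLOSEST by 1 << TF_BIAS.
 */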
1937static u32 trifilter(u32 *a)
1938{
1939	u64 sum;
1940
1941#define TF_COUNT 5
1942	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1943
1944	sum = mul_u32_u32(a[2], 2);
1945	sum += a[1];
1946	sum += a[3];
1947
1948	GEM_BUG_ON(sum > U32_MAX);
1949	return sum;
1950#define TF_BIAS 2
1951}
1952
1953static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1954{
1955	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1956
1957	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1958}
1959
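/*
 * Helpers to emit the raw MI commands used by the latency probes below: a
 * store of the engine's RING_TIMESTAMP register to a GGTT address, a store
 * of an immediate dword, and a semaphore wait that polls a memory location
 * until it satisfies the requested comparison.
 */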
1960static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1961{
1962	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1963	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1964	*cs++ = offset;
1965	*cs++ = 0;
1966
1967	return cs;
1968}
1969
1970static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1971{
1972	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1973	*cs++ = offset;
1974	*cs++ = 0;
1975	*cs++ = value;
1976
1977	return cs;
1978}
1979
1980static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1981{
1982	*cs++ = MI_SEMAPHORE_WAIT |
1983		MI_SEMAPHORE_GLOBAL_GTT |
1984		MI_SEMAPHORE_POLL |
1985		mode;
1986	*cs++ = value;
1987	*cs++ = offset;
1988	*cs++ = 0;
1989
1990	return cs;
1991}
1992
1993static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1994{
1995	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1996}
1997
1998static void semaphore_set(u32 *sema, u32 value)
1999{
2000	WRITE_ONCE(*sema, value);
2001	wmb(); /* flush the update to the cache, and beyond */
2002}
2003
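/*
 * Carve a small scratch area out of the engine's status page (HWSP) for the
 * semaphores and timestamps written by the measurements, and translate a
 * pointer into that area into the GGTT offset the GPU uses to address it.
 */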
2004static u32 *hwsp_scratch(const struct intel_context *ce)
2005{
2006	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
2007}
2008
2009static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
2010{
2011	return (i915_ggtt_offset(ce->engine->status_page.vma) +
2012		offset_in_page(dw));
2013}
2014
2015static int measure_semaphore_response(struct intel_context *ce)
2016{
2017	u32 *sema = hwsp_scratch(ce);
2018	const u32 offset = hwsp_offset(ce, sema);
2019	u32 elapsed[TF_COUNT], cycles;
2020	struct i915_request *rq;
2021	u32 *cs;
2022	int err;
2023	int i;
2024
2025	/*
2026	 * Measure how many cycles it takes for the HW to detect the change
2027	 * in a semaphore value.
2028	 *
2029	 *    A: read CS_TIMESTAMP from CPU
2030	 *    poke semaphore
2031	 *    B: read CS_TIMESTAMP on GPU
2032	 *
2033	 * Semaphore latency: B - A
2034	 */
2035
2036	semaphore_set(sema, -1);
2037
2038	rq = i915_request_create(ce);
2039	if (IS_ERR(rq))
2040		return PTR_ERR(rq);
2041
2042	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
2043	if (IS_ERR(cs)) {
2044		i915_request_add(rq);
2045		err = PTR_ERR(cs);
2046		goto err;
2047	}
2048
2049	cs = emit_store_dw(cs, offset, 0);
2050	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2051		cs = emit_semaphore_poll_until(cs, offset, i);
2052		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2053		cs = emit_store_dw(cs, offset, 0);
2054	}
2055
2056	intel_ring_advance(rq, cs);
2057	i915_request_add(rq);
2058
2059	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2060		err = -EIO;
2061		goto err;
2062	}
2063
2064	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2065		preempt_disable();
2066		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2067		semaphore_set(sema, i);
2068		preempt_enable();
2069
2070		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2071			err = -EIO;
2072			goto err;
2073		}
2074
2075		elapsed[i - 1] = sema[i] - cycles;
2076	}
2077
2078	cycles = trifilter(elapsed);
2079	pr_info("%s: semaphore response %d cycles, %lluns\n",
2080		ce->engine->name, cycles >> TF_BIAS,
2081		cycles_to_ns(ce->engine, cycles));
2082
2083	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2084
2085err:
2086	intel_gt_set_wedged(ce->engine->gt);
2087	return err;
2088}
2089
2090static int measure_idle_dispatch(struct intel_context *ce)
2091{
2092	u32 *sema = hwsp_scratch(ce);
2093	const u32 offset = hwsp_offset(ce, sema);
2094	u32 elapsed[TF_COUNT], cycles;
2095	u32 *cs;
2096	int err;
2097	int i;
2098
2099	/*
2100	 * Measure how long it takes for us to submit a request while the
2101	 * engine is idle, but is resting in our context.
2102	 *
2103	 *    A: read CS_TIMESTAMP from CPU
2104	 *    submit request
2105	 *    B: read CS_TIMESTAMP on GPU
2106	 *
2107	 * Submission latency: B - A
2108	 */
2109
2110	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2111		struct i915_request *rq;
2112
2113		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2114		if (err)
2115			return err;
2116
2117		rq = i915_request_create(ce);
2118		if (IS_ERR(rq)) {
2119			err = PTR_ERR(rq);
2120			goto err;
2121		}
2122
2123		cs = intel_ring_begin(rq, 4);
2124		if (IS_ERR(cs)) {
2125			i915_request_add(rq);
2126			err = PTR_ERR(cs);
2127			goto err;
2128		}
2129
2130		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2131
2132		intel_ring_advance(rq, cs);
2133
2134		preempt_disable();
2135		local_bh_disable();
2136		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2137		i915_request_add(rq);
2138		local_bh_enable();
2139		preempt_enable();
2140	}
2141
2142	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2143	if (err)
2144		goto err;
2145
2146	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
2147		elapsed[i] = sema[i] - elapsed[i];
2148
2149	cycles = trifilter(elapsed);
2150	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
2151		ce->engine->name, cycles >> TF_BIAS,
2152		cycles_to_ns(ce->engine, cycles));
2153
2154	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2155
2156err:
2157	intel_gt_set_wedged(ce->engine->gt);
2158	return err;
2159}
2160
2161static int measure_busy_dispatch(struct intel_context *ce)
2162{
2163	u32 *sema = hwsp_scratch(ce);
2164	const u32 offset = hwsp_offset(ce, sema);
2165	u32 elapsed[TF_COUNT + 1], cycles;
2166	u32 *cs;
2167	int err;
2168	int i;
2169
2170	/*
2171	 * Measure how long it takes for us to submit a request while the
2172	 * engine is busy, polling on a semaphore in our context. With
2173	 * direct submission, this will include the cost of a lite restore.
2174	 *
2175	 *    A: read CS_TIMESTAMP from CPU
2176	 *    submit request
2177	 *    B: read CS_TIMESTAMP on GPU
2178	 *
2179	 * Submission latency: B - A
2180	 */
2181
2182	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2183		struct i915_request *rq;
2184
2185		rq = i915_request_create(ce);
2186		if (IS_ERR(rq)) {
2187			err = PTR_ERR(rq);
2188			goto err;
2189		}
2190
2191		cs = intel_ring_begin(rq, 12);
2192		if (IS_ERR(cs)) {
2193			i915_request_add(rq);
2194			err = PTR_ERR(cs);
2195			goto err;
2196		}
2197
2198		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2199		cs = emit_semaphore_poll_until(cs, offset, i);
2200		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2201
2202		intel_ring_advance(rq, cs);
2203
2204		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2205			err = -EIO;
2206			goto err;
2207		}
2208
2209		preempt_disable();
2210		local_bh_disable();
2211		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2212		i915_request_add(rq);
2213		local_bh_enable();
2214		semaphore_set(sema, i - 1);
2215		preempt_enable();
2216	}
2217
2218	wait_for(READ_ONCE(sema[i - 1]), 500);
2219	semaphore_set(sema, i - 1);
2220
2221	for (i = 1; i <= TF_COUNT; i++) {
2222		GEM_BUG_ON(sema[i] == -1);
2223		elapsed[i - 1] = sema[i] - elapsed[i];
2224	}
2225
2226	cycles = trifilter(elapsed);
2227	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2228		ce->engine->name, cycles >> TF_BIAS,
2229		cycles_to_ns(ce->engine, cycles));
2230
2231	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2232
2233err:
2234	intel_gt_set_wedged(ce->engine->gt);
2235	return err;
2236}
2237
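/*
 * "Plug" an engine by submitting a kernel-context request that polls a
 * semaphore in the status page; everything queued afterwards is held back
 * until the CPU releases the semaphore (see semaphore_set()).
 */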
2238static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2239{
2240	const u32 offset =
2241		i915_ggtt_offset(engine->status_page.vma) +
2242		offset_in_page(sema);
2243	struct i915_request *rq;
2244	u32 *cs;
2245
2246	rq = i915_request_create(engine->kernel_context);
2247	if (IS_ERR(rq))
2248		return PTR_ERR(rq);
2249
2250	cs = intel_ring_begin(rq, 4);
2251	if (IS_ERR(cs)) {
2252		i915_request_add(rq);
2253		return PTR_ERR(cs);
2254	}
2255
2256	cs = emit_semaphore_poll(cs, mode, value, offset);
2257
2258	intel_ring_advance(rq, cs);
2259	i915_request_add(rq);
2260
2261	return 0;
2262}
2263
2264static int measure_inter_request(struct intel_context *ce)
2265{
2266	u32 *sema = hwsp_scratch(ce);
2267	const u32 offset = hwsp_offset(ce, sema);
2268	u32 elapsed[TF_COUNT + 1], cycles;
2269	struct i915_sw_fence *submit;
2270	int i, err;
2271
2272	/*
2273	 * Measure how long it takes to advance from one request into the
2274	 * next. Between each request we flush the GPU caches to memory,
2275	 * update the breadcrumbs, and then invalidate those caches.
2276	 * We queue up all the requests to be submitted in one batch so
2277	 * it should be one set of contiguous measurements.
2278	 *
2279	 *    A: read CS_TIMESTAMP on GPU
2280	 *    advance request
2281	 *    B: read CS_TIMESTAMP on GPU
2282	 *
2283	 * Request latency: B - A
2284	 */
2285
2286	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2287	if (err)
2288		return err;
2289
2290	submit = heap_fence_create(GFP_KERNEL);
2291	if (!submit) {
2292		semaphore_set(sema, 1);
2293		return -ENOMEM;
2294	}
2295
2296	intel_engine_flush_submission(ce->engine);
2297	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2298		struct i915_request *rq;
2299		u32 *cs;
2300
2301		rq = i915_request_create(ce);
2302		if (IS_ERR(rq)) {
2303			err = PTR_ERR(rq);
2304			goto err_submit;
2305		}
2306
2307		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2308						       submit,
2309						       GFP_KERNEL);
2310		if (err < 0) {
2311			i915_request_add(rq);
2312			goto err_submit;
2313		}
2314
2315		cs = intel_ring_begin(rq, 4);
2316		if (IS_ERR(cs)) {
2317			i915_request_add(rq);
2318			err = PTR_ERR(cs);
2319			goto err_submit;
2320		}
2321
2322		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2323
2324		intel_ring_advance(rq, cs);
2325		i915_request_add(rq);
2326	}
2327	i915_sw_fence_commit(submit);
2328	intel_engine_flush_submission(ce->engine);
2329	heap_fence_put(submit);
2330
2331	semaphore_set(sema, 1);
2332	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2333	if (err)
2334		goto err;
2335
2336	for (i = 1; i <= TF_COUNT; i++)
2337		elapsed[i - 1] = sema[i + 1] - sema[i];
2338
2339	cycles = trifilter(elapsed);
2340	pr_info("%s: inter-request latency %d cycles, %lluns\n",
2341		ce->engine->name, cycles >> TF_BIAS,
2342		cycles_to_ns(ce->engine, cycles));
2343
2344	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2345
2346err_submit:
2347	i915_sw_fence_commit(submit);
2348	heap_fence_put(submit);
2349	semaphore_set(sema, 1);
2350err:
2351	intel_gt_set_wedged(ce->engine->gt);
2352	return err;
2353}
2354
2355static int measure_context_switch(struct intel_context *ce)
2356{
2357	u32 *sema = hwsp_scratch(ce);
2358	const u32 offset = hwsp_offset(ce, sema);
2359	struct i915_request *fence = NULL;
2360	u32 elapsed[TF_COUNT + 1], cycles;
2361	int i, j, err;
2362	u32 *cs;
2363
2364	/*
2365	 * Measure how long it takes to advance from one request in one
2366	 * context to a request in another context. This allows us to
2367	 * measure how long the context save/restore takes, along with all
2368	 * the inter-context setup we require.
2369	 *
2370	 *    A: read CS_TIMESTAMP on GPU
2371	 *    switch context
2372	 *    B: read CS_TIMESTAMP on GPU
2373	 *
2374	 * Context switch latency: B - A
2375	 */
2376
2377	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2378	if (err)
2379		return err;
2380
2381	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2382		struct intel_context *arr[] = {
2383			ce, ce->engine->kernel_context
2384		};
2385		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2386
2387		for (j = 0; j < ARRAY_SIZE(arr); j++) {
2388			struct i915_request *rq;
2389
2390			rq = i915_request_create(arr[j]);
2391			if (IS_ERR(rq)) {
2392				err = PTR_ERR(rq);
2393				goto err_fence;
2394			}
2395
2396			if (fence) {
2397				err = i915_request_await_dma_fence(rq,
2398								   &fence->fence);
2399				if (err) {
2400					i915_request_add(rq);
2401					goto err_fence;
2402				}
2403			}
2404
2405			cs = intel_ring_begin(rq, 4);
2406			if (IS_ERR(cs)) {
2407				i915_request_add(rq);
2408				err = PTR_ERR(cs);
2409				goto err_fence;
2410			}
2411
2412			cs = emit_timestamp_store(cs, ce, addr);
2413			addr += sizeof(u32);
2414
2415			intel_ring_advance(rq, cs);
2416
2417			i915_request_put(fence);
2418			fence = i915_request_get(rq);
2419
2420			i915_request_add(rq);
2421		}
2422	}
2423	i915_request_put(fence);
2424	intel_engine_flush_submission(ce->engine);
2425
2426	semaphore_set(sema, 1);
2427	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2428	if (err)
2429		goto err;
2430
2431	for (i = 1; i <= TF_COUNT; i++)
2432		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2433
2434	cycles = trifilter(elapsed);
2435	pr_info("%s: context switch latency %d cycles, %lluns\n",
2436		ce->engine->name, cycles >> TF_BIAS,
2437		cycles_to_ns(ce->engine, cycles));
2438
2439	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2440
2441err_fence:
2442	i915_request_put(fence);
2443	semaphore_set(sema, 1);
2444err:
2445	intel_gt_set_wedged(ce->engine->gt);
2446	return err;
2447}
2448
2449static int measure_preemption(struct intel_context *ce)
2450{
2451	u32 *sema = hwsp_scratch(ce);
2452	const u32 offset = hwsp_offset(ce, sema);
2453	u32 elapsed[TF_COUNT], cycles;
2454	u32 *cs;
2455	int err;
2456	int i;
2457
2458	/*
2459	 * We measure two latencies while triggering preemption. The first
2460	 * latency is how long it takes for us to submit a preempting request.
2461	 * The second latency is how long it takes for us to return from the
2462	 * preemption back to the original context.
2463	 *
2464	 *    A: read CS_TIMESTAMP from CPU
2465	 *    submit preemption
2466	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
2467	 *    context switch
2468	 *    C: read CS_TIMESTAMP on GPU (in original context)
2469	 *
2470	 * Preemption dispatch latency: B - A
2471	 * Preemption switch latency: C - B
2472	 */
2473
2474	if (!intel_engine_has_preemption(ce->engine))
2475		return 0;
2476
2477	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2478		u32 addr = offset + 2 * i * sizeof(u32);
2479		struct i915_request *rq;
2480
2481		rq = i915_request_create(ce);
2482		if (IS_ERR(rq)) {
2483			err = PTR_ERR(rq);
2484			goto err;
2485		}
2486
2487		cs = intel_ring_begin(rq, 12);
2488		if (IS_ERR(cs)) {
2489			i915_request_add(rq);
2490			err = PTR_ERR(cs);
2491			goto err;
2492		}
2493
2494		cs = emit_store_dw(cs, addr, -1);
2495		cs = emit_semaphore_poll_until(cs, offset, i);
2496		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2497
2498		intel_ring_advance(rq, cs);
2499		i915_request_add(rq);
2500
2501		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2502			err = -EIO;
2503			goto err;
2504		}
2505
2506		rq = i915_request_create(ce->engine->kernel_context);
2507		if (IS_ERR(rq)) {
2508			err = PTR_ERR(rq);
2509			goto err;
2510		}
2511
2512		cs = intel_ring_begin(rq, 8);
2513		if (IS_ERR(cs)) {
2514			i915_request_add(rq);
2515			err = PTR_ERR(cs);
2516			goto err;
2517		}
2518
2519		cs = emit_timestamp_store(cs, ce, addr);
2520		cs = emit_store_dw(cs, offset, i);
2521
2522		intel_ring_advance(rq, cs);
2523		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2524
2525		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2526		i915_request_add(rq);
2527	}
2528
2529	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2530		err = -EIO;
2531		goto err;
2532	}
2533
2534	for (i = 1; i <= TF_COUNT; i++)
2535		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2536
2537	cycles = trifilter(elapsed);
2538	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2539		ce->engine->name, cycles >> TF_BIAS,
2540		cycles_to_ns(ce->engine, cycles));
2541
2542	for (i = 1; i <= TF_COUNT; i++)
2543		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2544
2545	cycles = trifilter(elapsed);
2546	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2547		ce->engine->name, cycles >> TF_BIAS,
2548		cycles_to_ns(ce->engine, cycles));
2549
2550	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2551
2552err:
2553	intel_gt_set_wedged(ce->engine->gt);
2554	return err;
2555}
2556
2557struct signal_cb {
2558	struct dma_fence_cb base;
2559	bool seen;
2560};
2561
2562static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2563{
2564	struct signal_cb *s = container_of(cb, typeof(*s), base);
2565
2566	smp_store_mb(s->seen, true); /* be safe, be strong */
2567}
2568
2569static int measure_completion(struct intel_context *ce)
2570{
2571	u32 *sema = hwsp_scratch(ce);
2572	const u32 offset = hwsp_offset(ce, sema);
2573	u32 elapsed[TF_COUNT], cycles;
2574	u32 *cs;
2575	int err;
2576	int i;
2577
2578	/*
2579	 * Measure how long it takes for the signal (interrupt) to be
2580	 * sent from the GPU and processed by the CPU.
2581	 *
2582	 *    A: read CS_TIMESTAMP on GPU
2583	 *    signal
2584	 *    B: read CS_TIMESTAMP from CPU
2585	 *
2586	 * Completion latency: B - A
2587	 */
2588
2589	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2590		struct signal_cb cb = { .seen = false };
2591		struct i915_request *rq;
2592
2593		rq = i915_request_create(ce);
2594		if (IS_ERR(rq)) {
2595			err = PTR_ERR(rq);
2596			goto err;
2597		}
2598
2599		cs = intel_ring_begin(rq, 12);
2600		if (IS_ERR(cs)) {
2601			i915_request_add(rq);
2602			err = PTR_ERR(cs);
2603			goto err;
2604		}
2605
2606		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2607		cs = emit_semaphore_poll_until(cs, offset, i);
2608		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2609
2610		intel_ring_advance(rq, cs);
2611
2612		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2613		i915_request_add(rq);
2614
2615		intel_engine_flush_submission(ce->engine);
2616		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2617			err = -EIO;
2618			goto err;
2619		}
2620
2621		preempt_disable();
2622		semaphore_set(sema, i);
2623		while (!READ_ONCE(cb.seen))
2624			cpu_relax();
2625
2626		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2627		preempt_enable();
2628	}
2629
2630	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2631	if (err)
2632		goto err;
2633
2634	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2635		GEM_BUG_ON(sema[i + 1] == -1);
2636		elapsed[i] = elapsed[i] - sema[i + 1];
2637	}
2638
2639	cycles = trifilter(elapsed);
2640	pr_info("%s: completion latency %d cycles, %lluns\n",
2641		ce->engine->name, cycles >> TF_BIAS,
2642		cycles_to_ns(ce->engine, cycles));
2643
2644	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2645
2646err:
2647	intel_gt_set_wedged(ce->engine->gt);
2648	return err;
2649}
2650
2651static void rps_pin(struct intel_gt *gt)
2652{
2653	/* Pin the frequency to max */
2654	atomic_inc(&gt->rps.num_waiters);
2655	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2656
2657	mutex_lock(&gt->rps.lock);
2658	intel_rps_set(&gt->rps, gt->rps.max_freq);
2659	mutex_unlock(&gt->rps.lock);
2660}
2661
2662static void rps_unpin(struct intel_gt *gt)
2663{
2664	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2665	atomic_dec(&gt->rps.num_waiters);
2666}
2667
2668static int perf_request_latency(void *arg)
2669{
2670	struct drm_i915_private *i915 = arg;
2671	struct intel_engine_cs *engine;
2672	struct pm_qos_request qos;
2673	int err = 0;
2674
2675	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2676		return 0;
2677
2678	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2679
2680	for_each_uabi_engine(engine, i915) {
2681		struct intel_context *ce;
2682
2683		ce = intel_context_create(engine);
2684		if (IS_ERR(ce)) {
2685			err = PTR_ERR(ce);
2686			goto out;
2687		}
2688
2689		err = intel_context_pin(ce);
2690		if (err) {
2691			intel_context_put(ce);
2692			goto out;
2693		}
2694
2695		st_engine_heartbeat_disable(engine);
2696		rps_pin(engine->gt);
2697
2698		if (err == 0)
2699			err = measure_semaphore_response(ce);
2700		if (err == 0)
2701			err = measure_idle_dispatch(ce);
2702		if (err == 0)
2703			err = measure_busy_dispatch(ce);
2704		if (err == 0)
2705			err = measure_inter_request(ce);
2706		if (err == 0)
2707			err = measure_context_switch(ce);
2708		if (err == 0)
2709			err = measure_preemption(ce);
2710		if (err == 0)
2711			err = measure_completion(ce);
2712
2713		rps_unpin(engine->gt);
2714		st_engine_heartbeat_enable(engine);
2715
2716		intel_context_unpin(ce);
2717		intel_context_put(ce);
2718		if (err)
2719			goto out;
2720	}
2721
2722out:
2723	if (igt_flush_test(i915))
2724		err = -EIO;
2725
2726	cpu_latency_qos_remove_request(&qos);
2727	return err;
2728}
2729
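/*
 * Series workloads, driven from a single thread across all contexts:
 * s_sync0 submits a request and waits for it before moving to the next
 * engine, s_sync1 waits on the previous request while the next is already
 * queued (a pipeline depth of one), and s_many submits without waiting.
 */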
2730static int s_sync0(void *arg)
2731{
2732	struct perf_series *ps = arg;
2733	IGT_TIMEOUT(end_time);
2734	unsigned int idx = 0;
2735	int err = 0;
2736
2737	GEM_BUG_ON(!ps->nengines);
2738	do {
2739		struct i915_request *rq;
2740
2741		rq = i915_request_create(ps->ce[idx]);
2742		if (IS_ERR(rq)) {
2743			err = PTR_ERR(rq);
2744			break;
2745		}
2746
2747		i915_request_get(rq);
2748		i915_request_add(rq);
2749
2750		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2751			err = -ETIME;
2752		i915_request_put(rq);
2753		if (err)
2754			break;
2755
2756		if (++idx == ps->nengines)
2757			idx = 0;
2758	} while (!__igt_timeout(end_time, NULL));
2759
2760	return err;
2761}
2762
2763static int s_sync1(void *arg)
2764{
2765	struct perf_series *ps = arg;
2766	struct i915_request *prev = NULL;
2767	IGT_TIMEOUT(end_time);
2768	unsigned int idx = 0;
2769	int err = 0;
2770
2771	GEM_BUG_ON(!ps->nengines);
2772	do {
2773		struct i915_request *rq;
2774
2775		rq = i915_request_create(ps->ce[idx]);
2776		if (IS_ERR(rq)) {
2777			err = PTR_ERR(rq);
2778			break;
2779		}
2780
2781		i915_request_get(rq);
2782		i915_request_add(rq);
2783
2784		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2785			err = -ETIME;
2786		i915_request_put(prev);
2787		prev = rq;
2788		if (err)
2789			break;
2790
2791		if (++idx == ps->nengines)
2792			idx = 0;
2793	} while (!__igt_timeout(end_time, NULL));
2794	i915_request_put(prev);
2795
2796	return err;
2797}
2798
2799static int s_many(void *arg)
2800{
2801	struct perf_series *ps = arg;
2802	IGT_TIMEOUT(end_time);
2803	unsigned int idx = 0;
2804
2805	GEM_BUG_ON(!ps->nengines);
2806	do {
2807		struct i915_request *rq;
2808
2809		rq = i915_request_create(ps->ce[idx]);
2810		if (IS_ERR(rq))
2811			return PTR_ERR(rq);
2812
2813		i915_request_add(rq);
2814
2815		if (++idx == ps->nengines)
2816			idx = 0;
2817	} while (!__igt_timeout(end_time, NULL));
2818
2819	return 0;
2820}
2821
2822static int perf_series_engines(void *arg)
2823{
2824	struct drm_i915_private *i915 = arg;
2825	static int (* const func[])(void *arg) = {
2826		s_sync0,
2827		s_sync1,
2828		s_many,
2829		NULL,
2830	};
2831	const unsigned int nengines = num_uabi_engines(i915);
2832	struct intel_engine_cs *engine;
2833	int (* const *fn)(void *arg);
2834	struct pm_qos_request qos;
2835	struct perf_stats *stats;
2836	struct perf_series *ps;
2837	unsigned int idx;
2838	int err = 0;
2839
2840	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2841	if (!stats)
2842		return -ENOMEM;
2843
2844	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2845	if (!ps) {
2846		kfree(stats);
2847		return -ENOMEM;
2848	}
2849
2850	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2851
2852	ps->i915 = i915;
2853	ps->nengines = nengines;
2854
2855	idx = 0;
2856	for_each_uabi_engine(engine, i915) {
2857		struct intel_context *ce;
2858
2859		ce = intel_context_create(engine);
2860		if (IS_ERR(ce)) {
2861			err = PTR_ERR(ce);
2862			goto out;
2863		}
2864
2865		err = intel_context_pin(ce);
2866		if (err) {
2867			intel_context_put(ce);
2868			goto out;
2869		}
2870
2871		ps->ce[idx++] = ce;
2872	}
2873	GEM_BUG_ON(idx != ps->nengines);
2874
2875	for (fn = func; *fn && !err; fn++) {
2876		char name[KSYM_NAME_LEN];
2877		struct igt_live_test t;
2878
2879		snprintf(name, sizeof(name), "%ps", *fn);
2880		err = igt_live_test_begin(&t, i915, __func__, name);
2881		if (err)
2882			break;
2883
2884		for (idx = 0; idx < nengines; idx++) {
2885			struct perf_stats *p =
2886				memset(&stats[idx], 0, sizeof(stats[idx]));
2887			struct intel_context *ce = ps->ce[idx];
2888
2889			p->engine = ps->ce[idx]->engine;
2890			intel_engine_pm_get(p->engine);
2891
2892			if (intel_engine_supports_stats(p->engine))
2893				p->busy = intel_engine_get_busy_time(p->engine,
2894								     &p->time) + 1;
2895			else
2896				p->time = ktime_get();
2897			p->runtime = -intel_context_get_total_runtime_ns(ce);
2898		}
2899
2900		err = (*fn)(ps);
2901		if (igt_live_test_end(&t))
2902			err = -EIO;
2903
2904		for (idx = 0; idx < nengines; idx++) {
2905			struct perf_stats *p = &stats[idx];
2906			struct intel_context *ce = ps->ce[idx];
2907			int integer, decimal;
2908			u64 busy, dt, now;
2909
2910			if (p->busy)
2911				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2912									       &now),
2913						    p->busy - 1);
2914			else
2915				now = ktime_get();
2916			p->time = ktime_sub(now, p->time);
2917
2918			err = switch_to_kernel_sync(ce, err);
2919			p->runtime += intel_context_get_total_runtime_ns(ce);
2920			intel_engine_pm_put(p->engine);
2921
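			/*
			 * Report busy/walltime as a percentage with two
			 * decimal places: 'integer' is the whole percent and
			 * 'decimal' carries the fractional hundredths.
			 */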
2922			busy = 100 * ktime_to_ns(p->busy);
2923			dt = ktime_to_ns(p->time);
2924			if (dt) {
2925				integer = div64_u64(busy, dt);
2926				busy -= integer * dt;
2927				decimal = div64_u64(100 * busy, dt);
2928			} else {
2929				integer = 0;
2930				decimal = 0;
2931			}
2932
2933			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2934				name, p->engine->name, ce->timeline->seqno,
2935				integer, decimal,
2936				div_u64(p->runtime, 1000 * 1000),
2937				div_u64(ktime_to_ns(p->time), 1000 * 1000));
2938		}
2939	}
2940
2941out:
2942	for (idx = 0; idx < nengines; idx++) {
2943		if (IS_ERR_OR_NULL(ps->ce[idx]))
2944			break;
2945
2946		intel_context_unpin(ps->ce[idx]);
2947		intel_context_put(ps->ce[idx]);
2948	}
2949	kfree(ps);
2950
2951	cpu_latency_qos_remove_request(&qos);
2952	kfree(stats);
2953	return err;
2954}
2955
2956struct p_thread {
2957	struct perf_stats p;
2958	struct kthread_worker *worker;
2959	struct kthread_work work;
2960	struct intel_engine_cs *engine;
2961	int result;
2962};
2963
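/*
 * Per-engine workloads run from a kthread worker on each engine, mirroring
 * the s_* variants above: p_sync0 waits for each request before submitting
 * the next, p_sync1 keeps one request in flight, and p_many submits as fast
 * as possible. Each records the engine busy time and context runtime for
 * the summary printed by perf_parallel_engines().
 */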
2964static void p_sync0(struct kthread_work *work)
2965{
2966	struct p_thread *thread = container_of(work, typeof(*thread), work);
2967	struct perf_stats *p = &thread->p;
2968	struct intel_engine_cs *engine = p->engine;
2969	struct intel_context *ce;
2970	IGT_TIMEOUT(end_time);
2971	unsigned long count;
2972	bool busy;
2973	int err = 0;
2974
2975	ce = intel_context_create(engine);
2976	if (IS_ERR(ce)) {
2977		thread->result = PTR_ERR(ce);
2978		return;
2979	}
2980
2981	err = intel_context_pin(ce);
2982	if (err) {
2983		intel_context_put(ce);
2984		thread->result = err;
2985		return;
2986	}
2987
2988	if (intel_engine_supports_stats(engine)) {
2989		p->busy = intel_engine_get_busy_time(engine, &p->time);
2990		busy = true;
2991	} else {
2992		p->time = ktime_get();
2993		busy = false;
2994	}
2995
2996	count = 0;
2997	do {
2998		struct i915_request *rq;
2999
3000		rq = i915_request_create(ce);
3001		if (IS_ERR(rq)) {
3002			err = PTR_ERR(rq);
3003			break;
3004		}
3005
3006		i915_request_get(rq);
3007		i915_request_add(rq);
3008
3009		err = 0;
3010		if (i915_request_wait(rq, 0, HZ) < 0)
3011			err = -ETIME;
3012		i915_request_put(rq);
3013		if (err)
3014			break;
3015
3016		count++;
3017	} while (!__igt_timeout(end_time, NULL));
3018
3019	if (busy) {
3020		ktime_t now;
3021
3022		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3023				    p->busy);
3024		p->time = ktime_sub(now, p->time);
3025	} else {
3026		p->time = ktime_sub(ktime_get(), p->time);
3027	}
3028
3029	err = switch_to_kernel_sync(ce, err);
3030	p->runtime = intel_context_get_total_runtime_ns(ce);
3031	p->count = count;
3032
3033	intel_context_unpin(ce);
3034	intel_context_put(ce);
3035	thread->result = err;
3036}
3037
3038static void p_sync1(struct kthread_work *work)
3039{
3040	struct p_thread *thread = container_of(work, typeof(*thread), work);
3041	struct perf_stats *p = &thread->p;
3042	struct intel_engine_cs *engine = p->engine;
3043	struct i915_request *prev = NULL;
3044	struct intel_context *ce;
3045	IGT_TIMEOUT(end_time);
3046	unsigned long count;
3047	bool busy;
3048	int err = 0;
3049
3050	ce = intel_context_create(engine);
3051	if (IS_ERR(ce)) {
3052		thread->result = PTR_ERR(ce);
3053		return;
3054	}
3055
3056	err = intel_context_pin(ce);
3057	if (err) {
3058		intel_context_put(ce);
3059		thread->result = err;
3060		return;
3061	}
3062
3063	if (intel_engine_supports_stats(engine)) {
3064		p->busy = intel_engine_get_busy_time(engine, &p->time);
3065		busy = true;
3066	} else {
3067		p->time = ktime_get();
3068		busy = false;
3069	}
3070
3071	count = 0;
3072	do {
3073		struct i915_request *rq;
3074
3075		rq = i915_request_create(ce);
3076		if (IS_ERR(rq)) {
3077			err = PTR_ERR(rq);
3078			break;
3079		}
3080
3081		i915_request_get(rq);
3082		i915_request_add(rq);
3083
3084		err = 0;
3085		if (prev && i915_request_wait(prev, 0, HZ) < 0)
3086			err = -ETIME;
3087		i915_request_put(prev);
3088		prev = rq;
3089		if (err)
3090			break;
3091
3092		count++;
3093	} while (!__igt_timeout(end_time, NULL));
3094	i915_request_put(prev);
3095
3096	if (busy) {
3097		ktime_t now;
3098
3099		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3100				    p->busy);
3101		p->time = ktime_sub(now, p->time);
3102	} else {
3103		p->time = ktime_sub(ktime_get(), p->time);
3104	}
3105
3106	err = switch_to_kernel_sync(ce, err);
3107	p->runtime = intel_context_get_total_runtime_ns(ce);
3108	p->count = count;
3109
3110	intel_context_unpin(ce);
3111	intel_context_put(ce);
3112	thread->result = err;
3113}
3114
3115static void p_many(struct kthread_work *work)
3116{
3117	struct p_thread *thread = container_of(work, typeof(*thread), work);
3118	struct perf_stats *p = &thread->p;
3119	struct intel_engine_cs *engine = p->engine;
3120	struct intel_context *ce;
3121	IGT_TIMEOUT(end_time);
3122	unsigned long count;
3123	int err = 0;
3124	bool busy;
3125
3126	ce = intel_context_create(engine);
3127	if (IS_ERR(ce)) {
3128		thread->result = PTR_ERR(ce);
3129		return;
3130	}
3131
3132	err = intel_context_pin(ce);
3133	if (err) {
3134		intel_context_put(ce);
3135		thread->result = err;
3136		return;
3137	}
3138
3139	if (intel_engine_supports_stats(engine)) {
3140		p->busy = intel_engine_get_busy_time(engine, &p->time);
3141		busy = true;
3142	} else {
3143		p->time = ktime_get();
3144		busy = false;
3145	}
3146
3147	count = 0;
3148	do {
3149		struct i915_request *rq;
3150
3151		rq = i915_request_create(ce);
3152		if (IS_ERR(rq)) {
3153			err = PTR_ERR(rq);
3154			break;
3155		}
3156
3157		i915_request_add(rq);
3158		count++;
3159	} while (!__igt_timeout(end_time, NULL));
3160
3161	if (busy) {
3162		ktime_t now;
3163
3164		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3165				    p->busy);
3166		p->time = ktime_sub(now, p->time);
3167	} else {
3168		p->time = ktime_sub(ktime_get(), p->time);
3169	}
3170
3171	err = switch_to_kernel_sync(ce, err);
3172	p->runtime = intel_context_get_total_runtime_ns(ce);
3173	p->count = count;
3174
3175	intel_context_unpin(ce);
3176	intel_context_put(ce);
3177	thread->result = err;
3178}
3179
3180static int perf_parallel_engines(void *arg)
3181{
3182	struct drm_i915_private *i915 = arg;
3183	static void (* const func[])(struct kthread_work *) = {
3184		p_sync0,
3185		p_sync1,
3186		p_many,
3187		NULL,
3188	};
3189	const unsigned int nengines = num_uabi_engines(i915);
3190	void (* const *fn)(struct kthread_work *);
3191	struct intel_engine_cs *engine;
3192	struct pm_qos_request qos;
3193	struct p_thread *engines;
3194	int err = 0;
3195
3196	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
3197	if (!engines)
3198		return -ENOMEM;
3199
3200	cpu_latency_qos_add_request(&qos, 0);
3201
3202	for (fn = func; *fn; fn++) {
3203		char name[KSYM_NAME_LEN];
3204		struct igt_live_test t;
3205		unsigned int idx;
3206
3207		snprintf(name, sizeof(name), "%ps", *fn);
3208		err = igt_live_test_begin(&t, i915, __func__, name);
3209		if (err)
3210			break;
3211
3212		atomic_set(&i915->selftest.counter, nengines);
3213
3214		idx = 0;
3215		for_each_uabi_engine(engine, i915) {
3216			struct kthread_worker *worker;
3217
3218			intel_engine_pm_get(engine);
3219
3220			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3221
3222			worker = kthread_create_worker(0, "igt:%s",
3223						       engine->name);
3224			if (IS_ERR(worker)) {
3225				err = PTR_ERR(worker);
3226				intel_engine_pm_put(engine);
3227				break;
3228			}
3229			engines[idx].worker = worker;
3230			engines[idx].result = 0;
3231			engines[idx].p.engine = engine;
3232			engines[idx].engine = engine;
3233
3234			kthread_init_work(&engines[idx].work, *fn);
3235			kthread_queue_work(worker, &engines[idx].work);
3236			idx++;
3237		}
3238
3239		idx = 0;
3240		for_each_uabi_engine(engine, i915) {
3241			int status;
3242
3243			if (!engines[idx].worker)
3244				break;
3245
3246			kthread_flush_work(&engines[idx].work);
3247			status = READ_ONCE(engines[idx].result);
3248			if (status && !err)
3249				err = status;
3250
3251			intel_engine_pm_put(engine);
3252
3253			kthread_destroy_worker(engines[idx].worker);
3254			idx++;
3255		}
3256
3257		if (igt_live_test_end(&t))
3258			err = -EIO;
3259		if (err)
3260			break;
3261
3262		idx = 0;
3263		for_each_uabi_engine(engine, i915) {
3264			struct perf_stats *p = &engines[idx].p;
3265			u64 busy = 100 * ktime_to_ns(p->busy);
3266			u64 dt = ktime_to_ns(p->time);
3267			int integer, decimal;
3268
3269			if (dt) {
3270				integer = div64_u64(busy, dt);
3271				busy -= integer * dt;
3272				decimal = div64_u64(100 * busy, dt);
3273			} else {
3274				integer = 0;
3275				decimal = 0;
3276			}
3277
3278			GEM_BUG_ON(engine != p->engine);
3279			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3280				name, engine->name, p->count, integer, decimal,
3281				div_u64(p->runtime, 1000 * 1000),
3282				div_u64(ktime_to_ns(p->time), 1000 * 1000));
3283			idx++;
3284		}
3285	}
3286
3287	cpu_latency_qos_remove_request(&qos);
3288	kfree(engines);
3289	return err;
3290}
3291
3292int i915_request_perf_selftests(struct drm_i915_private *i915)
3293{
3294	static const struct i915_subtest tests[] = {
3295		SUBTEST(perf_request_latency),
3296		SUBTEST(perf_series_engines),
3297		SUBTEST(perf_parallel_engines),
3298	};
3299
3300	if (intel_gt_is_wedged(to_gt(i915)))
3301		return 0;
3302
3303	return i915_subtests(tests, i915);
3304}