1/*
2 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
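/*
 * Selftests for i915_request: basic request creation/wait checks run on the
 * mock device, live smoke and stress tests run on the real engines, and a
 * set of latency measurements (semaphores, dispatch, context switches,
 * preemption) built on top of the engine status page.
 */
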
25#include <linux/prime_numbers.h>
26#include <linux/pm_qos.h>
27#include <linux/sort.h>
28
29#include "gem/i915_gem_pm.h"
30#include "gem/selftests/mock_context.h"
31
32#include "gt/intel_engine_heartbeat.h"
33#include "gt/intel_engine_pm.h"
34#include "gt/intel_engine_user.h"
35#include "gt/intel_gt.h"
36#include "gt/intel_gt_clock_utils.h"
37#include "gt/intel_gt_requests.h"
38#include "gt/selftest_engine_heartbeat.h"
39
40#include "i915_random.h"
41#include "i915_selftest.h"
42#include "igt_flush_test.h"
43#include "igt_live_test.h"
44#include "igt_spinner.h"
45#include "lib_sw_fence.h"
46
47#include "mock_drm.h"
48#include "mock_gem_device.h"
49
50static unsigned int num_uabi_engines(struct drm_i915_private *i915)
51{
52 struct intel_engine_cs *engine;
53 unsigned int count;
54
55 count = 0;
56 for_each_uabi_engine(engine, i915)
57 count++;
58
59 return count;
60}
61
62static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
63{
64 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
65}
66
67static int igt_add_request(void *arg)
68{
69 struct drm_i915_private *i915 = arg;
70 struct i915_request *request;
71
72 /* Basic preliminary test to create a request and let it loose! */
73
74 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
75 if (!request)
76 return -ENOMEM;
77
78 i915_request_add(request);
79
80 return 0;
81}
82
83static int igt_wait_request(void *arg)
84{
85 const long T = HZ / 4;
86 struct drm_i915_private *i915 = arg;
87 struct i915_request *request;
88 int err = -EINVAL;
89
90 /* Submit a request, then wait upon it */
91
92 request = mock_request(rcs0(i915)->kernel_context, T);
93 if (!request)
94 return -ENOMEM;
95
96 i915_request_get(request);
97
98 if (i915_request_wait(request, 0, 0) != -ETIME) {
99 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
100 goto out_request;
101 }
102
103 if (i915_request_wait(request, 0, T) != -ETIME) {
104 pr_err("request wait succeeded (expected timeout before submit!)\n");
105 goto out_request;
106 }
107
108 if (i915_request_completed(request)) {
109 pr_err("request completed before submit!!\n");
110 goto out_request;
111 }
112
113 i915_request_add(request);
114
115 if (i915_request_wait(request, 0, 0) != -ETIME) {
116 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
117 goto out_request;
118 }
119
120 if (i915_request_completed(request)) {
121 pr_err("request completed immediately!\n");
122 goto out_request;
123 }
124
125 if (i915_request_wait(request, 0, T / 2) != -ETIME) {
126 pr_err("request wait succeeded (expected timeout!)\n");
127 goto out_request;
128 }
129
130 if (i915_request_wait(request, 0, T) == -ETIME) {
131 pr_err("request wait timed out!\n");
132 goto out_request;
133 }
134
135 if (!i915_request_completed(request)) {
136 pr_err("request not complete after waiting!\n");
137 goto out_request;
138 }
139
140 if (i915_request_wait(request, 0, T) == -ETIME) {
141 pr_err("request wait timed out when already complete!\n");
142 goto out_request;
143 }
144
145 err = 0;
146out_request:
147 i915_request_put(request);
148 mock_device_flush(i915);
149 return err;
150}
151
152static int igt_fence_wait(void *arg)
153{
154 const long T = HZ / 4;
155 struct drm_i915_private *i915 = arg;
156 struct i915_request *request;
157 int err = -EINVAL;
158
159 /* Submit a request, treat it as a fence and wait upon it */
160
161 request = mock_request(rcs0(i915)->kernel_context, T);
162 if (!request)
163 return -ENOMEM;
164
165 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
166 pr_err("fence wait success before submit (expected timeout)!\n");
167 goto out;
168 }
169
170 i915_request_add(request);
171
172 if (dma_fence_is_signaled(&request->fence)) {
173 pr_err("fence signaled immediately!\n");
174 goto out;
175 }
176
177 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
178 pr_err("fence wait success after submit (expected timeout)!\n");
179 goto out;
180 }
181
182 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
183 pr_err("fence wait timed out (expected success)!\n");
184 goto out;
185 }
186
187 if (!dma_fence_is_signaled(&request->fence)) {
188 pr_err("fence unsignaled after waiting!\n");
189 goto out;
190 }
191
192 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
193 pr_err("fence wait timed out when complete (expected success)!\n");
194 goto out;
195 }
196
197 err = 0;
198out:
199 mock_device_flush(i915);
200 return err;
201}
202
203static int igt_request_rewind(void *arg)
204{
205 struct drm_i915_private *i915 = arg;
206 struct i915_request *request, *vip;
207 struct i915_gem_context *ctx[2];
208 struct intel_context *ce;
209 int err = -EINVAL;
210
211 ctx[0] = mock_context(i915, "A");
212
213 ce = i915_gem_context_get_engine(ctx[0], RCS0);
214 GEM_BUG_ON(IS_ERR(ce));
215 request = mock_request(ce, 2 * HZ);
216 intel_context_put(ce);
217 if (!request) {
218 err = -ENOMEM;
219 goto err_context_0;
220 }
221
222 i915_request_get(request);
223 i915_request_add(request);
224
225 ctx[1] = mock_context(i915, "B");
226
227 ce = i915_gem_context_get_engine(ctx[1], RCS0);
228 GEM_BUG_ON(IS_ERR(ce));
229 vip = mock_request(ce, 0);
230 intel_context_put(ce);
231 if (!vip) {
232 err = -ENOMEM;
233 goto err_context_1;
234 }
235
236 /* Simulate preemption by manual reordering */
237 if (!mock_cancel_request(request)) {
238 pr_err("failed to cancel request (already executed)!\n");
239 i915_request_add(vip);
240 goto err_context_1;
241 }
242 i915_request_get(vip);
243 i915_request_add(vip);
244 rcu_read_lock();
245 request->engine->submit_request(request);
246 rcu_read_unlock();
247
248
249 if (i915_request_wait(vip, 0, HZ) == -ETIME) {
250 pr_err("timed out waiting for high priority request\n");
251 goto err;
252 }
253
254 if (i915_request_completed(request)) {
255 pr_err("low priority request already completed\n");
256 goto err;
257 }
258
259 err = 0;
260err:
261 i915_request_put(vip);
262err_context_1:
263 mock_context_close(ctx[1]);
264 i915_request_put(request);
265err_context_0:
266 mock_context_close(ctx[0]);
267 mock_device_flush(i915);
268 return err;
269}
270
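/*
 * Shared state for the breadcrumbs smoketests: each worker thread picks
 * contexts out of @contexts, allocates up to @max_batch requests per
 * iteration through @request_alloc (mock or live), and accumulates its
 * totals into @num_waits / @num_fences.
 */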
271struct smoketest {
272 struct intel_engine_cs *engine;
273 struct i915_gem_context **contexts;
274 atomic_long_t num_waits, num_fences;
275 int ncontexts, max_batch;
276 struct i915_request *(*request_alloc)(struct intel_context *ce);
277};
278
279static struct i915_request *
280__mock_request_alloc(struct intel_context *ce)
281{
282 return mock_request(ce, 0);
283}
284
285static struct i915_request *
286__live_request_alloc(struct intel_context *ce)
287{
288 return intel_context_create_request(ce);
289}
290
291static int __igt_breadcrumbs_smoketest(void *arg)
292{
293 struct smoketest *t = arg;
294 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
295 const unsigned int total = 4 * t->ncontexts + 1;
296 unsigned int num_waits = 0, num_fences = 0;
297 struct i915_request **requests;
298 I915_RND_STATE(prng);
299 unsigned int *order;
300 int err = 0;
301
302 /*
303 * A very simple test to catch the most egregious of list handling bugs.
304 *
305 * At its heart, we simply create oodles of requests running across
306 * multiple kthreads and enable signaling on them, for the sole purpose
307 * of stressing our breadcrumb handling. The only inspection we do is
308 * that the fences were marked as signaled.
309 */
310
311 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
312 if (!requests)
313 return -ENOMEM;
314
315 order = i915_random_order(total, &prng);
316 if (!order) {
317 err = -ENOMEM;
318 goto out_requests;
319 }
320
321 while (!kthread_should_stop()) {
322 struct i915_sw_fence *submit, *wait;
323 unsigned int n, count;
324
325 submit = heap_fence_create(GFP_KERNEL);
326 if (!submit) {
327 err = -ENOMEM;
328 break;
329 }
330
331 wait = heap_fence_create(GFP_KERNEL);
332 if (!wait) {
333 i915_sw_fence_commit(submit);
334 heap_fence_put(submit);
335 err = -ENOMEM;
336 break;
337 }
338
339 i915_random_reorder(order, total, &prng);
340 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
341
342 for (n = 0; n < count; n++) {
343 struct i915_gem_context *ctx =
344 t->contexts[order[n] % t->ncontexts];
345 struct i915_request *rq;
346 struct intel_context *ce;
347
348 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
349 GEM_BUG_ON(IS_ERR(ce));
350 rq = t->request_alloc(ce);
351 intel_context_put(ce);
352 if (IS_ERR(rq)) {
353 err = PTR_ERR(rq);
354 count = n;
355 break;
356 }
357
358 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
359 submit,
360 GFP_KERNEL);
361
362 requests[n] = i915_request_get(rq);
363 i915_request_add(rq);
364
365 if (err >= 0)
366 err = i915_sw_fence_await_dma_fence(wait,
367 &rq->fence,
368 0,
369 GFP_KERNEL);
370
371 if (err < 0) {
372 i915_request_put(rq);
373 count = n;
374 break;
375 }
376 }
377
378 i915_sw_fence_commit(submit);
379 i915_sw_fence_commit(wait);
380
381 if (!wait_event_timeout(wait->wait,
382 i915_sw_fence_done(wait),
383 5 * HZ)) {
384 struct i915_request *rq = requests[count - 1];
385
386 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
387 atomic_read(&wait->pending), count,
388 rq->fence.context, rq->fence.seqno,
389 t->engine->name);
390 GEM_TRACE_DUMP();
391
392 intel_gt_set_wedged(t->engine->gt);
393 GEM_BUG_ON(!i915_request_completed(rq));
394 i915_sw_fence_wait(wait);
395 err = -EIO;
396 }
397
398 for (n = 0; n < count; n++) {
399 struct i915_request *rq = requests[n];
400
401 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
402 &rq->fence.flags)) {
403 pr_err("%llu:%llu was not signaled!\n",
404 rq->fence.context, rq->fence.seqno);
405 err = -EINVAL;
406 }
407
408 i915_request_put(rq);
409 }
410
411 heap_fence_put(wait);
412 heap_fence_put(submit);
413
414 if (err < 0)
415 break;
416
417 num_fences += count;
418 num_waits++;
419
420 cond_resched();
421 }
422
423 atomic_long_add(num_fences, &t->num_fences);
424 atomic_long_add(num_waits, &t->num_waits);
425
426 kfree(order);
427out_requests:
428 kfree(requests);
429 return err;
430}
431
432static int mock_breadcrumbs_smoketest(void *arg)
433{
434 struct drm_i915_private *i915 = arg;
435 struct smoketest t = {
436 .engine = rcs0(i915),
437 .ncontexts = 1024,
438 .max_batch = 1024,
439 .request_alloc = __mock_request_alloc
440 };
441 unsigned int ncpus = num_online_cpus();
442 struct task_struct **threads;
443 unsigned int n;
444 int ret = 0;
445
446 /*
447 * Smoketest our breadcrumb/signal handling for requests across multiple
448 * threads. A very simple test to only catch the most egregious of bugs.
449 * See __igt_breadcrumbs_smoketest();
450 */
451
452 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
453 if (!threads)
454 return -ENOMEM;
455
456 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
457 if (!t.contexts) {
458 ret = -ENOMEM;
459 goto out_threads;
460 }
461
462 for (n = 0; n < t.ncontexts; n++) {
463 t.contexts[n] = mock_context(t.engine->i915, "mock");
464 if (!t.contexts[n]) {
465 ret = -ENOMEM;
466 goto out_contexts;
467 }
468 }
469
470 for (n = 0; n < ncpus; n++) {
471 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
472 &t, "igt/%d", n);
473 if (IS_ERR(threads[n])) {
474 ret = PTR_ERR(threads[n]);
475 ncpus = n;
476 break;
477 }
478
479 get_task_struct(threads[n]);
480 }
481
482 yield(); /* start all threads before we begin */
483 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
484
485 for (n = 0; n < ncpus; n++) {
486 int err;
487
488 err = kthread_stop(threads[n]);
489 if (err < 0 && !ret)
490 ret = err;
491
492 put_task_struct(threads[n]);
493 }
494 pr_info("Completed %lu waits for %lu fences across %d cpus\n",
495 atomic_long_read(&t.num_waits),
496 atomic_long_read(&t.num_fences),
497 ncpus);
498
499out_contexts:
500 for (n = 0; n < t.ncontexts; n++) {
501 if (!t.contexts[n])
502 break;
503 mock_context_close(t.contexts[n]);
504 }
505 kfree(t.contexts);
506out_threads:
507 kfree(threads);
508 return ret;
509}
510
511int i915_request_mock_selftests(void)
512{
513 static const struct i915_subtest tests[] = {
514 SUBTEST(igt_add_request),
515 SUBTEST(igt_wait_request),
516 SUBTEST(igt_fence_wait),
517 SUBTEST(igt_request_rewind),
518 SUBTEST(mock_breadcrumbs_smoketest),
519 };
520 struct drm_i915_private *i915;
521 intel_wakeref_t wakeref;
522 int err = 0;
523
524 i915 = mock_gem_device();
525 if (!i915)
526 return -ENOMEM;
527
528 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
529 err = i915_subtests(tests, i915);
530
531 mock_destroy_device(i915);
532
533 return err;
534}
535
536static int live_nop_request(void *arg)
537{
538 struct drm_i915_private *i915 = arg;
539 struct intel_engine_cs *engine;
540 struct igt_live_test t;
541 int err = -ENODEV;
542
543 /*
544 * Submit various sized batches of empty requests, to each engine
545 * (individually), and wait for the batch to complete. We can check
546 * the overhead of submitting requests to the hardware.
547 */
548
549 for_each_uabi_engine(engine, i915) {
550 unsigned long n, prime;
551 IGT_TIMEOUT(end_time);
552 ktime_t times[2] = {};
553
554 err = igt_live_test_begin(&t, i915, __func__, engine->name);
555 if (err)
556 return err;
557
558 intel_engine_pm_get(engine);
559 for_each_prime_number_from(prime, 1, 8192) {
560 struct i915_request *request = NULL;
561
562 times[1] = ktime_get_raw();
563
564 for (n = 0; n < prime; n++) {
565 i915_request_put(request);
566 request = i915_request_create(engine->kernel_context);
567 if (IS_ERR(request))
568 return PTR_ERR(request);
569
570 /*
571 * This space is left intentionally blank.
572 *
573 * We do not actually want to perform any
574 * action with this request, we just want
575 * to measure the latency in allocation
576 * and submission of our breadcrumbs -
577 * ensuring that the bare request is sufficient
578 * for the system to work (i.e. proper HEAD
579 * tracking of the rings, interrupt handling,
580 * etc). It also gives us the lowest bounds
581 * for latency.
582 */
583
584 i915_request_get(request);
585 i915_request_add(request);
586 }
587 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
588 i915_request_put(request);
589
590 times[1] = ktime_sub(ktime_get_raw(), times[1]);
591 if (prime == 1)
592 times[0] = times[1];
593
594 if (__igt_timeout(end_time, NULL))
595 break;
596 }
597 intel_engine_pm_put(engine);
598
599 err = igt_live_test_end(&t);
600 if (err)
601 return err;
602
603 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
604 engine->name,
605 ktime_to_ns(times[0]),
606 prime, div64_u64(ktime_to_ns(times[1]), prime));
607 }
608
609 return err;
610}
611
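/*
 * The following helpers exercise i915_request_cancel() at three points in
 * a request's life: before it has been submitted to the HW
 * (__cancel_inactive), while it is spinning on the GPU (__cancel_active),
 * and after it has already completed (__cancel_completed, where
 * cancellation must not set a fence error).
 */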
612static int __cancel_inactive(struct intel_engine_cs *engine)
613{
614 struct intel_context *ce;
615 struct igt_spinner spin;
616 struct i915_request *rq;
617 int err = 0;
618
619 if (igt_spinner_init(&spin, engine->gt))
620 return -ENOMEM;
621
622 ce = intel_context_create(engine);
623 if (IS_ERR(ce)) {
624 err = PTR_ERR(ce);
625 goto out_spin;
626 }
627
628 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
629 if (IS_ERR(rq)) {
630 err = PTR_ERR(rq);
631 goto out_ce;
632 }
633
634 pr_debug("%s: Cancelling inactive request\n", engine->name);
635 i915_request_cancel(rq, -EINTR);
636 i915_request_get(rq);
637 i915_request_add(rq);
638
639 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
640 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
641
642 pr_err("%s: Failed to cancel inactive request\n", engine->name);
643 intel_engine_dump(engine, &p, "%s\n", engine->name);
644 err = -ETIME;
645 goto out_rq;
646 }
647
648 if (rq->fence.error != -EINTR) {
649 pr_err("%s: fence not cancelled (%u)\n",
650 engine->name, rq->fence.error);
651 err = -EINVAL;
652 }
653
654out_rq:
655 i915_request_put(rq);
656out_ce:
657 intel_context_put(ce);
658out_spin:
659 igt_spinner_fini(&spin);
660 if (err)
661 pr_err("%s: %s error %d\n", __func__, engine->name, err);
662 return err;
663}
664
665static int __cancel_active(struct intel_engine_cs *engine)
666{
667 struct intel_context *ce;
668 struct igt_spinner spin;
669 struct i915_request *rq;
670 int err = 0;
671
672 if (igt_spinner_init(&spin, engine->gt))
673 return -ENOMEM;
674
675 ce = intel_context_create(engine);
676 if (IS_ERR(ce)) {
677 err = PTR_ERR(ce);
678 goto out_spin;
679 }
680
681 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
682 if (IS_ERR(rq)) {
683 err = PTR_ERR(rq);
684 goto out_ce;
685 }
686
687 pr_debug("%s: Cancelling active request\n", engine->name);
688 i915_request_get(rq);
689 i915_request_add(rq);
690 if (!igt_wait_for_spinner(&spin, rq)) {
691 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
692
693 pr_err("Failed to start spinner on %s\n", engine->name);
694 intel_engine_dump(engine, &p, "%s\n", engine->name);
695 err = -ETIME;
696 goto out_rq;
697 }
698 i915_request_cancel(rq, -EINTR);
699
700 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
701 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
702
703 pr_err("%s: Failed to cancel active request\n", engine->name);
704 intel_engine_dump(engine, &p, "%s\n", engine->name);
705 err = -ETIME;
706 goto out_rq;
707 }
708
709 if (rq->fence.error != -EINTR) {
710 pr_err("%s: fence not cancelled (%u)\n",
711 engine->name, rq->fence.error);
712 err = -EINVAL;
713 }
714
715out_rq:
716 i915_request_put(rq);
717out_ce:
718 intel_context_put(ce);
719out_spin:
720 igt_spinner_fini(&spin);
721 if (err)
722 pr_err("%s: %s error %d\n", __func__, engine->name, err);
723 return err;
724}
725
726static int __cancel_completed(struct intel_engine_cs *engine)
727{
728 struct intel_context *ce;
729 struct igt_spinner spin;
730 struct i915_request *rq;
731 int err = 0;
732
733 if (igt_spinner_init(&spin, engine->gt))
734 return -ENOMEM;
735
736 ce = intel_context_create(engine);
737 if (IS_ERR(ce)) {
738 err = PTR_ERR(ce);
739 goto out_spin;
740 }
741
742 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
743 if (IS_ERR(rq)) {
744 err = PTR_ERR(rq);
745 goto out_ce;
746 }
747 igt_spinner_end(&spin);
748 i915_request_get(rq);
749 i915_request_add(rq);
750
751 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
752 err = -ETIME;
753 goto out_rq;
754 }
755
756 pr_debug("%s: Cancelling completed request\n", engine->name);
757 i915_request_cancel(rq, -EINTR);
758 if (rq->fence.error) {
759 pr_err("%s: fence error set on completed request (%u)\n",
760 engine->name, rq->fence.error);
761 err = -EINVAL;
762 }
763
764out_rq:
765 i915_request_put(rq);
766out_ce:
767 intel_context_put(ce);
768out_spin:
769 igt_spinner_fini(&spin);
770 if (err)
771 pr_err("%s: %s error %d\n", __func__, engine->name, err);
772 return err;
773}
774
775static int live_cancel_request(void *arg)
776{
777 struct drm_i915_private *i915 = arg;
778 struct intel_engine_cs *engine;
779
780 /*
781 * Check cancellation of requests. We expect to be able to immediately
782 * cancel active requests, even if they are currently on the GPU.
783 */
784
785 for_each_uabi_engine(engine, i915) {
786 struct igt_live_test t;
787 int err, err2;
788
789 if (!intel_engine_has_preemption(engine))
790 continue;
791
792 err = igt_live_test_begin(&t, i915, __func__, engine->name);
793 if (err)
794 return err;
795
796 err = __cancel_inactive(engine);
797 if (err == 0)
798 err = __cancel_active(engine);
799 if (err == 0)
800 err = __cancel_completed(engine);
801
802 err2 = igt_live_test_end(&t);
803 if (err)
804 return err;
805 if (err2)
806 return err2;
807 }
808
809 return 0;
810}
811
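/*
 * Build a single page batch containing just MI_BATCH_BUFFER_END, pinned
 * into the GGTT and synced (i915_vma_sync()) up front, so that
 * live_empty_request() measures submission overhead rather than the
 * initial binding.
 */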
812static struct i915_vma *empty_batch(struct drm_i915_private *i915)
813{
814 struct drm_i915_gem_object *obj;
815 struct i915_vma *vma;
816 u32 *cmd;
817 int err;
818
819 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
820 if (IS_ERR(obj))
821 return ERR_CAST(obj);
822
823 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
824 if (IS_ERR(cmd)) {
825 err = PTR_ERR(cmd);
826 goto err;
827 }
828
829 *cmd = MI_BATCH_BUFFER_END;
830
831 __i915_gem_object_flush_map(obj, 0, 64);
832 i915_gem_object_unpin_map(obj);
833
834 intel_gt_chipset_flush(&i915->gt);
835
836 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
837 if (IS_ERR(vma)) {
838 err = PTR_ERR(vma);
839 goto err;
840 }
841
842 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
843 if (err)
844 goto err;
845
846 /* Force the wait now to avoid including it in the benchmark */
847 err = i915_vma_sync(vma);
848 if (err)
849 goto err_pin;
850
851 return vma;
852
853err_pin:
854 i915_vma_unpin(vma);
855err:
856 i915_gem_object_put(obj);
857 return ERR_PTR(err);
858}
859
860static struct i915_request *
861empty_request(struct intel_engine_cs *engine,
862 struct i915_vma *batch)
863{
864 struct i915_request *request;
865 int err;
866
867 request = i915_request_create(engine->kernel_context);
868 if (IS_ERR(request))
869 return request;
870
871 err = engine->emit_bb_start(request,
872 batch->node.start,
873 batch->node.size,
874 I915_DISPATCH_SECURE);
875 if (err)
876 goto out_request;
877
878 i915_request_get(request);
879out_request:
880 i915_request_add(request);
881 return err ? ERR_PTR(err) : request;
882}
883
884static int live_empty_request(void *arg)
885{
886 struct drm_i915_private *i915 = arg;
887 struct intel_engine_cs *engine;
888 struct igt_live_test t;
889 struct i915_vma *batch;
890 int err = 0;
891
892 /*
893 * Submit various sized batches of empty requests, to each engine
894 * (individually), and wait for the batch to complete. We can check
895 * the overhead of submitting requests to the hardware.
896 */
897
898 batch = empty_batch(i915);
899 if (IS_ERR(batch))
900 return PTR_ERR(batch);
901
902 for_each_uabi_engine(engine, i915) {
903 IGT_TIMEOUT(end_time);
904 struct i915_request *request;
905 unsigned long n, prime;
906 ktime_t times[2] = {};
907
908 err = igt_live_test_begin(&t, i915, __func__, engine->name);
909 if (err)
910 goto out_batch;
911
912 intel_engine_pm_get(engine);
913
914 /* Warmup / preload */
915 request = empty_request(engine, batch);
916 if (IS_ERR(request)) {
917 err = PTR_ERR(request);
918 intel_engine_pm_put(engine);
919 goto out_batch;
920 }
921 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
922
923 for_each_prime_number_from(prime, 1, 8192) {
924 times[1] = ktime_get_raw();
925
926 for (n = 0; n < prime; n++) {
927 i915_request_put(request);
928 request = empty_request(engine, batch);
929 if (IS_ERR(request)) {
930 err = PTR_ERR(request);
931 intel_engine_pm_put(engine);
932 goto out_batch;
933 }
934 }
935 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
936
937 times[1] = ktime_sub(ktime_get_raw(), times[1]);
938 if (prime == 1)
939 times[0] = times[1];
940
941 if (__igt_timeout(end_time, NULL))
942 break;
943 }
944 i915_request_put(request);
945 intel_engine_pm_put(engine);
946
947 err = igt_live_test_end(&t);
948 if (err)
949 goto out_batch;
950
951 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
952 engine->name,
953 ktime_to_ns(times[0]),
954 prime, div64_u64(ktime_to_ns(times[1]), prime));
955 }
956
957out_batch:
958 i915_vma_unpin(batch);
959 i915_vma_put(batch);
960 return err;
961}
962
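/*
 * Build a batch whose first instruction is an MI_BATCH_BUFFER_START that
 * jumps back to itself, so any request executing it spins on the GPU
 * indefinitely. recursive_batch_resolve() later overwrites that first
 * dword with MI_BATCH_BUFFER_END to let the request terminate.
 */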
963static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
964{
965 struct drm_i915_gem_object *obj;
966 const int ver = GRAPHICS_VER(i915);
967 struct i915_vma *vma;
968 u32 *cmd;
969 int err;
970
971 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
972 if (IS_ERR(obj))
973 return ERR_CAST(obj);
974
975 vma = i915_vma_instance(obj, i915->gt.vm, NULL);
976 if (IS_ERR(vma)) {
977 err = PTR_ERR(vma);
978 goto err;
979 }
980
981 err = i915_vma_pin(vma, 0, 0, PIN_USER);
982 if (err)
983 goto err;
984
985 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
986 if (IS_ERR(cmd)) {
987 err = PTR_ERR(cmd);
988 goto err;
989 }
990
991 if (ver >= 8) {
992 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
993 *cmd++ = lower_32_bits(vma->node.start);
994 *cmd++ = upper_32_bits(vma->node.start);
995 } else if (ver >= 6) {
996 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
997 *cmd++ = lower_32_bits(vma->node.start);
998 } else {
999 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1000 *cmd++ = lower_32_bits(vma->node.start);
1001 }
1002 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1003
1004 __i915_gem_object_flush_map(obj, 0, 64);
1005 i915_gem_object_unpin_map(obj);
1006
1007 intel_gt_chipset_flush(&i915->gt);
1008
1009 return vma;
1010
1011err:
1012 i915_gem_object_put(obj);
1013 return ERR_PTR(err);
1014}
1015
1016static int recursive_batch_resolve(struct i915_vma *batch)
1017{
1018 u32 *cmd;
1019
1020 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1021 if (IS_ERR(cmd))
1022 return PTR_ERR(cmd);
1023
1024 *cmd = MI_BATCH_BUFFER_END;
1025
1026 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1027 i915_gem_object_unpin_map(batch->obj);
1028
1029 intel_gt_chipset_flush(batch->vm->gt);
1030
1031 return 0;
1032}
1033
1034static int live_all_engines(void *arg)
1035{
1036 struct drm_i915_private *i915 = arg;
1037 const unsigned int nengines = num_uabi_engines(i915);
1038 struct intel_engine_cs *engine;
1039 struct i915_request **request;
1040 struct igt_live_test t;
1041 struct i915_vma *batch;
1042 unsigned int idx;
1043 int err;
1044
1045 /*
1046 * Check we can submit requests to all engines simultaneously. We
1047 * send a recursive batch to each engine - checking that we don't
1048 * block doing so, and that they don't complete too soon.
1049 */
1050
1051 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1052 if (!request)
1053 return -ENOMEM;
1054
1055 err = igt_live_test_begin(&t, i915, __func__, "");
1056 if (err)
1057 goto out_free;
1058
1059 batch = recursive_batch(i915);
1060 if (IS_ERR(batch)) {
1061 err = PTR_ERR(batch);
1062 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1063 goto out_free;
1064 }
1065
1066 i915_vma_lock(batch);
1067
1068 idx = 0;
1069 for_each_uabi_engine(engine, i915) {
1070 request[idx] = intel_engine_create_kernel_request(engine);
1071 if (IS_ERR(request[idx])) {
1072 err = PTR_ERR(request[idx]);
1073 pr_err("%s: Request allocation failed with err=%d\n",
1074 __func__, err);
1075 goto out_request;
1076 }
1077
1078 err = i915_request_await_object(request[idx], batch->obj, 0);
1079 if (err == 0)
1080 err = i915_vma_move_to_active(batch, request[idx], 0);
1081 GEM_BUG_ON(err);
1082
1083 err = engine->emit_bb_start(request[idx],
1084 batch->node.start,
1085 batch->node.size,
1086 0);
1087 GEM_BUG_ON(err);
1088 request[idx]->batch = batch;
1089
1090 i915_request_get(request[idx]);
1091 i915_request_add(request[idx]);
1092 idx++;
1093 }
1094
1095 i915_vma_unlock(batch);
1096
1097 idx = 0;
1098 for_each_uabi_engine(engine, i915) {
1099 if (i915_request_completed(request[idx])) {
1100 pr_err("%s(%s): request completed too early!\n",
1101 __func__, engine->name);
1102 err = -EINVAL;
1103 goto out_request;
1104 }
1105 idx++;
1106 }
1107
1108 err = recursive_batch_resolve(batch);
1109 if (err) {
1110 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1111 goto out_request;
1112 }
1113
1114 idx = 0;
1115 for_each_uabi_engine(engine, i915) {
1116 long timeout;
1117
1118 timeout = i915_request_wait(request[idx], 0,
1119 MAX_SCHEDULE_TIMEOUT);
1120 if (timeout < 0) {
1121 err = timeout;
1122 pr_err("%s: error waiting for request on %s, err=%d\n",
1123 __func__, engine->name, err);
1124 goto out_request;
1125 }
1126
1127 GEM_BUG_ON(!i915_request_completed(request[idx]));
1128 i915_request_put(request[idx]);
1129 request[idx] = NULL;
1130 idx++;
1131 }
1132
1133 err = igt_live_test_end(&t);
1134
1135out_request:
1136 idx = 0;
1137 for_each_uabi_engine(engine, i915) {
1138 if (request[idx])
1139 i915_request_put(request[idx]);
1140 idx++;
1141 }
1142 i915_vma_unpin(batch);
1143 i915_vma_put(batch);
1144out_free:
1145 kfree(request);
1146 return err;
1147}
1148
1149static int live_sequential_engines(void *arg)
1150{
1151 struct drm_i915_private *i915 = arg;
1152 const unsigned int nengines = num_uabi_engines(i915);
1153 struct i915_request **request;
1154 struct i915_request *prev = NULL;
1155 struct intel_engine_cs *engine;
1156 struct igt_live_test t;
1157 unsigned int idx;
1158 int err;
1159
1160 /*
1161 * Check we can submit requests to all engines sequentially, such
1162 * that each successive request waits for the earlier ones. This
1163 * tests that we don't execute requests out of order, even though
1164 * they are running on independent engines.
1165 */
1166
1167 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1168 if (!request)
1169 return -ENOMEM;
1170
1171 err = igt_live_test_begin(&t, i915, __func__, "");
1172 if (err)
1173 goto out_free;
1174
1175 idx = 0;
1176 for_each_uabi_engine(engine, i915) {
1177 struct i915_vma *batch;
1178
1179 batch = recursive_batch(i915);
1180 if (IS_ERR(batch)) {
1181 err = PTR_ERR(batch);
1182 pr_err("%s: Unable to create batch for %s, err=%d\n",
1183 __func__, engine->name, err);
1184 goto out_free;
1185 }
1186
1187 i915_vma_lock(batch);
1188 request[idx] = intel_engine_create_kernel_request(engine);
1189 if (IS_ERR(request[idx])) {
1190 err = PTR_ERR(request[idx]);
1191 pr_err("%s: Request allocation failed for %s with err=%d\n",
1192 __func__, engine->name, err);
1193 goto out_unlock;
1194 }
1195
1196 if (prev) {
1197 err = i915_request_await_dma_fence(request[idx],
1198 &prev->fence);
1199 if (err) {
1200 i915_request_add(request[idx]);
1201 pr_err("%s: Request await failed for %s with err=%d\n",
1202 __func__, engine->name, err);
1203 goto out_unlock;
1204 }
1205 }
1206
1207 err = i915_request_await_object(request[idx],
1208 batch->obj, false);
1209 if (err == 0)
1210 err = i915_vma_move_to_active(batch, request[idx], 0);
1211 GEM_BUG_ON(err);
1212
1213 err = engine->emit_bb_start(request[idx],
1214 batch->node.start,
1215 batch->node.size,
1216 0);
1217 GEM_BUG_ON(err);
1218 request[idx]->batch = batch;
1219
1220 i915_request_get(request[idx]);
1221 i915_request_add(request[idx]);
1222
1223 prev = request[idx];
1224 idx++;
1225
1226out_unlock:
1227 i915_vma_unlock(batch);
1228 if (err)
1229 goto out_request;
1230 }
1231
1232 idx = 0;
1233 for_each_uabi_engine(engine, i915) {
1234 long timeout;
1235
1236 if (i915_request_completed(request[idx])) {
1237 pr_err("%s(%s): request completed too early!\n",
1238 __func__, engine->name);
1239 err = -EINVAL;
1240 goto out_request;
1241 }
1242
1243 err = recursive_batch_resolve(request[idx]->batch);
1244 if (err) {
1245 pr_err("%s: failed to resolve batch, err=%d\n",
1246 __func__, err);
1247 goto out_request;
1248 }
1249
1250 timeout = i915_request_wait(request[idx], 0,
1251 MAX_SCHEDULE_TIMEOUT);
1252 if (timeout < 0) {
1253 err = timeout;
1254 pr_err("%s: error waiting for request on %s, err=%d\n",
1255 __func__, engine->name, err);
1256 goto out_request;
1257 }
1258
1259 GEM_BUG_ON(!i915_request_completed(request[idx]));
1260 idx++;
1261 }
1262
1263 err = igt_live_test_end(&t);
1264
1265out_request:
1266 idx = 0;
1267 for_each_uabi_engine(engine, i915) {
1268 u32 *cmd;
1269
1270 if (!request[idx])
1271 break;
1272
1273 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1274 I915_MAP_WC);
1275 if (!IS_ERR(cmd)) {
1276 *cmd = MI_BATCH_BUFFER_END;
1277
1278 __i915_gem_object_flush_map(request[idx]->batch->obj,
1279 0, sizeof(*cmd));
1280 i915_gem_object_unpin_map(request[idx]->batch->obj);
1281
1282 intel_gt_chipset_flush(engine->gt);
1283 }
1284
1285 i915_vma_put(request[idx]->batch);
1286 i915_request_put(request[idx]);
1287 idx++;
1288 }
1289out_free:
1290 kfree(request);
1291 return err;
1292}
1293
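/*
 * Thread bodies for live_parallel_engines(), one thread per engine:
 * engine1 submits a single request at a time and synchronously waits on
 * each, engineN queues requests back-to-back without waiting, and the
 * spin variant below parks a spinner on its engine so we can check that
 * all engines really execute independently.
 */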
1294static int __live_parallel_engine1(void *arg)
1295{
1296 struct intel_engine_cs *engine = arg;
1297 IGT_TIMEOUT(end_time);
1298 unsigned long count;
1299 int err = 0;
1300
1301 count = 0;
1302 intel_engine_pm_get(engine);
1303 do {
1304 struct i915_request *rq;
1305
1306 rq = i915_request_create(engine->kernel_context);
1307 if (IS_ERR(rq)) {
1308 err = PTR_ERR(rq);
1309 break;
1310 }
1311
1312 i915_request_get(rq);
1313 i915_request_add(rq);
1314
1315 err = 0;
1316 if (i915_request_wait(rq, 0, HZ / 5) < 0)
1317 err = -ETIME;
1318 i915_request_put(rq);
1319 if (err)
1320 break;
1321
1322 count++;
1323 } while (!__igt_timeout(end_time, NULL));
1324 intel_engine_pm_put(engine);
1325
1326 pr_info("%s: %lu request + sync\n", engine->name, count);
1327 return err;
1328}
1329
1330static int __live_parallel_engineN(void *arg)
1331{
1332 struct intel_engine_cs *engine = arg;
1333 IGT_TIMEOUT(end_time);
1334 unsigned long count;
1335 int err = 0;
1336
1337 count = 0;
1338 intel_engine_pm_get(engine);
1339 do {
1340 struct i915_request *rq;
1341
1342 rq = i915_request_create(engine->kernel_context);
1343 if (IS_ERR(rq)) {
1344 err = PTR_ERR(rq);
1345 break;
1346 }
1347
1348 i915_request_add(rq);
1349 count++;
1350 } while (!__igt_timeout(end_time, NULL));
1351 intel_engine_pm_put(engine);
1352
1353 pr_info("%s: %lu requests\n", engine->name, count);
1354 return err;
1355}
1356
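/*
 * i915->selftest.counter is preset to the number of engines by
 * live_parallel_engines(); wake_all() decrements it (waking sleepers once
 * the last thread arrives) and wait_for_all() blocks until every engine
 * has checked in, or the selftest timeout expires.
 */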
1357static bool wake_all(struct drm_i915_private *i915)
1358{
1359 if (atomic_dec_and_test(&i915->selftest.counter)) {
1360 wake_up_var(&i915->selftest.counter);
1361 return true;
1362 }
1363
1364 return false;
1365}
1366
1367static int wait_for_all(struct drm_i915_private *i915)
1368{
1369 if (wake_all(i915))
1370 return 0;
1371
1372 if (wait_var_event_timeout(&i915->selftest.counter,
1373 !atomic_read(&i915->selftest.counter),
1374 i915_selftest.timeout_jiffies))
1375 return 0;
1376
1377 return -ETIME;
1378}
1379
1380static int __live_parallel_spin(void *arg)
1381{
1382 struct intel_engine_cs *engine = arg;
1383 struct igt_spinner spin;
1384 struct i915_request *rq;
1385 int err = 0;
1386
1387 /*
1388 * Create a spinner running for eternity on each engine. If a second
1389 * spinner is incorrectly placed on the same engine, it will not be
1390 * able to start in time.
1391 */
1392
1393 if (igt_spinner_init(&spin, engine->gt)) {
1394 wake_all(engine->i915);
1395 return -ENOMEM;
1396 }
1397
1398 intel_engine_pm_get(engine);
1399 rq = igt_spinner_create_request(&spin,
1400 engine->kernel_context,
1401 MI_NOOP); /* no preemption */
1402 intel_engine_pm_put(engine);
1403 if (IS_ERR(rq)) {
1404 err = PTR_ERR(rq);
1405 if (err == -ENODEV)
1406 err = 0;
1407 wake_all(engine->i915);
1408 goto out_spin;
1409 }
1410
1411 i915_request_get(rq);
1412 i915_request_add(rq);
1413 if (igt_wait_for_spinner(&spin, rq)) {
1414 /* Occupy this engine for the whole test */
1415 err = wait_for_all(engine->i915);
1416 } else {
1417 pr_err("Failed to start spinner on %s\n", engine->name);
1418 err = -EINVAL;
1419 }
1420 igt_spinner_end(&spin);
1421
1422 if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1423 err = -EIO;
1424 i915_request_put(rq);
1425
1426out_spin:
1427 igt_spinner_fini(&spin);
1428 return err;
1429}
1430
1431static int live_parallel_engines(void *arg)
1432{
1433 struct drm_i915_private *i915 = arg;
1434 static int (* const func[])(void *arg) = {
1435 __live_parallel_engine1,
1436 __live_parallel_engineN,
1437 __live_parallel_spin,
1438 NULL,
1439 };
1440 const unsigned int nengines = num_uabi_engines(i915);
1441 struct intel_engine_cs *engine;
1442 int (* const *fn)(void *arg);
1443 struct task_struct **tsk;
1444 int err = 0;
1445
1446 /*
1447 * Check we can submit requests to all engines concurrently. This
1448 * tests that we load up the system maximally.
1449 */
1450
1451 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1452 if (!tsk)
1453 return -ENOMEM;
1454
1455 for (fn = func; !err && *fn; fn++) {
1456 char name[KSYM_NAME_LEN];
1457 struct igt_live_test t;
1458 unsigned int idx;
1459
1460 snprintf(name, sizeof(name), "%ps", *fn);
1461 err = igt_live_test_begin(&t, i915, __func__, name);
1462 if (err)
1463 break;
1464
1465 atomic_set(&i915->selftest.counter, nengines);
1466
1467 idx = 0;
1468 for_each_uabi_engine(engine, i915) {
1469 tsk[idx] = kthread_run(*fn, engine,
1470 "igt/parallel:%s",
1471 engine->name);
1472 if (IS_ERR(tsk[idx])) {
1473 err = PTR_ERR(tsk[idx]);
1474 break;
1475 }
1476 get_task_struct(tsk[idx++]);
1477 }
1478
1479 yield(); /* start all threads before we kthread_stop() */
1480
1481 idx = 0;
1482 for_each_uabi_engine(engine, i915) {
1483 int status;
1484
1485 if (IS_ERR(tsk[idx]))
1486 break;
1487
1488 status = kthread_stop(tsk[idx]);
1489 if (status && !err)
1490 err = status;
1491
1492 put_task_struct(tsk[idx++]);
1493 }
1494
1495 if (igt_live_test_end(&t))
1496 err = -EIO;
1497 }
1498
1499 kfree(tsk);
1500 return err;
1501}
1502
1503static int
1504max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1505{
1506 struct i915_request *rq;
1507 int ret;
1508
1509 /*
1510 * Before execlists, all contexts share the same ringbuffer. With
1511 * execlists, each context/engine has a separate ringbuffer and
1512 * for the purposes of this test, inexhaustible.
1513 *
1514 * For the global ringbuffer though, we have to be very careful
1515 * that we do not wrap while preventing the execution of requests
1516 * with a unsignaled fence.
1517 */
1518 if (HAS_EXECLISTS(ctx->i915))
1519 return INT_MAX;
1520
1521 rq = igt_request_alloc(ctx, engine);
1522 if (IS_ERR(rq)) {
1523 ret = PTR_ERR(rq);
1524 } else {
1525 int sz;
1526
1527 ret = rq->ring->size - rq->reserved_space;
1528 i915_request_add(rq);
1529
1530 sz = rq->ring->emit - rq->head;
1531 if (sz < 0)
1532 sz += rq->ring->size;
1533 ret /= sz;
1534 ret /= 2; /* leave half spare, in case of emergency! */
1535 }
1536
1537 return ret;
1538}
1539
1540static int live_breadcrumbs_smoketest(void *arg)
1541{
1542 struct drm_i915_private *i915 = arg;
1543 const unsigned int nengines = num_uabi_engines(i915);
1544 const unsigned int ncpus = num_online_cpus();
1545 unsigned long num_waits, num_fences;
1546 struct intel_engine_cs *engine;
1547 struct task_struct **threads;
1548 struct igt_live_test live;
1549 intel_wakeref_t wakeref;
1550 struct smoketest *smoke;
1551 unsigned int n, idx;
1552 struct file *file;
1553 int ret = 0;
1554
1555 /*
1556 * Smoketest our breadcrumb/signal handling for requests across multiple
1557 * threads. A very simple test to only catch the most egregious of bugs.
1558 * See __igt_breadcrumbs_smoketest();
1559 *
1560 * On real hardware this time.
1561 */
1562
1563 wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1564
1565 file = mock_file(i915);
1566 if (IS_ERR(file)) {
1567 ret = PTR_ERR(file);
1568 goto out_rpm;
1569 }
1570
1571 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1572 if (!smoke) {
1573 ret = -ENOMEM;
1574 goto out_file;
1575 }
1576
1577 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1578 if (!threads) {
1579 ret = -ENOMEM;
1580 goto out_smoke;
1581 }
1582
1583 smoke[0].request_alloc = __live_request_alloc;
1584 smoke[0].ncontexts = 64;
1585 smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1586 sizeof(*smoke[0].contexts),
1587 GFP_KERNEL);
1588 if (!smoke[0].contexts) {
1589 ret = -ENOMEM;
1590 goto out_threads;
1591 }
1592
1593 for (n = 0; n < smoke[0].ncontexts; n++) {
1594 smoke[0].contexts[n] = live_context(i915, file);
1595 if (IS_ERR(smoke[0].contexts[n])) {
1596 ret = PTR_ERR(smoke[0].contexts[n]);
1597 goto out_contexts;
1598 }
1599 }
1600
1601 ret = igt_live_test_begin(&live, i915, __func__, "");
1602 if (ret)
1603 goto out_contexts;
1604
1605 idx = 0;
1606 for_each_uabi_engine(engine, i915) {
1607 smoke[idx] = smoke[0];
1608 smoke[idx].engine = engine;
1609 smoke[idx].max_batch =
1610 max_batches(smoke[0].contexts[0], engine);
1611 if (smoke[idx].max_batch < 0) {
1612 ret = smoke[idx].max_batch;
1613 goto out_flush;
1614 }
1615 /* One ring interleaved between requests from all cpus */
1616 smoke[idx].max_batch /= num_online_cpus() + 1;
1617 pr_debug("Limiting batches to %d requests on %s\n",
1618 smoke[idx].max_batch, engine->name);
1619
1620 for (n = 0; n < ncpus; n++) {
1621 struct task_struct *tsk;
1622
1623 tsk = kthread_run(__igt_breadcrumbs_smoketest,
1624 &smoke[idx], "igt/%d.%d", idx, n);
1625 if (IS_ERR(tsk)) {
1626 ret = PTR_ERR(tsk);
1627 goto out_flush;
1628 }
1629
1630 get_task_struct(tsk);
1631 threads[idx * ncpus + n] = tsk;
1632 }
1633
1634 idx++;
1635 }
1636
1637 yield(); /* start all threads before we begin */
1638 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1639
1640out_flush:
1641 idx = 0;
1642 num_waits = 0;
1643 num_fences = 0;
1644 for_each_uabi_engine(engine, i915) {
1645 for (n = 0; n < ncpus; n++) {
1646 struct task_struct *tsk = threads[idx * ncpus + n];
1647 int err;
1648
1649 if (!tsk)
1650 continue;
1651
1652 err = kthread_stop(tsk);
1653 if (err < 0 && !ret)
1654 ret = err;
1655
1656 put_task_struct(tsk);
1657 }
1658
1659 num_waits += atomic_long_read(&smoke[idx].num_waits);
1660 num_fences += atomic_long_read(&smoke[idx].num_fences);
1661 idx++;
1662 }
1663 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1664 num_waits, num_fences, idx, ncpus);
1665
1666 ret = igt_live_test_end(&live) ?: ret;
1667out_contexts:
1668 kfree(smoke[0].contexts);
1669out_threads:
1670 kfree(threads);
1671out_smoke:
1672 kfree(smoke);
1673out_file:
1674 fput(file);
1675out_rpm:
1676 intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1677
1678 return ret;
1679}
1680
1681int i915_request_live_selftests(struct drm_i915_private *i915)
1682{
1683 static const struct i915_subtest tests[] = {
1684 SUBTEST(live_nop_request),
1685 SUBTEST(live_all_engines),
1686 SUBTEST(live_sequential_engines),
1687 SUBTEST(live_parallel_engines),
1688 SUBTEST(live_empty_request),
1689 SUBTEST(live_cancel_request),
1690 SUBTEST(live_breadcrumbs_smoketest),
1691 };
1692
1693 if (intel_gt_is_wedged(&i915->gt))
1694 return 0;
1695
1696 return i915_subtests(tests, i915);
1697}
1698
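/*
 * Queue a kernel context request ordered after @ce's most recent request,
 * wait for it to complete and then for the engine to idle. An error
 * already recorded in @err takes precedence over a timeout here.
 */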
1699static int switch_to_kernel_sync(struct intel_context *ce, int err)
1700{
1701 struct i915_request *rq;
1702 struct dma_fence *fence;
1703
1704 rq = intel_engine_create_kernel_request(ce->engine);
1705 if (IS_ERR(rq))
1706 return PTR_ERR(rq);
1707
1708 fence = i915_active_fence_get(&ce->timeline->last_request);
1709 if (fence) {
1710 i915_request_await_dma_fence(rq, fence);
1711 dma_fence_put(fence);
1712 }
1713
1714 rq = i915_request_get(rq);
1715 i915_request_add(rq);
1716 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1717 err = -ETIME;
1718 i915_request_put(rq);
1719
1720 while (!err && !intel_engine_is_idle(ce->engine))
1721 intel_engine_flush_submission(ce->engine);
1722
1723 return err;
1724}
1725
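/*
 * perf_stats accumulates per-engine results (request count, elapsed wall
 * time, busy time and total runtime); perf_series bundles the contexts
 * that are measured together as one run.
 */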
1726struct perf_stats {
1727 struct intel_engine_cs *engine;
1728 unsigned long count;
1729 ktime_t time;
1730 ktime_t busy;
1731 u64 runtime;
1732};
1733
1734struct perf_series {
1735 struct drm_i915_private *i915;
1736 unsigned int nengines;
1737 struct intel_context *ce[];
1738};
1739
1740static int cmp_u32(const void *A, const void *B)
1741{
1742 const u32 *a = A, *b = B;
1743
1744 return *a - *b;
1745}
1746
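/*
 * Median-ish filter over TF_COUNT (5) samples: sort them and return a
 * 1:2:1 weighted sum of the middle three, i.e. 4x the filtered value.
 * For example {3, 1, 4, 1, 5} sorts to {1, 1, 3, 4, 5} and yields
 * 1 + 2*3 + 4 = 11 == 4 * 2.75. The scaling is undone by the callers
 * via ">> TF_BIAS" or by cycles_to_ns() below.
 */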
1747static u32 trifilter(u32 *a)
1748{
1749 u64 sum;
1750
1751#define TF_COUNT 5
1752 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1753
1754 sum = mul_u32_u32(a[2], 2);
1755 sum += a[1];
1756 sum += a[3];
1757
1758 GEM_BUG_ON(sum > U32_MAX);
1759 return sum;
1760#define TF_BIAS 2
1761}
1762
1763static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1764{
1765 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1766
1767 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1768}
1769
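/*
 * Command stream helpers for the latency measurements: store the engine's
 * RING_TIMESTAMP register to a GGTT offset, store an immediate dword, and
 * poll a GGTT semaphore until it compares (e.g. equal) against a value.
 */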
1770static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1771{
1772 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1773 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1774 *cs++ = offset;
1775 *cs++ = 0;
1776
1777 return cs;
1778}
1779
1780static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1781{
1782 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1783 *cs++ = offset;
1784 *cs++ = 0;
1785 *cs++ = value;
1786
1787 return cs;
1788}
1789
1790static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1791{
1792 *cs++ = MI_SEMAPHORE_WAIT |
1793 MI_SEMAPHORE_GLOBAL_GTT |
1794 MI_SEMAPHORE_POLL |
1795 mode;
1796 *cs++ = value;
1797 *cs++ = offset;
1798 *cs++ = 0;
1799
1800 return cs;
1801}
1802
1803static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1804{
1805 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1806}
1807
1808static void semaphore_set(u32 *sema, u32 value)
1809{
1810 WRITE_ONCE(*sema, value);
1811 wmb(); /* flush the update to the cache, and beyond */
1812}
1813
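/*
 * The measurements use a small scratch region inside the engine's status
 * page (HWSP) for semaphores and timestamp slots: hwsp_scratch() zeroes it
 * and returns a CPU pointer, while hwsp_offset() converts a pointer into
 * that page to the GGTT offset used by the emitted GPU commands.
 */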
1814static u32 *hwsp_scratch(const struct intel_context *ce)
1815{
1816 return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1817}
1818
1819static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1820{
1821 return (i915_ggtt_offset(ce->engine->status_page.vma) +
1822 offset_in_page(dw));
1823}
1824
1825static int measure_semaphore_response(struct intel_context *ce)
1826{
1827 u32 *sema = hwsp_scratch(ce);
1828 const u32 offset = hwsp_offset(ce, sema);
1829 u32 elapsed[TF_COUNT], cycles;
1830 struct i915_request *rq;
1831 u32 *cs;
1832 int err;
1833 int i;
1834
1835 /*
1836 * Measure how many cycles it takes for the HW to detect the change
1837 * in a semaphore value.
1838 *
1839 * A: read CS_TIMESTAMP from CPU
1840 * poke semaphore
1841 * B: read CS_TIMESTAMP on GPU
1842 *
1843 * Semaphore latency: B - A
1844 */
1845
1846 semaphore_set(sema, -1);
1847
1848 rq = i915_request_create(ce);
1849 if (IS_ERR(rq))
1850 return PTR_ERR(rq);
1851
1852 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1853 if (IS_ERR(cs)) {
1854 i915_request_add(rq);
1855 err = PTR_ERR(cs);
1856 goto err;
1857 }
1858
1859 cs = emit_store_dw(cs, offset, 0);
1860 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1861 cs = emit_semaphore_poll_until(cs, offset, i);
1862 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1863 cs = emit_store_dw(cs, offset, 0);
1864 }
1865
1866 intel_ring_advance(rq, cs);
1867 i915_request_add(rq);
1868
1869 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1870 err = -EIO;
1871 goto err;
1872 }
1873
1874 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1875 preempt_disable();
1876 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1877 semaphore_set(sema, i);
1878 preempt_enable();
1879
1880 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1881 err = -EIO;
1882 goto err;
1883 }
1884
1885 elapsed[i - 1] = sema[i] - cycles;
1886 }
1887
1888 cycles = trifilter(elapsed);
1889 pr_info("%s: semaphore response %d cycles, %lluns\n",
1890 ce->engine->name, cycles >> TF_BIAS,
1891 cycles_to_ns(ce->engine, cycles));
1892
1893 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1894
1895err:
1896 intel_gt_set_wedged(ce->engine->gt);
1897 return err;
1898}
1899
1900static int measure_idle_dispatch(struct intel_context *ce)
1901{
1902 u32 *sema = hwsp_scratch(ce);
1903 const u32 offset = hwsp_offset(ce, sema);
1904 u32 elapsed[TF_COUNT], cycles;
1905 u32 *cs;
1906 int err;
1907 int i;
1908
1909 /*
1910 * Measure how long it takes for us to submit a request while the
1911 * engine is idle, but is resting in our context.
1912 *
1913 * A: read CS_TIMESTAMP from CPU
1914 * submit request
1915 * B: read CS_TIMESTAMP on GPU
1916 *
1917 * Submission latency: B - A
1918 */
1919
1920 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1921 struct i915_request *rq;
1922
1923 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1924 if (err)
1925 return err;
1926
1927 rq = i915_request_create(ce);
1928 if (IS_ERR(rq)) {
1929 err = PTR_ERR(rq);
1930 goto err;
1931 }
1932
1933 cs = intel_ring_begin(rq, 4);
1934 if (IS_ERR(cs)) {
1935 i915_request_add(rq);
1936 err = PTR_ERR(cs);
1937 goto err;
1938 }
1939
1940 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1941
1942 intel_ring_advance(rq, cs);
1943
1944 preempt_disable();
1945 local_bh_disable();
1946 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1947 i915_request_add(rq);
1948 local_bh_enable();
1949 preempt_enable();
1950 }
1951
1952 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1953 if (err)
1954 goto err;
1955
1956 for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1957 elapsed[i] = sema[i] - elapsed[i];
1958
1959 cycles = trifilter(elapsed);
1960 pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1961 ce->engine->name, cycles >> TF_BIAS,
1962 cycles_to_ns(ce->engine, cycles));
1963
1964 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1965
1966err:
1967 intel_gt_set_wedged(ce->engine->gt);
1968 return err;
1969}
1970
1971static int measure_busy_dispatch(struct intel_context *ce)
1972{
1973 u32 *sema = hwsp_scratch(ce);
1974 const u32 offset = hwsp_offset(ce, sema);
1975 u32 elapsed[TF_COUNT + 1], cycles;
1976 u32 *cs;
1977 int err;
1978 int i;
1979
1980 /*
1981 * Measure how long it takes for us to submit a request while the
1982 * engine is busy, polling on a semaphore in our context. With
1983 * direct submission, this will include the cost of a lite restore.
1984 *
1985 * A: read CS_TIMESTAMP from CPU
1986 * submit request
1987 * B: read CS_TIMESTAMP on GPU
1988 *
1989 * Submission latency: B - A
1990 */
1991
1992 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1993 struct i915_request *rq;
1994
1995 rq = i915_request_create(ce);
1996 if (IS_ERR(rq)) {
1997 err = PTR_ERR(rq);
1998 goto err;
1999 }
2000
2001 cs = intel_ring_begin(rq, 12);
2002 if (IS_ERR(cs)) {
2003 i915_request_add(rq);
2004 err = PTR_ERR(cs);
2005 goto err;
2006 }
2007
2008 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2009 cs = emit_semaphore_poll_until(cs, offset, i);
2010 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2011
2012 intel_ring_advance(rq, cs);
2013
2014 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2015 err = -EIO;
2016 goto err;
2017 }
2018
2019 preempt_disable();
2020 local_bh_disable();
2021 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2022 i915_request_add(rq);
2023 local_bh_enable();
2024 semaphore_set(sema, i - 1);
2025 preempt_enable();
2026 }
2027
2028 wait_for(READ_ONCE(sema[i - 1]), 500);
2029 semaphore_set(sema, i - 1);
2030
2031 for (i = 1; i <= TF_COUNT; i++) {
2032 GEM_BUG_ON(sema[i] == -1);
2033 elapsed[i - 1] = sema[i] - elapsed[i];
2034 }
2035
2036 cycles = trifilter(elapsed);
2037 pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2038 ce->engine->name, cycles >> TF_BIAS,
2039 cycles_to_ns(ce->engine, cycles));
2040
2041 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2042
2043err:
2044 intel_gt_set_wedged(ce->engine->gt);
2045 return err;
2046}
2047
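/*
 * Stall @engine by submitting a kernel context request that busy-waits
 * (MI_SEMAPHORE_WAIT | POLL) on a status page semaphore; the measurement
 * requests queued behind it then run back-to-back once the CPU releases
 * the semaphore with semaphore_set().
 */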
2048static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2049{
2050 const u32 offset =
2051 i915_ggtt_offset(engine->status_page.vma) +
2052 offset_in_page(sema);
2053 struct i915_request *rq;
2054 u32 *cs;
2055
2056 rq = i915_request_create(engine->kernel_context);
2057 if (IS_ERR(rq))
2058 return PTR_ERR(rq);
2059
2060 cs = intel_ring_begin(rq, 4);
2061 if (IS_ERR(cs)) {
2062 i915_request_add(rq);
2063 return PTR_ERR(cs);
2064 }
2065
2066 cs = emit_semaphore_poll(cs, mode, value, offset);
2067
2068 intel_ring_advance(rq, cs);
2069 i915_request_add(rq);
2070
2071 return 0;
2072}
2073
2074static int measure_inter_request(struct intel_context *ce)
2075{
2076 u32 *sema = hwsp_scratch(ce);
2077 const u32 offset = hwsp_offset(ce, sema);
2078 u32 elapsed[TF_COUNT + 1], cycles;
2079 struct i915_sw_fence *submit;
2080 int i, err;
2081
2082 /*
2083 * Measure how long it takes to advance from one request into the
2084 * next. Between each request we flush the GPU caches to memory,
2085 * update the breadcrumbs, and then invalidate those caches.
2086 * We queue up all the requests to be submitted in one batch so
2087 * it should be one set of contiguous measurements.
2088 *
2089 * A: read CS_TIMESTAMP on GPU
2090 * advance request
2091 * B: read CS_TIMESTAMP on GPU
2092 *
2093 * Request latency: B - A
2094 */
2095
2096 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2097 if (err)
2098 return err;
2099
2100 submit = heap_fence_create(GFP_KERNEL);
2101 if (!submit) {
2102 semaphore_set(sema, 1);
2103 return -ENOMEM;
2104 }
2105
2106 intel_engine_flush_submission(ce->engine);
2107 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2108 struct i915_request *rq;
2109 u32 *cs;
2110
2111 rq = i915_request_create(ce);
2112 if (IS_ERR(rq)) {
2113 err = PTR_ERR(rq);
2114 goto err_submit;
2115 }
2116
2117 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2118 submit,
2119 GFP_KERNEL);
2120 if (err < 0) {
2121 i915_request_add(rq);
2122 goto err_submit;
2123 }
2124
2125 cs = intel_ring_begin(rq, 4);
2126 if (IS_ERR(cs)) {
2127 i915_request_add(rq);
2128 err = PTR_ERR(cs);
2129 goto err_submit;
2130 }
2131
2132 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2133
2134 intel_ring_advance(rq, cs);
2135 i915_request_add(rq);
2136 }
2137 i915_sw_fence_commit(submit);
2138 intel_engine_flush_submission(ce->engine);
2139 heap_fence_put(submit);
2140
2141 semaphore_set(sema, 1);
2142 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2143 if (err)
2144 goto err;
2145
2146 for (i = 1; i <= TF_COUNT; i++)
2147 elapsed[i - 1] = sema[i + 1] - sema[i];
2148
2149 cycles = trifilter(elapsed);
2150 pr_info("%s: inter-request latency %d cycles, %lluns\n",
2151 ce->engine->name, cycles >> TF_BIAS,
2152 cycles_to_ns(ce->engine, cycles));
2153
2154 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2155
2156err_submit:
2157 i915_sw_fence_commit(submit);
2158 heap_fence_put(submit);
2159 semaphore_set(sema, 1);
2160err:
2161 intel_gt_set_wedged(ce->engine->gt);
2162 return err;
2163}
2164
2165static int measure_context_switch(struct intel_context *ce)
2166{
2167 u32 *sema = hwsp_scratch(ce);
2168 const u32 offset = hwsp_offset(ce, sema);
2169 struct i915_request *fence = NULL;
2170 u32 elapsed[TF_COUNT + 1], cycles;
2171 int i, j, err;
2172 u32 *cs;
2173
2174 /*
2175 * Measure how long it takes to advance from one request in one
2176 * context to a request in another context. This allows us to
2177 * measure how long the context save/restore take, along with all
2178 * the inter-context setup we require.
2179 *
2180 * A: read CS_TIMESTAMP on GPU
2181 * switch context
2182 * B: read CS_TIMESTAMP on GPU
2183 *
2184 * Context switch latency: B - A
2185 */
2186
2187 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2188 if (err)
2189 return err;
2190
2191 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2192 struct intel_context *arr[] = {
2193 ce, ce->engine->kernel_context
2194 };
2195 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2196
2197 for (j = 0; j < ARRAY_SIZE(arr); j++) {
2198 struct i915_request *rq;
2199
2200 rq = i915_request_create(arr[j]);
2201 if (IS_ERR(rq)) {
2202 err = PTR_ERR(rq);
2203 goto err_fence;
2204 }
2205
2206 if (fence) {
2207 err = i915_request_await_dma_fence(rq,
2208 &fence->fence);
2209 if (err) {
2210 i915_request_add(rq);
2211 goto err_fence;
2212 }
2213 }
2214
2215 cs = intel_ring_begin(rq, 4);
2216 if (IS_ERR(cs)) {
2217 i915_request_add(rq);
2218 err = PTR_ERR(cs);
2219 goto err_fence;
2220 }
2221
2222 cs = emit_timestamp_store(cs, ce, addr);
2223 addr += sizeof(u32);
2224
2225 intel_ring_advance(rq, cs);
2226
2227 i915_request_put(fence);
2228 fence = i915_request_get(rq);
2229
2230 i915_request_add(rq);
2231 }
2232 }
2233 i915_request_put(fence);
2234 intel_engine_flush_submission(ce->engine);
2235
2236 semaphore_set(sema, 1);
2237 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2238 if (err)
2239 goto err;
2240
2241 for (i = 1; i <= TF_COUNT; i++)
2242 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2243
2244 cycles = trifilter(elapsed);
2245 pr_info("%s: context switch latency %d cycles, %lluns\n",
2246 ce->engine->name, cycles >> TF_BIAS,
2247 cycles_to_ns(ce->engine, cycles));
2248
2249 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2250
2251err_fence:
2252 i915_request_put(fence);
2253 semaphore_set(sema, 1);
2254err:
2255 intel_gt_set_wedged(ce->engine->gt);
2256 return err;
2257}
2258
2259static int measure_preemption(struct intel_context *ce)
2260{
2261 u32 *sema = hwsp_scratch(ce);
2262 const u32 offset = hwsp_offset(ce, sema);
2263 u32 elapsed[TF_COUNT], cycles;
2264 u32 *cs;
2265 int err;
2266 int i;
2267
2268 /*
2269 * We measure two latencies while triggering preemption. The first
2270 * latency is how long it takes for us to submit a preempting request.
2271	 * The second latency is how long it takes for us to return from the
2272 * preemption back to the original context.
2273 *
2274 * A: read CS_TIMESTAMP from CPU
2275 * submit preemption
2276 * B: read CS_TIMESTAMP on GPU (in preempting context)
2277 * context switch
2278 * C: read CS_TIMESTAMP on GPU (in original context)
2279 *
2280 * Preemption dispatch latency: B - A
2281 * Preemption switch latency: C - B
2282 */
2283
2284 if (!intel_engine_has_preemption(ce->engine))
2285 return 0;
2286
2287 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2288 u32 addr = offset + 2 * i * sizeof(u32);
2289 struct i915_request *rq;
2290
2291 rq = i915_request_create(ce);
2292 if (IS_ERR(rq)) {
2293 err = PTR_ERR(rq);
2294 goto err;
2295 }
2296
2297 cs = intel_ring_begin(rq, 12);
2298 if (IS_ERR(cs)) {
2299 i915_request_add(rq);
2300 err = PTR_ERR(cs);
2301 goto err;
2302 }
2303
2304 cs = emit_store_dw(cs, addr, -1);
2305 cs = emit_semaphore_poll_until(cs, offset, i);
2306 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2307
2308 intel_ring_advance(rq, cs);
2309 i915_request_add(rq);
2310
2311 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2312 err = -EIO;
2313 goto err;
2314 }
2315
2316 rq = i915_request_create(ce->engine->kernel_context);
2317 if (IS_ERR(rq)) {
2318 err = PTR_ERR(rq);
2319 goto err;
2320 }
2321
2322 cs = intel_ring_begin(rq, 8);
2323 if (IS_ERR(cs)) {
2324 i915_request_add(rq);
2325 err = PTR_ERR(cs);
2326 goto err;
2327 }
2328
2329 cs = emit_timestamp_store(cs, ce, addr);
2330 cs = emit_store_dw(cs, offset, i);
2331
2332 intel_ring_advance(rq, cs);
2333 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2334
2335 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2336 i915_request_add(rq);
2337 }
2338
2339 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2340 err = -EIO;
2341 goto err;
2342 }
2343
2344 for (i = 1; i <= TF_COUNT; i++)
2345 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2346
2347 cycles = trifilter(elapsed);
2348 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2349 ce->engine->name, cycles >> TF_BIAS,
2350 cycles_to_ns(ce->engine, cycles));
2351
2352 for (i = 1; i <= TF_COUNT; i++)
2353 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2354
2355 cycles = trifilter(elapsed);
2356 pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2357 ce->engine->name, cycles >> TF_BIAS,
2358 cycles_to_ns(ce->engine, cycles));
2359
2360 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2361
2362err:
2363 intel_gt_set_wedged(ce->engine->gt);
2364 return err;
2365}
2366
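/*
 * Completion callback used by measure_completion(): records (with a full
 * memory barrier) that the request's fence signal has been seen on the CPU.
 */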
2367struct signal_cb {
2368 struct dma_fence_cb base;
2369 bool seen;
2370};
2371
2372static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2373{
2374 struct signal_cb *s = container_of(cb, typeof(*s), base);
2375
2376 smp_store_mb(s->seen, true); /* be safe, be strong */
2377}
2378
2379static int measure_completion(struct intel_context *ce)
2380{
2381 u32 *sema = hwsp_scratch(ce);
2382 const u32 offset = hwsp_offset(ce, sema);
2383 u32 elapsed[TF_COUNT], cycles;
2384 u32 *cs;
2385 int err;
2386 int i;
2387
2388 /*
2389 * Measure how long it takes for the signal (interrupt) to be
2390	 * sent from the GPU and processed by the CPU.
2391 *
2392 * A: read CS_TIMESTAMP on GPU
2393 * signal
2394 * B: read CS_TIMESTAMP from CPU
2395 *
2396 * Completion latency: B - A
2397 */
2398
2399 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2400 struct signal_cb cb = { .seen = false };
2401 struct i915_request *rq;
2402
2403 rq = i915_request_create(ce);
2404 if (IS_ERR(rq)) {
2405 err = PTR_ERR(rq);
2406 goto err;
2407 }
2408
2409 cs = intel_ring_begin(rq, 12);
2410 if (IS_ERR(cs)) {
2411 i915_request_add(rq);
2412 err = PTR_ERR(cs);
2413 goto err;
2414 }
2415
2416 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2417 cs = emit_semaphore_poll_until(cs, offset, i);
2418 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2419
2420 intel_ring_advance(rq, cs);
2421
2422 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2423 i915_request_add(rq);
2424
2425 intel_engine_flush_submission(ce->engine);
2426 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2427 err = -EIO;
2428 goto err;
2429 }
2430
2431 preempt_disable();
2432 semaphore_set(sema, i);
2433 while (!READ_ONCE(cb.seen))
2434 cpu_relax();
2435
2436 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2437 preempt_enable();
2438 }
2439
2440 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2441 if (err)
2442 goto err;
2443
2444 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2445 GEM_BUG_ON(sema[i + 1] == -1);
2446 elapsed[i] = elapsed[i] - sema[i + 1];
2447 }
2448
2449 cycles = trifilter(elapsed);
2450 pr_info("%s: completion latency %d cycles, %lluns\n",
2451 ce->engine->name, cycles >> TF_BIAS,
2452 cycles_to_ns(ce->engine, cycles));
2453
2454 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2455
2456err:
2457 intel_gt_set_wedged(ce->engine->gt);
2458 return err;
2459}
2460
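/*
 * Keep the GPU awake and pinned at its maximum frequency for the duration
 * of the measurements, so the CS_TIMESTAMP deltas are not distorted by
 * frequency changes.
 */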
2461static void rps_pin(struct intel_gt *gt)
2462{
2463 /* Pin the frequency to max */
2464	atomic_inc(&gt->rps.num_waiters);
2465 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2466
2467	mutex_lock(&gt->rps.lock);
2468	intel_rps_set(&gt->rps, gt->rps.max_freq);
2469	mutex_unlock(&gt->rps.lock);
2470}
2471
2472static void rps_unpin(struct intel_gt *gt)
2473{
2474 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2475	atomic_dec(&gt->rps.num_waiters);
2476}
2477
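/*
 * Run the individual latency probes (semaphore response, idle/busy dispatch,
 * inter-request, context switch, preemption, completion) on each uabi
 * engine, with cstates disabled, the heartbeat parked and the GPU frequency
 * pinned to keep the measurements stable.
 */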
2478static int perf_request_latency(void *arg)
2479{
2480 struct drm_i915_private *i915 = arg;
2481 struct intel_engine_cs *engine;
2482 struct pm_qos_request qos;
2483 int err = 0;
2484
2485 if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2486 return 0;
2487
2488 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2489
2490 for_each_uabi_engine(engine, i915) {
2491 struct intel_context *ce;
2492
2493 ce = intel_context_create(engine);
2494 if (IS_ERR(ce)) {
2495 err = PTR_ERR(ce);
2496 goto out;
2497 }
2498
2499 err = intel_context_pin(ce);
2500 if (err) {
2501 intel_context_put(ce);
2502 goto out;
2503 }
2504
2505 st_engine_heartbeat_disable(engine);
2506 rps_pin(engine->gt);
2507
2508 if (err == 0)
2509 err = measure_semaphore_response(ce);
2510 if (err == 0)
2511 err = measure_idle_dispatch(ce);
2512 if (err == 0)
2513 err = measure_busy_dispatch(ce);
2514 if (err == 0)
2515 err = measure_inter_request(ce);
2516 if (err == 0)
2517 err = measure_context_switch(ce);
2518 if (err == 0)
2519 err = measure_preemption(ce);
2520 if (err == 0)
2521 err = measure_completion(ce);
2522
2523 rps_unpin(engine->gt);
2524 st_engine_heartbeat_enable(engine);
2525
2526 intel_context_unpin(ce);
2527 intel_context_put(ce);
2528 if (err)
2529 goto out;
2530 }
2531
2532out:
2533 if (igt_flush_test(i915))
2534 err = -EIO;
2535
2536 cpu_latency_qos_remove_request(&qos);
2537 return err;
2538}
2539
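/*
 * Single submitter, fully synchronous: create a request on one engine,
 * wait for it to complete, then move on to the next engine in turn.
 */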
2540static int s_sync0(void *arg)
2541{
2542 struct perf_series *ps = arg;
2543 IGT_TIMEOUT(end_time);
2544 unsigned int idx = 0;
2545 int err = 0;
2546
2547 GEM_BUG_ON(!ps->nengines);
2548 do {
2549 struct i915_request *rq;
2550
2551 rq = i915_request_create(ps->ce[idx]);
2552 if (IS_ERR(rq)) {
2553 err = PTR_ERR(rq);
2554 break;
2555 }
2556
2557 i915_request_get(rq);
2558 i915_request_add(rq);
2559
2560 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2561 err = -ETIME;
2562 i915_request_put(rq);
2563 if (err)
2564 break;
2565
2566 if (++idx == ps->nengines)
2567 idx = 0;
2568 } while (!__igt_timeout(end_time, NULL));
2569
2570 return err;
2571}
2572
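/*
 * Single submitter, one request in flight: wait on the previously
 * submitted request while the next one is already queued on the
 * following engine.
 */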
2573static int s_sync1(void *arg)
2574{
2575 struct perf_series *ps = arg;
2576 struct i915_request *prev = NULL;
2577 IGT_TIMEOUT(end_time);
2578 unsigned int idx = 0;
2579 int err = 0;
2580
2581 GEM_BUG_ON(!ps->nengines);
2582 do {
2583 struct i915_request *rq;
2584
2585 rq = i915_request_create(ps->ce[idx]);
2586 if (IS_ERR(rq)) {
2587 err = PTR_ERR(rq);
2588 break;
2589 }
2590
2591 i915_request_get(rq);
2592 i915_request_add(rq);
2593
2594 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2595 err = -ETIME;
2596 i915_request_put(prev);
2597 prev = rq;
2598 if (err)
2599 break;
2600
2601 if (++idx == ps->nengines)
2602 idx = 0;
2603 } while (!__igt_timeout(end_time, NULL));
2604 i915_request_put(prev);
2605
2606 return err;
2607}
2608
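/*
 * Single submitter, no waiting: queue requests round-robin across the
 * engines as fast as we can until the timeout expires.
 */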
2609static int s_many(void *arg)
2610{
2611 struct perf_series *ps = arg;
2612 IGT_TIMEOUT(end_time);
2613 unsigned int idx = 0;
2614
2615 GEM_BUG_ON(!ps->nengines);
2616 do {
2617 struct i915_request *rq;
2618
2619 rq = i915_request_create(ps->ce[idx]);
2620 if (IS_ERR(rq))
2621 return PTR_ERR(rq);
2622
2623 i915_request_add(rq);
2624
2625 if (++idx == ps->nengines)
2626 idx = 0;
2627 } while (!__igt_timeout(end_time, NULL));
2628
2629 return 0;
2630}
2631
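/*
 * Drive all engines in series from one thread using each of the
 * submission patterns above (s_sync0, s_sync1, s_many), then report
 * per-engine busyness, context runtime and walltime.
 */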
2632static int perf_series_engines(void *arg)
2633{
2634 struct drm_i915_private *i915 = arg;
2635 static int (* const func[])(void *arg) = {
2636 s_sync0,
2637 s_sync1,
2638 s_many,
2639 NULL,
2640 };
2641 const unsigned int nengines = num_uabi_engines(i915);
2642 struct intel_engine_cs *engine;
2643 int (* const *fn)(void *arg);
2644 struct pm_qos_request qos;
2645 struct perf_stats *stats;
2646 struct perf_series *ps;
2647 unsigned int idx;
2648 int err = 0;
2649
2650 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2651 if (!stats)
2652 return -ENOMEM;
2653
2654 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2655 if (!ps) {
2656 kfree(stats);
2657 return -ENOMEM;
2658 }
2659
2660 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2661
2662 ps->i915 = i915;
2663 ps->nengines = nengines;
2664
2665 idx = 0;
2666 for_each_uabi_engine(engine, i915) {
2667 struct intel_context *ce;
2668
2669 ce = intel_context_create(engine);
2670 if (IS_ERR(ce)) {
2671 err = PTR_ERR(ce);
2672 goto out;
2673 }
2674
2675 err = intel_context_pin(ce);
2676 if (err) {
2677 intel_context_put(ce);
2678 goto out;
2679 }
2680
2681 ps->ce[idx++] = ce;
2682 }
2683 GEM_BUG_ON(idx != ps->nengines);
2684
2685 for (fn = func; *fn && !err; fn++) {
2686 char name[KSYM_NAME_LEN];
2687 struct igt_live_test t;
2688
2689 snprintf(name, sizeof(name), "%ps", *fn);
2690 err = igt_live_test_begin(&t, i915, __func__, name);
2691 if (err)
2692 break;
2693
2694 for (idx = 0; idx < nengines; idx++) {
2695 struct perf_stats *p =
2696 memset(&stats[idx], 0, sizeof(stats[idx]));
2697 struct intel_context *ce = ps->ce[idx];
2698
2699 p->engine = ps->ce[idx]->engine;
2700 intel_engine_pm_get(p->engine);
2701
2702 if (intel_engine_supports_stats(p->engine))
2703 p->busy = intel_engine_get_busy_time(p->engine,
2704 &p->time) + 1;
2705 else
2706 p->time = ktime_get();
2707 p->runtime = -intel_context_get_total_runtime_ns(ce);
2708 }
2709
2710 err = (*fn)(ps);
2711 if (igt_live_test_end(&t))
2712 err = -EIO;
2713
2714 for (idx = 0; idx < nengines; idx++) {
2715 struct perf_stats *p = &stats[idx];
2716 struct intel_context *ce = ps->ce[idx];
2717 int integer, decimal;
2718 u64 busy, dt, now;
2719
2720 if (p->busy)
2721 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2722 &now),
2723 p->busy - 1);
2724 else
2725 now = ktime_get();
2726 p->time = ktime_sub(now, p->time);
2727
2728 err = switch_to_kernel_sync(ce, err);
2729 p->runtime += intel_context_get_total_runtime_ns(ce);
2730 intel_engine_pm_put(p->engine);
2731
2732 busy = 100 * ktime_to_ns(p->busy);
2733 dt = ktime_to_ns(p->time);
2734 if (dt) {
2735 integer = div64_u64(busy, dt);
2736 busy -= integer * dt;
2737 decimal = div64_u64(100 * busy, dt);
2738 } else {
2739 integer = 0;
2740 decimal = 0;
2741 }
2742
2743 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2744 name, p->engine->name, ce->timeline->seqno,
2745 integer, decimal,
2746 div_u64(p->runtime, 1000 * 1000),
2747 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2748 }
2749 }
2750
2751out:
2752 for (idx = 0; idx < nengines; idx++) {
2753 if (IS_ERR_OR_NULL(ps->ce[idx]))
2754 break;
2755
2756 intel_context_unpin(ps->ce[idx]);
2757 intel_context_put(ps->ce[idx]);
2758 }
2759 kfree(ps);
2760
2761 cpu_latency_qos_remove_request(&qos);
2762 kfree(stats);
2763 return err;
2764}
2765
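/*
 * Per-engine worker, fully synchronous: submit a request and wait for
 * it to complete before submitting the next, counting how many we
 * finish before the timeout.
 */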
2766static int p_sync0(void *arg)
2767{
2768 struct perf_stats *p = arg;
2769 struct intel_engine_cs *engine = p->engine;
2770 struct intel_context *ce;
2771 IGT_TIMEOUT(end_time);
2772 unsigned long count;
2773 bool busy;
2774 int err = 0;
2775
2776 ce = intel_context_create(engine);
2777 if (IS_ERR(ce))
2778 return PTR_ERR(ce);
2779
2780 err = intel_context_pin(ce);
2781 if (err) {
2782 intel_context_put(ce);
2783 return err;
2784 }
2785
2786 if (intel_engine_supports_stats(engine)) {
2787 p->busy = intel_engine_get_busy_time(engine, &p->time);
2788 busy = true;
2789 } else {
2790 p->time = ktime_get();
2791 busy = false;
2792 }
2793
2794 count = 0;
2795 do {
2796 struct i915_request *rq;
2797
2798 rq = i915_request_create(ce);
2799 if (IS_ERR(rq)) {
2800 err = PTR_ERR(rq);
2801 break;
2802 }
2803
2804 i915_request_get(rq);
2805 i915_request_add(rq);
2806
2807 err = 0;
2808 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2809 err = -ETIME;
2810 i915_request_put(rq);
2811 if (err)
2812 break;
2813
2814 count++;
2815 } while (!__igt_timeout(end_time, NULL));
2816
2817 if (busy) {
2818 ktime_t now;
2819
2820 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2821 p->busy);
2822 p->time = ktime_sub(now, p->time);
2823 } else {
2824 p->time = ktime_sub(ktime_get(), p->time);
2825 }
2826
2827 err = switch_to_kernel_sync(ce, err);
2828 p->runtime = intel_context_get_total_runtime_ns(ce);
2829 p->count = count;
2830
2831 intel_context_unpin(ce);
2832 intel_context_put(ce);
2833 return err;
2834}
2835
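/*
 * Per-engine worker with one request in flight: always wait on the
 * previous request while the next is already submitted.
 */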
2836static int p_sync1(void *arg)
2837{
2838 struct perf_stats *p = arg;
2839 struct intel_engine_cs *engine = p->engine;
2840 struct i915_request *prev = NULL;
2841 struct intel_context *ce;
2842 IGT_TIMEOUT(end_time);
2843 unsigned long count;
2844 bool busy;
2845 int err = 0;
2846
2847 ce = intel_context_create(engine);
2848 if (IS_ERR(ce))
2849 return PTR_ERR(ce);
2850
2851 err = intel_context_pin(ce);
2852 if (err) {
2853 intel_context_put(ce);
2854 return err;
2855 }
2856
2857 if (intel_engine_supports_stats(engine)) {
2858 p->busy = intel_engine_get_busy_time(engine, &p->time);
2859 busy = true;
2860 } else {
2861 p->time = ktime_get();
2862 busy = false;
2863 }
2864
2865 count = 0;
2866 do {
2867 struct i915_request *rq;
2868
2869 rq = i915_request_create(ce);
2870 if (IS_ERR(rq)) {
2871 err = PTR_ERR(rq);
2872 break;
2873 }
2874
2875 i915_request_get(rq);
2876 i915_request_add(rq);
2877
2878 err = 0;
2879 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2880 err = -ETIME;
2881 i915_request_put(prev);
2882 prev = rq;
2883 if (err)
2884 break;
2885
2886 count++;
2887 } while (!__igt_timeout(end_time, NULL));
2888 i915_request_put(prev);
2889
2890 if (busy) {
2891 ktime_t now;
2892
2893 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2894 p->busy);
2895 p->time = ktime_sub(now, p->time);
2896 } else {
2897 p->time = ktime_sub(ktime_get(), p->time);
2898 }
2899
2900 err = switch_to_kernel_sync(ce, err);
2901 p->runtime = intel_context_get_total_runtime_ns(ce);
2902 p->count = count;
2903
2904 intel_context_unpin(ce);
2905 intel_context_put(ce);
2906 return err;
2907}
2908
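/*
 * Per-engine worker, no waiting: submit as many requests as possible
 * before the timeout expires.
 */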
2909static int p_many(void *arg)
2910{
2911 struct perf_stats *p = arg;
2912 struct intel_engine_cs *engine = p->engine;
2913 struct intel_context *ce;
2914 IGT_TIMEOUT(end_time);
2915 unsigned long count;
2916 int err = 0;
2917 bool busy;
2918
2919 ce = intel_context_create(engine);
2920 if (IS_ERR(ce))
2921 return PTR_ERR(ce);
2922
2923 err = intel_context_pin(ce);
2924 if (err) {
2925 intel_context_put(ce);
2926 return err;
2927 }
2928
2929 if (intel_engine_supports_stats(engine)) {
2930 p->busy = intel_engine_get_busy_time(engine, &p->time);
2931 busy = true;
2932 } else {
2933 p->time = ktime_get();
2934 busy = false;
2935 }
2936
2937 count = 0;
2938 do {
2939 struct i915_request *rq;
2940
2941 rq = i915_request_create(ce);
2942 if (IS_ERR(rq)) {
2943 err = PTR_ERR(rq);
2944 break;
2945 }
2946
2947 i915_request_add(rq);
2948 count++;
2949 } while (!__igt_timeout(end_time, NULL));
2950
2951 if (busy) {
2952 ktime_t now;
2953
2954 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2955 p->busy);
2956 p->time = ktime_sub(now, p->time);
2957 } else {
2958 p->time = ktime_sub(ktime_get(), p->time);
2959 }
2960
2961 err = switch_to_kernel_sync(ce, err);
2962 p->runtime = intel_context_get_total_runtime_ns(ce);
2963 p->count = count;
2964
2965 intel_context_unpin(ce);
2966 intel_context_put(ce);
2967 return err;
2968}
2969
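/*
 * Run each of the workers above on all engines at the same time, one
 * kthread per engine, and report the request rate and busyness achieved
 * under parallel load.
 */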
2970static int perf_parallel_engines(void *arg)
2971{
2972 struct drm_i915_private *i915 = arg;
2973 static int (* const func[])(void *arg) = {
2974 p_sync0,
2975 p_sync1,
2976 p_many,
2977 NULL,
2978 };
2979 const unsigned int nengines = num_uabi_engines(i915);
2980 struct intel_engine_cs *engine;
2981 int (* const *fn)(void *arg);
2982 struct pm_qos_request qos;
2983 struct {
2984 struct perf_stats p;
2985 struct task_struct *tsk;
2986 } *engines;
2987 int err = 0;
2988
2989 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2990 if (!engines)
2991 return -ENOMEM;
2992
2993 cpu_latency_qos_add_request(&qos, 0);
2994
2995 for (fn = func; *fn; fn++) {
2996 char name[KSYM_NAME_LEN];
2997 struct igt_live_test t;
2998 unsigned int idx;
2999
3000 snprintf(name, sizeof(name), "%ps", *fn);
3001 err = igt_live_test_begin(&t, i915, __func__, name);
3002 if (err)
3003 break;
3004
3005 atomic_set(&i915->selftest.counter, nengines);
3006
3007 idx = 0;
3008 for_each_uabi_engine(engine, i915) {
3009 intel_engine_pm_get(engine);
3010
3011 memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3012 engines[idx].p.engine = engine;
3013
3014 engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
3015 "igt:%s", engine->name);
3016 if (IS_ERR(engines[idx].tsk)) {
3017 err = PTR_ERR(engines[idx].tsk);
3018 intel_engine_pm_put(engine);
3019 break;
3020 }
3021 get_task_struct(engines[idx++].tsk);
3022 }
3023
3024 yield(); /* start all threads before we kthread_stop() */
3025
3026 idx = 0;
3027 for_each_uabi_engine(engine, i915) {
3028 int status;
3029
3030 if (IS_ERR(engines[idx].tsk))
3031 break;
3032
3033 status = kthread_stop(engines[idx].tsk);
3034 if (status && !err)
3035 err = status;
3036
3037 intel_engine_pm_put(engine);
3038 put_task_struct(engines[idx++].tsk);
3039 }
3040
3041 if (igt_live_test_end(&t))
3042 err = -EIO;
3043 if (err)
3044 break;
3045
3046 idx = 0;
3047 for_each_uabi_engine(engine, i915) {
3048 struct perf_stats *p = &engines[idx].p;
3049 u64 busy = 100 * ktime_to_ns(p->busy);
3050 u64 dt = ktime_to_ns(p->time);
3051 int integer, decimal;
3052
3053 if (dt) {
3054 integer = div64_u64(busy, dt);
3055 busy -= integer * dt;
3056 decimal = div64_u64(100 * busy, dt);
3057 } else {
3058 integer = 0;
3059 decimal = 0;
3060 }
3061
3062 GEM_BUG_ON(engine != p->engine);
3063 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3064 name, engine->name, p->count, integer, decimal,
3065 div_u64(p->runtime, 1000 * 1000),
3066 div_u64(ktime_to_ns(p->time), 1000 * 1000));
3067 idx++;
3068 }
3069 }
3070
3071 cpu_latency_qos_remove_request(&qos);
3072 kfree(engines);
3073 return err;
3074}
3075
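/*
 * Entry point for the request performance selftests; skipped if the GT
 * is already wedged.
 */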
3076int i915_request_perf_selftests(struct drm_i915_private *i915)
3077{
3078 static const struct i915_subtest tests[] = {
3079 SUBTEST(perf_request_latency),
3080 SUBTEST(perf_series_engines),
3081 SUBTEST(perf_parallel_engines),
3082 };
3083
3084 if (intel_gt_is_wedged(&i915->gt))
3085 return 0;
3086
3087 return i915_subtests(tests, i915);
3088}