1/*
2 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25#include <linux/prime_numbers.h>
26#include <linux/pm_qos.h>
27#include <linux/sort.h>
28
29#include "gem/i915_gem_pm.h"
30#include "gem/selftests/mock_context.h"
31
32#include "gt/intel_engine_heartbeat.h"
33#include "gt/intel_engine_pm.h"
34#include "gt/intel_engine_user.h"
35#include "gt/intel_gt.h"
36#include "gt/intel_gt_requests.h"
37#include "gt/selftest_engine_heartbeat.h"
38
39#include "i915_random.h"
40#include "i915_selftest.h"
41#include "igt_flush_test.h"
42#include "igt_live_test.h"
43#include "igt_spinner.h"
44#include "lib_sw_fence.h"
45
46#include "mock_drm.h"
47#include "mock_gem_device.h"
48
49static unsigned int num_uabi_engines(struct drm_i915_private *i915)
50{
51 struct intel_engine_cs *engine;
52 unsigned int count;
53
54 count = 0;
55 for_each_uabi_engine(engine, i915)
56 count++;
57
58 return count;
59}
60
61static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
62{
63 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
64}
65
66static int igt_add_request(void *arg)
67{
68 struct drm_i915_private *i915 = arg;
69 struct i915_request *request;
70
71 /* Basic preliminary test to create a request and let it loose! */
72
73 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
74 if (!request)
75 return -ENOMEM;
76
77 i915_request_add(request);
78
79 return 0;
80}
81
82static int igt_wait_request(void *arg)
83{
84 const long T = HZ / 4;
85 struct drm_i915_private *i915 = arg;
86 struct i915_request *request;
87 int err = -EINVAL;
88
89 /* Submit a request, then wait upon it */
90
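	/*
	 * mock_request()'s second argument is the simulated execution delay
	 * in jiffies, so this request should not complete until roughly T
	 * after being submitted.
	 */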
91 request = mock_request(rcs0(i915)->kernel_context, T);
92 if (!request)
93 return -ENOMEM;
94
95 i915_request_get(request);
96
97 if (i915_request_wait(request, 0, 0) != -ETIME) {
98 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
99 goto out_request;
100 }
101
102 if (i915_request_wait(request, 0, T) != -ETIME) {
103 pr_err("request wait succeeded (expected timeout before submit!)\n");
104 goto out_request;
105 }
106
107 if (i915_request_completed(request)) {
108 pr_err("request completed before submit!!\n");
109 goto out_request;
110 }
111
112 i915_request_add(request);
113
114 if (i915_request_wait(request, 0, 0) != -ETIME) {
115 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
116 goto out_request;
117 }
118
119 if (i915_request_completed(request)) {
120 pr_err("request completed immediately!\n");
121 goto out_request;
122 }
123
124 if (i915_request_wait(request, 0, T / 2) != -ETIME) {
125 pr_err("request wait succeeded (expected timeout!)\n");
126 goto out_request;
127 }
128
129 if (i915_request_wait(request, 0, T) == -ETIME) {
130 pr_err("request wait timed out!\n");
131 goto out_request;
132 }
133
134 if (!i915_request_completed(request)) {
135 pr_err("request not complete after waiting!\n");
136 goto out_request;
137 }
138
139 if (i915_request_wait(request, 0, T) == -ETIME) {
140 pr_err("request wait timed out when already complete!\n");
141 goto out_request;
142 }
143
144 err = 0;
145out_request:
146 i915_request_put(request);
147 mock_device_flush(i915);
148 return err;
149}
150
151static int igt_fence_wait(void *arg)
152{
153 const long T = HZ / 4;
154 struct drm_i915_private *i915 = arg;
155 struct i915_request *request;
156 int err = -EINVAL;
157
158 /* Submit a request, treat it as a fence and wait upon it */
159
160 request = mock_request(rcs0(i915)->kernel_context, T);
161 if (!request)
162 return -ENOMEM;
163
164 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
165 pr_err("fence wait success before submit (expected timeout)!\n");
166 goto out;
167 }
168
169 i915_request_add(request);
170
171 if (dma_fence_is_signaled(&request->fence)) {
172 pr_err("fence signaled immediately!\n");
173 goto out;
174 }
175
176 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
177 pr_err("fence wait success after submit (expected timeout)!\n");
178 goto out;
179 }
180
181 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
182 pr_err("fence wait timed out (expected success)!\n");
183 goto out;
184 }
185
186 if (!dma_fence_is_signaled(&request->fence)) {
187 pr_err("fence unsignaled after waiting!\n");
188 goto out;
189 }
190
191 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
192 pr_err("fence wait timed out when complete (expected success)!\n");
193 goto out;
194 }
195
196 err = 0;
197out:
198 mock_device_flush(i915);
199 return err;
200}
201
202static int igt_request_rewind(void *arg)
203{
204 struct drm_i915_private *i915 = arg;
205 struct i915_request *request, *vip;
206 struct i915_gem_context *ctx[2];
207 struct intel_context *ce;
208 int err = -EINVAL;
209
210 ctx[0] = mock_context(i915, "A");
211
212 ce = i915_gem_context_get_engine(ctx[0], RCS0);
213 GEM_BUG_ON(IS_ERR(ce));
214 request = mock_request(ce, 2 * HZ);
215 intel_context_put(ce);
216 if (!request) {
217 err = -ENOMEM;
218 goto err_context_0;
219 }
220
221 i915_request_get(request);
222 i915_request_add(request);
223
224 ctx[1] = mock_context(i915, "B");
225
226 ce = i915_gem_context_get_engine(ctx[1], RCS0);
227 GEM_BUG_ON(IS_ERR(ce));
228 vip = mock_request(ce, 0);
229 intel_context_put(ce);
230 if (!vip) {
231 err = -ENOMEM;
232 goto err_context_1;
233 }
234
235 /* Simulate preemption by manual reordering */
236 if (!mock_cancel_request(request)) {
237 pr_err("failed to cancel request (already executed)!\n");
238 i915_request_add(vip);
239 goto err_context_1;
240 }
241 i915_request_get(vip);
242 i915_request_add(vip);
243 rcu_read_lock();
244 request->engine->submit_request(request);
245 rcu_read_unlock();
246
247
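	/*
	 * The cancelled request has been resubmitted behind vip, so vip
	 * should now complete first even though it was created second.
	 */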
248 if (i915_request_wait(vip, 0, HZ) == -ETIME) {
249 pr_err("timed out waiting for high priority request\n");
250 goto err;
251 }
252
253 if (i915_request_completed(request)) {
254 pr_err("low priority request already completed\n");
255 goto err;
256 }
257
258 err = 0;
259err:
260 i915_request_put(vip);
261err_context_1:
262 mock_context_close(ctx[1]);
263 i915_request_put(request);
264err_context_0:
265 mock_context_close(ctx[0]);
266 mock_device_flush(i915);
267 return err;
268}
269
270struct smoketest {
271 struct intel_engine_cs *engine;
272 struct i915_gem_context **contexts;
273 atomic_long_t num_waits, num_fences;
274 int ncontexts, max_batch;
275 struct i915_request *(*request_alloc)(struct intel_context *ce);
276};
277
278static struct i915_request *
279__mock_request_alloc(struct intel_context *ce)
280{
281 return mock_request(ce, 0);
282}
283
284static struct i915_request *
285__live_request_alloc(struct intel_context *ce)
286{
287 return intel_context_create_request(ce);
288}
289
290static int __igt_breadcrumbs_smoketest(void *arg)
291{
292 struct smoketest *t = arg;
293 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
294 const unsigned int total = 4 * t->ncontexts + 1;
295 unsigned int num_waits = 0, num_fences = 0;
296 struct i915_request **requests;
297 I915_RND_STATE(prng);
298 unsigned int *order;
299 int err = 0;
300
301 /*
302 * A very simple test to catch the most egregious of list handling bugs.
303 *
304 * At its heart, we simply create oodles of requests running across
305 * multiple kthreads and enable signaling on them, for the sole purpose
306 * of stressing our breadcrumb handling. The only inspection we do is
307 * that the fences were marked as signaled.
308 */
309
310 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
311 if (!requests)
312 return -ENOMEM;
313
314 order = i915_random_order(total, &prng);
315 if (!order) {
316 err = -ENOMEM;
317 goto out_requests;
318 }
319
320 while (!kthread_should_stop()) {
321 struct i915_sw_fence *submit, *wait;
322 unsigned int n, count;
323
324 submit = heap_fence_create(GFP_KERNEL);
325 if (!submit) {
326 err = -ENOMEM;
327 break;
328 }
329
330 wait = heap_fence_create(GFP_KERNEL);
331 if (!wait) {
332 i915_sw_fence_commit(submit);
333 heap_fence_put(submit);
 334 err = -ENOMEM;
335 break;
336 }
337
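		/*
		 * 'submit' holds back every request in this batch until all
		 * of them have been queued; 'wait' only signals once every
		 * request in the batch has completed.
		 */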
338 i915_random_reorder(order, total, &prng);
339 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
340
341 for (n = 0; n < count; n++) {
342 struct i915_gem_context *ctx =
343 t->contexts[order[n] % t->ncontexts];
344 struct i915_request *rq;
345 struct intel_context *ce;
346
347 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
348 GEM_BUG_ON(IS_ERR(ce));
349 rq = t->request_alloc(ce);
350 intel_context_put(ce);
351 if (IS_ERR(rq)) {
352 err = PTR_ERR(rq);
353 count = n;
354 break;
355 }
356
357 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
358 submit,
359 GFP_KERNEL);
360
361 requests[n] = i915_request_get(rq);
362 i915_request_add(rq);
363
364 if (err >= 0)
365 err = i915_sw_fence_await_dma_fence(wait,
366 &rq->fence,
367 0,
368 GFP_KERNEL);
369
370 if (err < 0) {
371 i915_request_put(rq);
372 count = n;
373 break;
374 }
375 }
376
377 i915_sw_fence_commit(submit);
378 i915_sw_fence_commit(wait);
379
380 if (!wait_event_timeout(wait->wait,
381 i915_sw_fence_done(wait),
382 5 * HZ)) {
383 struct i915_request *rq = requests[count - 1];
384
385 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
386 atomic_read(&wait->pending), count,
387 rq->fence.context, rq->fence.seqno,
388 t->engine->name);
389 GEM_TRACE_DUMP();
390
391 intel_gt_set_wedged(t->engine->gt);
392 GEM_BUG_ON(!i915_request_completed(rq));
393 i915_sw_fence_wait(wait);
394 err = -EIO;
395 }
396
397 for (n = 0; n < count; n++) {
398 struct i915_request *rq = requests[n];
399
400 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
401 &rq->fence.flags)) {
402 pr_err("%llu:%llu was not signaled!\n",
403 rq->fence.context, rq->fence.seqno);
404 err = -EINVAL;
405 }
406
407 i915_request_put(rq);
408 }
409
410 heap_fence_put(wait);
411 heap_fence_put(submit);
412
413 if (err < 0)
414 break;
415
416 num_fences += count;
417 num_waits++;
418
419 cond_resched();
420 }
421
422 atomic_long_add(num_fences, &t->num_fences);
423 atomic_long_add(num_waits, &t->num_waits);
424
425 kfree(order);
426out_requests:
427 kfree(requests);
428 return err;
429}
430
431static int mock_breadcrumbs_smoketest(void *arg)
432{
433 struct drm_i915_private *i915 = arg;
434 struct smoketest t = {
435 .engine = rcs0(i915),
436 .ncontexts = 1024,
437 .max_batch = 1024,
438 .request_alloc = __mock_request_alloc
439 };
440 unsigned int ncpus = num_online_cpus();
441 struct task_struct **threads;
442 unsigned int n;
443 int ret = 0;
444
445 /*
446 * Smoketest our breadcrumb/signal handling for requests across multiple
447 * threads. A very simple test to only catch the most egregious of bugs.
448 * See __igt_breadcrumbs_smoketest();
449 */
450
451 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
452 if (!threads)
453 return -ENOMEM;
454
455 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
456 if (!t.contexts) {
457 ret = -ENOMEM;
458 goto out_threads;
459 }
460
461 for (n = 0; n < t.ncontexts; n++) {
462 t.contexts[n] = mock_context(t.engine->i915, "mock");
463 if (!t.contexts[n]) {
464 ret = -ENOMEM;
465 goto out_contexts;
466 }
467 }
468
469 for (n = 0; n < ncpus; n++) {
470 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
471 &t, "igt/%d", n);
472 if (IS_ERR(threads[n])) {
473 ret = PTR_ERR(threads[n]);
474 ncpus = n;
475 break;
476 }
477
478 get_task_struct(threads[n]);
479 }
480
481 yield(); /* start all threads before we begin */
482 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
483
484 for (n = 0; n < ncpus; n++) {
485 int err;
486
487 err = kthread_stop(threads[n]);
488 if (err < 0 && !ret)
489 ret = err;
490
491 put_task_struct(threads[n]);
492 }
 493 pr_info("Completed %lu waits for %lu fences across %d cpus\n",
494 atomic_long_read(&t.num_waits),
495 atomic_long_read(&t.num_fences),
496 ncpus);
497
498out_contexts:
499 for (n = 0; n < t.ncontexts; n++) {
500 if (!t.contexts[n])
501 break;
502 mock_context_close(t.contexts[n]);
503 }
504 kfree(t.contexts);
505out_threads:
506 kfree(threads);
507 return ret;
508}
509
510int i915_request_mock_selftests(void)
511{
512 static const struct i915_subtest tests[] = {
513 SUBTEST(igt_add_request),
514 SUBTEST(igt_wait_request),
515 SUBTEST(igt_fence_wait),
516 SUBTEST(igt_request_rewind),
517 SUBTEST(mock_breadcrumbs_smoketest),
518 };
519 struct drm_i915_private *i915;
520 intel_wakeref_t wakeref;
521 int err = 0;
522
523 i915 = mock_gem_device();
524 if (!i915)
525 return -ENOMEM;
526
527 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
528 err = i915_subtests(tests, i915);
529
530 drm_dev_put(&i915->drm);
531
532 return err;
533}
534
535static int live_nop_request(void *arg)
536{
537 struct drm_i915_private *i915 = arg;
538 struct intel_engine_cs *engine;
539 struct igt_live_test t;
540 int err = -ENODEV;
541
542 /*
543 * Submit various sized batches of empty requests, to each engine
544 * (individually), and wait for the batch to complete. We can check
545 * the overhead of submitting requests to the hardware.
546 */
547
548 for_each_uabi_engine(engine, i915) {
549 unsigned long n, prime;
550 IGT_TIMEOUT(end_time);
551 ktime_t times[2] = {};
552
553 err = igt_live_test_begin(&t, i915, __func__, engine->name);
554 if (err)
555 return err;
556
557 intel_engine_pm_get(engine);
558 for_each_prime_number_from(prime, 1, 8192) {
559 struct i915_request *request = NULL;
560
561 times[1] = ktime_get_raw();
562
563 for (n = 0; n < prime; n++) {
564 i915_request_put(request);
565 request = i915_request_create(engine->kernel_context);
566 if (IS_ERR(request))
567 return PTR_ERR(request);
568
569 /*
570 * This space is left intentionally blank.
571 *
572 * We do not actually want to perform any
573 * action with this request, we just want
574 * to measure the latency in allocation
575 * and submission of our breadcrumbs -
576 * ensuring that the bare request is sufficient
577 * for the system to work (i.e. proper HEAD
578 * tracking of the rings, interrupt handling,
579 * etc). It also gives us the lowest bounds
580 * for latency.
581 */
582
583 i915_request_get(request);
584 i915_request_add(request);
585 }
586 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
587 i915_request_put(request);
588
589 times[1] = ktime_sub(ktime_get_raw(), times[1]);
590 if (prime == 1)
591 times[0] = times[1];
592
593 if (__igt_timeout(end_time, NULL))
594 break;
595 }
596 intel_engine_pm_put(engine);
597
598 err = igt_live_test_end(&t);
599 if (err)
600 return err;
601
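		/*
		 * times[0] is the round trip for a single request; times[1]
		 * covers the last (largest) prime-sized batch, so the log
		 * below reports both the single-request latency and the
		 * amortised per-request cost.
		 */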
602 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
603 engine->name,
604 ktime_to_ns(times[0]),
605 prime, div64_u64(ktime_to_ns(times[1]), prime));
606 }
607
608 return err;
609}
610
611static struct i915_vma *empty_batch(struct drm_i915_private *i915)
612{
613 struct drm_i915_gem_object *obj;
614 struct i915_vma *vma;
615 u32 *cmd;
616 int err;
617
618 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
619 if (IS_ERR(obj))
620 return ERR_CAST(obj);
621
622 cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
623 if (IS_ERR(cmd)) {
624 err = PTR_ERR(cmd);
625 goto err;
626 }
627
628 *cmd = MI_BATCH_BUFFER_END;
629
630 __i915_gem_object_flush_map(obj, 0, 64);
631 i915_gem_object_unpin_map(obj);
632
633 intel_gt_chipset_flush(&i915->gt);
634
635 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
636 if (IS_ERR(vma)) {
637 err = PTR_ERR(vma);
638 goto err;
639 }
640
641 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
642 if (err)
643 goto err;
644
 645 /* Force the wait now to avoid including it in the benchmark */
646 err = i915_vma_sync(vma);
647 if (err)
648 goto err_pin;
649
650 return vma;
651
652err_pin:
653 i915_vma_unpin(vma);
654err:
655 i915_gem_object_put(obj);
656 return ERR_PTR(err);
657}
658
659static struct i915_request *
660empty_request(struct intel_engine_cs *engine,
661 struct i915_vma *batch)
662{
663 struct i915_request *request;
664 int err;
665
666 request = i915_request_create(engine->kernel_context);
667 if (IS_ERR(request))
668 return request;
669
670 err = engine->emit_bb_start(request,
671 batch->node.start,
672 batch->node.size,
673 I915_DISPATCH_SECURE);
674 if (err)
675 goto out_request;
676
677 i915_request_get(request);
678out_request:
679 i915_request_add(request);
680 return err ? ERR_PTR(err) : request;
681}
682
683static int live_empty_request(void *arg)
684{
685 struct drm_i915_private *i915 = arg;
686 struct intel_engine_cs *engine;
687 struct igt_live_test t;
688 struct i915_vma *batch;
689 int err = 0;
690
691 /*
692 * Submit various sized batches of empty requests, to each engine
693 * (individually), and wait for the batch to complete. We can check
694 * the overhead of submitting requests to the hardware.
695 */
696
697 batch = empty_batch(i915);
698 if (IS_ERR(batch))
699 return PTR_ERR(batch);
700
701 for_each_uabi_engine(engine, i915) {
702 IGT_TIMEOUT(end_time);
703 struct i915_request *request;
704 unsigned long n, prime;
705 ktime_t times[2] = {};
706
707 err = igt_live_test_begin(&t, i915, __func__, engine->name);
708 if (err)
709 goto out_batch;
710
711 intel_engine_pm_get(engine);
712
713 /* Warmup / preload */
714 request = empty_request(engine, batch);
715 if (IS_ERR(request)) {
716 err = PTR_ERR(request);
717 intel_engine_pm_put(engine);
718 goto out_batch;
719 }
720 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
721
722 for_each_prime_number_from(prime, 1, 8192) {
723 times[1] = ktime_get_raw();
724
725 for (n = 0; n < prime; n++) {
726 i915_request_put(request);
727 request = empty_request(engine, batch);
728 if (IS_ERR(request)) {
729 err = PTR_ERR(request);
730 intel_engine_pm_put(engine);
731 goto out_batch;
732 }
733 }
734 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
735
736 times[1] = ktime_sub(ktime_get_raw(), times[1]);
737 if (prime == 1)
738 times[0] = times[1];
739
740 if (__igt_timeout(end_time, NULL))
741 break;
742 }
743 i915_request_put(request);
744 intel_engine_pm_put(engine);
745
746 err = igt_live_test_end(&t);
747 if (err)
748 goto out_batch;
749
750 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
751 engine->name,
752 ktime_to_ns(times[0]),
753 prime, div64_u64(ktime_to_ns(times[1]), prime));
754 }
755
756out_batch:
757 i915_vma_unpin(batch);
758 i915_vma_put(batch);
759 return err;
760}
761
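/*
 * Build a batch whose first instruction branches back to its own start, so
 * once submitted it spins on the GPU until recursive_batch_resolve() rewrites
 * that first dword to MI_BATCH_BUFFER_END.
 */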
762static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
763{
764 struct drm_i915_gem_object *obj;
765 const int gen = INTEL_GEN(i915);
766 struct i915_vma *vma;
767 u32 *cmd;
768 int err;
769
770 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
771 if (IS_ERR(obj))
772 return ERR_CAST(obj);
773
774 vma = i915_vma_instance(obj, i915->gt.vm, NULL);
775 if (IS_ERR(vma)) {
776 err = PTR_ERR(vma);
777 goto err;
778 }
779
780 err = i915_vma_pin(vma, 0, 0, PIN_USER);
781 if (err)
782 goto err;
783
784 cmd = i915_gem_object_pin_map(obj, I915_MAP_WC);
785 if (IS_ERR(cmd)) {
786 err = PTR_ERR(cmd);
787 goto err;
788 }
789
790 if (gen >= 8) {
791 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
792 *cmd++ = lower_32_bits(vma->node.start);
793 *cmd++ = upper_32_bits(vma->node.start);
794 } else if (gen >= 6) {
795 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
796 *cmd++ = lower_32_bits(vma->node.start);
797 } else {
798 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
799 *cmd++ = lower_32_bits(vma->node.start);
800 }
801 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
802
803 __i915_gem_object_flush_map(obj, 0, 64);
804 i915_gem_object_unpin_map(obj);
805
806 intel_gt_chipset_flush(&i915->gt);
807
808 return vma;
809
810err:
811 i915_gem_object_put(obj);
812 return ERR_PTR(err);
813}
814
815static int recursive_batch_resolve(struct i915_vma *batch)
816{
817 u32 *cmd;
818
819 cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC);
820 if (IS_ERR(cmd))
821 return PTR_ERR(cmd);
822
823 *cmd = MI_BATCH_BUFFER_END;
824
825 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
826 i915_gem_object_unpin_map(batch->obj);
827
828 intel_gt_chipset_flush(batch->vm->gt);
829
830 return 0;
831}
832
833static int live_all_engines(void *arg)
834{
835 struct drm_i915_private *i915 = arg;
836 const unsigned int nengines = num_uabi_engines(i915);
837 struct intel_engine_cs *engine;
838 struct i915_request **request;
839 struct igt_live_test t;
840 struct i915_vma *batch;
841 unsigned int idx;
842 int err;
843
844 /*
845 * Check we can submit requests to all engines simultaneously. We
846 * send a recursive batch to each engine - checking that we don't
847 * block doing so, and that they don't complete too soon.
848 */
849
850 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
851 if (!request)
852 return -ENOMEM;
853
854 err = igt_live_test_begin(&t, i915, __func__, "");
855 if (err)
856 goto out_free;
857
858 batch = recursive_batch(i915);
859 if (IS_ERR(batch)) {
860 err = PTR_ERR(batch);
861 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
862 goto out_free;
863 }
864
865 idx = 0;
866 for_each_uabi_engine(engine, i915) {
867 request[idx] = intel_engine_create_kernel_request(engine);
868 if (IS_ERR(request[idx])) {
869 err = PTR_ERR(request[idx]);
870 pr_err("%s: Request allocation failed with err=%d\n",
871 __func__, err);
872 goto out_request;
873 }
874
875 i915_vma_lock(batch);
876 err = i915_request_await_object(request[idx], batch->obj, 0);
877 if (err == 0)
878 err = i915_vma_move_to_active(batch, request[idx], 0);
879 i915_vma_unlock(batch);
880 GEM_BUG_ON(err);
881
882 err = engine->emit_bb_start(request[idx],
883 batch->node.start,
884 batch->node.size,
885 0);
886 GEM_BUG_ON(err);
887 request[idx]->batch = batch;
888
889 i915_request_get(request[idx]);
890 i915_request_add(request[idx]);
891 idx++;
892 }
893
894 idx = 0;
895 for_each_uabi_engine(engine, i915) {
896 if (i915_request_completed(request[idx])) {
897 pr_err("%s(%s): request completed too early!\n",
898 __func__, engine->name);
899 err = -EINVAL;
900 goto out_request;
901 }
902 idx++;
903 }
904
905 err = recursive_batch_resolve(batch);
906 if (err) {
907 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
908 goto out_request;
909 }
910
911 idx = 0;
912 for_each_uabi_engine(engine, i915) {
913 long timeout;
914
915 timeout = i915_request_wait(request[idx], 0,
916 MAX_SCHEDULE_TIMEOUT);
917 if (timeout < 0) {
918 err = timeout;
919 pr_err("%s: error waiting for request on %s, err=%d\n",
920 __func__, engine->name, err);
921 goto out_request;
922 }
923
924 GEM_BUG_ON(!i915_request_completed(request[idx]));
925 i915_request_put(request[idx]);
926 request[idx] = NULL;
927 idx++;
928 }
929
930 err = igt_live_test_end(&t);
931
932out_request:
933 idx = 0;
934 for_each_uabi_engine(engine, i915) {
935 if (request[idx])
936 i915_request_put(request[idx]);
937 idx++;
938 }
939 i915_vma_unpin(batch);
940 i915_vma_put(batch);
941out_free:
942 kfree(request);
943 return err;
944}
945
946static int live_sequential_engines(void *arg)
947{
948 struct drm_i915_private *i915 = arg;
949 const unsigned int nengines = num_uabi_engines(i915);
950 struct i915_request **request;
951 struct i915_request *prev = NULL;
952 struct intel_engine_cs *engine;
953 struct igt_live_test t;
954 unsigned int idx;
955 int err;
956
957 /*
958 * Check we can submit requests to all engines sequentially, such
959 * that each successive request waits for the earlier ones. This
960 * tests that we don't execute requests out of order, even though
961 * they are running on independent engines.
962 */
963
964 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
965 if (!request)
966 return -ENOMEM;
967
968 err = igt_live_test_begin(&t, i915, __func__, "");
969 if (err)
970 goto out_free;
971
972 idx = 0;
973 for_each_uabi_engine(engine, i915) {
974 struct i915_vma *batch;
975
976 batch = recursive_batch(i915);
977 if (IS_ERR(batch)) {
978 err = PTR_ERR(batch);
979 pr_err("%s: Unable to create batch for %s, err=%d\n",
980 __func__, engine->name, err);
981 goto out_free;
982 }
983
984 request[idx] = intel_engine_create_kernel_request(engine);
985 if (IS_ERR(request[idx])) {
986 err = PTR_ERR(request[idx]);
987 pr_err("%s: Request allocation failed for %s with err=%d\n",
988 __func__, engine->name, err);
989 goto out_request;
990 }
991
992 if (prev) {
993 err = i915_request_await_dma_fence(request[idx],
994 &prev->fence);
995 if (err) {
996 i915_request_add(request[idx]);
997 pr_err("%s: Request await failed for %s with err=%d\n",
998 __func__, engine->name, err);
999 goto out_request;
1000 }
1001 }
1002
1003 i915_vma_lock(batch);
1004 err = i915_request_await_object(request[idx],
1005 batch->obj, false);
1006 if (err == 0)
1007 err = i915_vma_move_to_active(batch, request[idx], 0);
1008 i915_vma_unlock(batch);
1009 GEM_BUG_ON(err);
1010
1011 err = engine->emit_bb_start(request[idx],
1012 batch->node.start,
1013 batch->node.size,
1014 0);
1015 GEM_BUG_ON(err);
1016 request[idx]->batch = batch;
1017
1018 i915_request_get(request[idx]);
1019 i915_request_add(request[idx]);
1020
1021 prev = request[idx];
1022 idx++;
1023 }
1024
1025 idx = 0;
1026 for_each_uabi_engine(engine, i915) {
1027 long timeout;
1028
1029 if (i915_request_completed(request[idx])) {
1030 pr_err("%s(%s): request completed too early!\n",
1031 __func__, engine->name);
1032 err = -EINVAL;
1033 goto out_request;
1034 }
1035
1036 err = recursive_batch_resolve(request[idx]->batch);
1037 if (err) {
1038 pr_err("%s: failed to resolve batch, err=%d\n",
1039 __func__, err);
1040 goto out_request;
1041 }
1042
1043 timeout = i915_request_wait(request[idx], 0,
1044 MAX_SCHEDULE_TIMEOUT);
1045 if (timeout < 0) {
1046 err = timeout;
1047 pr_err("%s: error waiting for request on %s, err=%d\n",
1048 __func__, engine->name, err);
1049 goto out_request;
1050 }
1051
1052 GEM_BUG_ON(!i915_request_completed(request[idx]));
1053 idx++;
1054 }
1055
1056 err = igt_live_test_end(&t);
1057
1058out_request:
1059 idx = 0;
1060 for_each_uabi_engine(engine, i915) {
1061 u32 *cmd;
1062
1063 if (!request[idx])
1064 break;
1065
1066 cmd = i915_gem_object_pin_map(request[idx]->batch->obj,
1067 I915_MAP_WC);
1068 if (!IS_ERR(cmd)) {
1069 *cmd = MI_BATCH_BUFFER_END;
1070
1071 __i915_gem_object_flush_map(request[idx]->batch->obj,
1072 0, sizeof(*cmd));
1073 i915_gem_object_unpin_map(request[idx]->batch->obj);
1074
1075 intel_gt_chipset_flush(engine->gt);
1076 }
1077
1078 i915_vma_put(request[idx]->batch);
1079 i915_request_put(request[idx]);
1080 idx++;
1081 }
1082out_free:
1083 kfree(request);
1084 return err;
1085}
1086
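/* Submit a request and synchronously wait for it, one at a time. */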
1087static int __live_parallel_engine1(void *arg)
1088{
1089 struct intel_engine_cs *engine = arg;
1090 IGT_TIMEOUT(end_time);
1091 unsigned long count;
1092 int err = 0;
1093
1094 count = 0;
1095 intel_engine_pm_get(engine);
1096 do {
1097 struct i915_request *rq;
1098
1099 rq = i915_request_create(engine->kernel_context);
1100 if (IS_ERR(rq)) {
1101 err = PTR_ERR(rq);
1102 break;
1103 }
1104
1105 i915_request_get(rq);
1106 i915_request_add(rq);
1107
1108 err = 0;
1109 if (i915_request_wait(rq, 0, HZ / 5) < 0)
1110 err = -ETIME;
1111 i915_request_put(rq);
1112 if (err)
1113 break;
1114
1115 count++;
1116 } while (!__igt_timeout(end_time, NULL));
1117 intel_engine_pm_put(engine);
1118
1119 pr_info("%s: %lu request + sync\n", engine->name, count);
1120 return err;
1121}
1122
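/* Flood the engine with requests, without waiting for any of them. */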
1123static int __live_parallel_engineN(void *arg)
1124{
1125 struct intel_engine_cs *engine = arg;
1126 IGT_TIMEOUT(end_time);
1127 unsigned long count;
1128 int err = 0;
1129
1130 count = 0;
1131 intel_engine_pm_get(engine);
1132 do {
1133 struct i915_request *rq;
1134
1135 rq = i915_request_create(engine->kernel_context);
1136 if (IS_ERR(rq)) {
1137 err = PTR_ERR(rq);
1138 break;
1139 }
1140
1141 i915_request_add(rq);
1142 count++;
1143 } while (!__igt_timeout(end_time, NULL));
1144 intel_engine_pm_put(engine);
1145
1146 pr_info("%s: %lu requests\n", engine->name, count);
1147 return err;
1148}
1149
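/*
 * i915->selftest.counter acts as a simple barrier for the parallel workers:
 * each one decrements it on arrival, and the last to arrive wakes the rest.
 */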
1150static bool wake_all(struct drm_i915_private *i915)
1151{
1152 if (atomic_dec_and_test(&i915->selftest.counter)) {
1153 wake_up_var(&i915->selftest.counter);
1154 return true;
1155 }
1156
1157 return false;
1158}
1159
1160static int wait_for_all(struct drm_i915_private *i915)
1161{
1162 if (wake_all(i915))
1163 return 0;
1164
1165 if (wait_var_event_timeout(&i915->selftest.counter,
1166 !atomic_read(&i915->selftest.counter),
1167 i915_selftest.timeout_jiffies))
1168 return 0;
1169
1170 return -ETIME;
1171}
1172
1173static int __live_parallel_spin(void *arg)
1174{
1175 struct intel_engine_cs *engine = arg;
1176 struct igt_spinner spin;
1177 struct i915_request *rq;
1178 int err = 0;
1179
1180 /*
1181 * Create a spinner running for eternity on each engine. If a second
1182 * spinner is incorrectly placed on the same engine, it will not be
1183 * able to start in time.
1184 */
1185
1186 if (igt_spinner_init(&spin, engine->gt)) {
1187 wake_all(engine->i915);
1188 return -ENOMEM;
1189 }
1190
1191 intel_engine_pm_get(engine);
1192 rq = igt_spinner_create_request(&spin,
1193 engine->kernel_context,
1194 MI_NOOP); /* no preemption */
1195 intel_engine_pm_put(engine);
1196 if (IS_ERR(rq)) {
1197 err = PTR_ERR(rq);
1198 if (err == -ENODEV)
1199 err = 0;
1200 wake_all(engine->i915);
1201 goto out_spin;
1202 }
1203
1204 i915_request_get(rq);
1205 i915_request_add(rq);
1206 if (igt_wait_for_spinner(&spin, rq)) {
1207 /* Occupy this engine for the whole test */
1208 err = wait_for_all(engine->i915);
1209 } else {
1210 pr_err("Failed to start spinner on %s\n", engine->name);
1211 err = -EINVAL;
1212 }
1213 igt_spinner_end(&spin);
1214
1215 if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1216 err = -EIO;
1217 i915_request_put(rq);
1218
1219out_spin:
1220 igt_spinner_fini(&spin);
1221 return err;
1222}
1223
1224static int live_parallel_engines(void *arg)
1225{
1226 struct drm_i915_private *i915 = arg;
1227 static int (* const func[])(void *arg) = {
1228 __live_parallel_engine1,
1229 __live_parallel_engineN,
1230 __live_parallel_spin,
1231 NULL,
1232 };
1233 const unsigned int nengines = num_uabi_engines(i915);
1234 struct intel_engine_cs *engine;
1235 int (* const *fn)(void *arg);
1236 struct task_struct **tsk;
1237 int err = 0;
1238
1239 /*
1240 * Check we can submit requests to all engines concurrently. This
1241 * tests that we load up the system maximally.
1242 */
1243
1244 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1245 if (!tsk)
1246 return -ENOMEM;
1247
1248 for (fn = func; !err && *fn; fn++) {
1249 char name[KSYM_NAME_LEN];
1250 struct igt_live_test t;
1251 unsigned int idx;
1252
1253 snprintf(name, sizeof(name), "%ps", *fn);
1254 err = igt_live_test_begin(&t, i915, __func__, name);
1255 if (err)
1256 break;
1257
1258 atomic_set(&i915->selftest.counter, nengines);
1259
1260 idx = 0;
1261 for_each_uabi_engine(engine, i915) {
1262 tsk[idx] = kthread_run(*fn, engine,
1263 "igt/parallel:%s",
1264 engine->name);
1265 if (IS_ERR(tsk[idx])) {
1266 err = PTR_ERR(tsk[idx]);
1267 break;
1268 }
1269 get_task_struct(tsk[idx++]);
1270 }
1271
1272 yield(); /* start all threads before we kthread_stop() */
1273
1274 idx = 0;
1275 for_each_uabi_engine(engine, i915) {
1276 int status;
1277
1278 if (IS_ERR(tsk[idx]))
1279 break;
1280
1281 status = kthread_stop(tsk[idx]);
1282 if (status && !err)
1283 err = status;
1284
1285 put_task_struct(tsk[idx++]);
1286 }
1287
1288 if (igt_live_test_end(&t))
1289 err = -EIO;
1290 }
1291
1292 kfree(tsk);
1293 return err;
1294}
1295
1296static int
1297max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1298{
1299 struct i915_request *rq;
1300 int ret;
1301
1302 /*
1303 * Before execlists, all contexts share the same ringbuffer. With
1304 * execlists, each context/engine has a separate ringbuffer and
1305 * for the purposes of this test, inexhaustible.
1306 *
1307 * For the global ringbuffer though, we have to be very careful
1308 * that we do not wrap while preventing the execution of requests
1309 * with a unsignaled fence.
1310 */
1311 if (HAS_EXECLISTS(ctx->i915))
1312 return INT_MAX;
1313
1314 rq = igt_request_alloc(ctx, engine);
1315 if (IS_ERR(rq)) {
1316 ret = PTR_ERR(rq);
1317 } else {
1318 int sz;
1319
1320 ret = rq->ring->size - rq->reserved_space;
1321 i915_request_add(rq);
1322
1323 sz = rq->ring->emit - rq->head;
1324 if (sz < 0)
1325 sz += rq->ring->size;
1326 ret /= sz;
1327 ret /= 2; /* leave half spare, in case of emergency! */
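		/* i.e. roughly (usable ring bytes / bytes per empty request) / 2 */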
1328 }
1329
1330 return ret;
1331}
1332
1333static int live_breadcrumbs_smoketest(void *arg)
1334{
1335 struct drm_i915_private *i915 = arg;
1336 const unsigned int nengines = num_uabi_engines(i915);
1337 const unsigned int ncpus = num_online_cpus();
1338 unsigned long num_waits, num_fences;
1339 struct intel_engine_cs *engine;
1340 struct task_struct **threads;
1341 struct igt_live_test live;
1342 intel_wakeref_t wakeref;
1343 struct smoketest *smoke;
1344 unsigned int n, idx;
1345 struct file *file;
1346 int ret = 0;
1347
1348 /*
1349 * Smoketest our breadcrumb/signal handling for requests across multiple
1350 * threads. A very simple test to only catch the most egregious of bugs.
1351 * See __igt_breadcrumbs_smoketest();
1352 *
1353 * On real hardware this time.
1354 */
1355
1356 wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1357
1358 file = mock_file(i915);
1359 if (IS_ERR(file)) {
1360 ret = PTR_ERR(file);
1361 goto out_rpm;
1362 }
1363
1364 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1365 if (!smoke) {
1366 ret = -ENOMEM;
1367 goto out_file;
1368 }
1369
1370 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1371 if (!threads) {
1372 ret = -ENOMEM;
1373 goto out_smoke;
1374 }
1375
1376 smoke[0].request_alloc = __live_request_alloc;
1377 smoke[0].ncontexts = 64;
1378 smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1379 sizeof(*smoke[0].contexts),
1380 GFP_KERNEL);
1381 if (!smoke[0].contexts) {
1382 ret = -ENOMEM;
1383 goto out_threads;
1384 }
1385
1386 for (n = 0; n < smoke[0].ncontexts; n++) {
1387 smoke[0].contexts[n] = live_context(i915, file);
1388 if (!smoke[0].contexts[n]) {
1389 ret = -ENOMEM;
1390 goto out_contexts;
1391 }
1392 }
1393
1394 ret = igt_live_test_begin(&live, i915, __func__, "");
1395 if (ret)
1396 goto out_contexts;
1397
1398 idx = 0;
1399 for_each_uabi_engine(engine, i915) {
1400 smoke[idx] = smoke[0];
1401 smoke[idx].engine = engine;
1402 smoke[idx].max_batch =
1403 max_batches(smoke[0].contexts[0], engine);
1404 if (smoke[idx].max_batch < 0) {
1405 ret = smoke[idx].max_batch;
1406 goto out_flush;
1407 }
1408 /* One ring interleaved between requests from all cpus */
1409 smoke[idx].max_batch /= num_online_cpus() + 1;
1410 pr_debug("Limiting batches to %d requests on %s\n",
1411 smoke[idx].max_batch, engine->name);
1412
1413 for (n = 0; n < ncpus; n++) {
1414 struct task_struct *tsk;
1415
1416 tsk = kthread_run(__igt_breadcrumbs_smoketest,
1417 &smoke[idx], "igt/%d.%d", idx, n);
1418 if (IS_ERR(tsk)) {
1419 ret = PTR_ERR(tsk);
1420 goto out_flush;
1421 }
1422
1423 get_task_struct(tsk);
1424 threads[idx * ncpus + n] = tsk;
1425 }
1426
1427 idx++;
1428 }
1429
1430 yield(); /* start all threads before we begin */
1431 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1432
1433out_flush:
1434 idx = 0;
1435 num_waits = 0;
1436 num_fences = 0;
1437 for_each_uabi_engine(engine, i915) {
1438 for (n = 0; n < ncpus; n++) {
1439 struct task_struct *tsk = threads[idx * ncpus + n];
1440 int err;
1441
1442 if (!tsk)
1443 continue;
1444
1445 err = kthread_stop(tsk);
1446 if (err < 0 && !ret)
1447 ret = err;
1448
1449 put_task_struct(tsk);
1450 }
1451
1452 num_waits += atomic_long_read(&smoke[idx].num_waits);
1453 num_fences += atomic_long_read(&smoke[idx].num_fences);
1454 idx++;
1455 }
1456 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1457 num_waits, num_fences, idx, ncpus);
1458
1459 ret = igt_live_test_end(&live) ?: ret;
1460out_contexts:
1461 kfree(smoke[0].contexts);
1462out_threads:
1463 kfree(threads);
1464out_smoke:
1465 kfree(smoke);
1466out_file:
1467 fput(file);
1468out_rpm:
1469 intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1470
1471 return ret;
1472}
1473
1474int i915_request_live_selftests(struct drm_i915_private *i915)
1475{
1476 static const struct i915_subtest tests[] = {
1477 SUBTEST(live_nop_request),
1478 SUBTEST(live_all_engines),
1479 SUBTEST(live_sequential_engines),
1480 SUBTEST(live_parallel_engines),
1481 SUBTEST(live_empty_request),
1482 SUBTEST(live_breadcrumbs_smoketest),
1483 };
1484
1485 if (intel_gt_is_wedged(&i915->gt))
1486 return 0;
1487
1488 return i915_subtests(tests, i915);
1489}
1490
1491static int switch_to_kernel_sync(struct intel_context *ce, int err)
1492{
1493 struct i915_request *rq;
1494 struct dma_fence *fence;
1495
1496 rq = intel_engine_create_kernel_request(ce->engine);
1497 if (IS_ERR(rq))
1498 return PTR_ERR(rq);
1499
1500 fence = i915_active_fence_get(&ce->timeline->last_request);
1501 if (fence) {
1502 i915_request_await_dma_fence(rq, fence);
1503 dma_fence_put(fence);
1504 }
1505
1506 rq = i915_request_get(rq);
1507 i915_request_add(rq);
1508 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1509 err = -ETIME;
1510 i915_request_put(rq);
1511
1512 while (!err && !intel_engine_is_idle(ce->engine))
1513 intel_engine_flush_submission(ce->engine);
1514
1515 return err;
1516}
1517
1518struct perf_stats {
1519 struct intel_engine_cs *engine;
1520 unsigned long count;
1521 ktime_t time;
1522 ktime_t busy;
1523 u64 runtime;
1524};
1525
1526struct perf_series {
1527 struct drm_i915_private *i915;
1528 unsigned int nengines;
1529 struct intel_context *ce[];
1530};
1531
1532static int cmp_u32(const void *A, const void *B)
1533{
1534 const u32 *a = A, *b = B;
1535
1536 return *a - *b;
1537}
1538
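/*
 * Weighted median of TF_COUNT samples: after sorting, return
 * a[1] + 2 * a[2] + a[3], i.e. 4x a low-pass filtered median. The reports
 * below shift that factor back out with ">> TF_BIAS" (and cycles_to_ns()
 * divides by 1 << TF_BIAS). For example, samples {10, 12, 13, 15, 90}
 * give 12 + 2 * 13 + 15 = 53, reported as 53 >> 2 = 13.
 */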
1539static u32 trifilter(u32 *a)
1540{
1541 u64 sum;
1542
1543#define TF_COUNT 5
1544 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1545
1546 sum = mul_u32_u32(a[2], 2);
1547 sum += a[1];
1548 sum += a[3];
1549
1550 GEM_BUG_ON(sum > U32_MAX);
1551 return sum;
1552#define TF_BIAS 2
1553}
1554
1555static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1556{
1557 u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
1558
1559 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1560}
1561
1562static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1563{
1564 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1565 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1566 *cs++ = offset;
1567 *cs++ = 0;
1568
1569 return cs;
1570}
1571
1572static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1573{
1574 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1575 *cs++ = offset;
1576 *cs++ = 0;
1577 *cs++ = value;
1578
1579 return cs;
1580}
1581
1582static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1583{
1584 *cs++ = MI_SEMAPHORE_WAIT |
1585 MI_SEMAPHORE_GLOBAL_GTT |
1586 MI_SEMAPHORE_POLL |
1587 mode;
1588 *cs++ = value;
1589 *cs++ = offset;
1590 *cs++ = 0;
1591
1592 return cs;
1593}
1594
1595static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1596{
1597 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1598}
1599
1600static void semaphore_set(u32 *sema, u32 value)
1601{
1602 WRITE_ONCE(*sema, value);
1603 wmb(); /* flush the update to the cache, and beyond */
1604}
1605
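/*
 * Borrow a few dwords partway into the engine's status page (HWSP) as
 * scratch that both the CPU and the GPU can access through its GGTT
 * address; 21 dwords leaves headroom for the sample arrays used below.
 */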
1606static u32 *hwsp_scratch(const struct intel_context *ce)
1607{
1608 return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1609}
1610
1611static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1612{
1613 return (i915_ggtt_offset(ce->engine->status_page.vma) +
1614 offset_in_page(dw));
1615}
1616
1617static int measure_semaphore_response(struct intel_context *ce)
1618{
1619 u32 *sema = hwsp_scratch(ce);
1620 const u32 offset = hwsp_offset(ce, sema);
1621 u32 elapsed[TF_COUNT], cycles;
1622 struct i915_request *rq;
1623 u32 *cs;
1624 int err;
1625 int i;
1626
1627 /*
1628 * Measure how many cycles it takes for the HW to detect the change
1629 * in a semaphore value.
1630 *
1631 * A: read CS_TIMESTAMP from CPU
1632 * poke semaphore
1633 * B: read CS_TIMESTAMP on GPU
1634 *
1635 * Semaphore latency: B - A
1636 */
1637
1638 semaphore_set(sema, -1);
1639
1640 rq = i915_request_create(ce);
1641 if (IS_ERR(rq))
1642 return PTR_ERR(rq);
1643
1644 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1645 if (IS_ERR(cs)) {
1646 i915_request_add(rq);
1647 err = PTR_ERR(cs);
1648 goto err;
1649 }
1650
1651 cs = emit_store_dw(cs, offset, 0);
1652 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1653 cs = emit_semaphore_poll_until(cs, offset, i);
1654 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1655 cs = emit_store_dw(cs, offset, 0);
1656 }
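	/*
	 * Each iteration of the emitted batch is, in effect:
	 *
	 *	SEMAPHORE_WAIT	until sema[0] == i
	 *	SRM		RING_TIMESTAMP -> sema[i]
	 *	STORE_DWORD	0 -> sema[0]
	 *
	 * so sema[0] is the CPU/GPU handshake word and sema[i] receives the
	 * GPU timestamp for round i.
	 */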
1657
1658 intel_ring_advance(rq, cs);
1659 i915_request_add(rq);
1660
1661 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1662 err = -EIO;
1663 goto err;
1664 }
1665
1666 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1667 preempt_disable();
1668 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1669 semaphore_set(sema, i);
1670 preempt_enable();
1671
1672 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1673 err = -EIO;
1674 goto err;
1675 }
1676
1677 elapsed[i - 1] = sema[i] - cycles;
1678 }
1679
1680 cycles = trifilter(elapsed);
1681 pr_info("%s: semaphore response %d cycles, %lluns\n",
1682 ce->engine->name, cycles >> TF_BIAS,
1683 cycles_to_ns(ce->engine, cycles));
1684
1685 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1686
1687err:
1688 intel_gt_set_wedged(ce->engine->gt);
1689 return err;
1690}
1691
1692static int measure_idle_dispatch(struct intel_context *ce)
1693{
1694 u32 *sema = hwsp_scratch(ce);
1695 const u32 offset = hwsp_offset(ce, sema);
1696 u32 elapsed[TF_COUNT], cycles;
1697 u32 *cs;
1698 int err;
1699 int i;
1700
1701 /*
1702 * Measure how long it takes for us to submit a request while the
1703 * engine is idle, but is resting in our context.
1704 *
1705 * A: read CS_TIMESTAMP from CPU
1706 * submit request
1707 * B: read CS_TIMESTAMP on GPU
1708 *
1709 * Submission latency: B - A
1710 */
1711
1712 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1713 struct i915_request *rq;
1714
1715 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1716 if (err)
1717 return err;
1718
1719 rq = i915_request_create(ce);
1720 if (IS_ERR(rq)) {
1721 err = PTR_ERR(rq);
1722 goto err;
1723 }
1724
1725 cs = intel_ring_begin(rq, 4);
1726 if (IS_ERR(cs)) {
1727 i915_request_add(rq);
1728 err = PTR_ERR(cs);
1729 goto err;
1730 }
1731
1732 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1733
1734 intel_ring_advance(rq, cs);
1735
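		/*
		 * Sample the CPU view of RING_TIMESTAMP and submit back to
		 * back, with preemption and softirqs disabled, so the gap we
		 * measure is dominated by the submission path itself.
		 */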
1736 preempt_disable();
1737 local_bh_disable();
1738 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1739 i915_request_add(rq);
1740 local_bh_enable();
1741 preempt_enable();
1742 }
1743
1744 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1745 if (err)
1746 goto err;
1747
1748 for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1749 elapsed[i] = sema[i] - elapsed[i];
1750
1751 cycles = trifilter(elapsed);
1752 pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1753 ce->engine->name, cycles >> TF_BIAS,
1754 cycles_to_ns(ce->engine, cycles));
1755
1756 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1757
1758err:
1759 intel_gt_set_wedged(ce->engine->gt);
1760 return err;
1761}
1762
1763static int measure_busy_dispatch(struct intel_context *ce)
1764{
1765 u32 *sema = hwsp_scratch(ce);
1766 const u32 offset = hwsp_offset(ce, sema);
1767 u32 elapsed[TF_COUNT + 1], cycles;
1768 u32 *cs;
1769 int err;
1770 int i;
1771
1772 /*
1773 * Measure how long it takes for us to submit a request while the
1774 * engine is busy, polling on a semaphore in our context. With
1775 * direct submission, this will include the cost of a lite restore.
1776 *
1777 * A: read CS_TIMESTAMP from CPU
1778 * submit request
1779 * B: read CS_TIMESTAMP on GPU
1780 *
1781 * Submission latency: B - A
1782 */
1783
1784 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1785 struct i915_request *rq;
1786
1787 rq = i915_request_create(ce);
1788 if (IS_ERR(rq)) {
1789 err = PTR_ERR(rq);
1790 goto err;
1791 }
1792
1793 cs = intel_ring_begin(rq, 12);
1794 if (IS_ERR(cs)) {
1795 i915_request_add(rq);
1796 err = PTR_ERR(cs);
1797 goto err;
1798 }
1799
1800 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
1801 cs = emit_semaphore_poll_until(cs, offset, i);
1802 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1803
1804 intel_ring_advance(rq, cs);
1805
1806 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
1807 err = -EIO;
1808 goto err;
1809 }
1810
1811 preempt_disable();
1812 local_bh_disable();
1813 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1814 i915_request_add(rq);
1815 local_bh_enable();
1816 semaphore_set(sema, i - 1);
1817 preempt_enable();
1818 }
1819
1820 wait_for(READ_ONCE(sema[i - 1]), 500);
1821 semaphore_set(sema, i - 1);
1822
1823 for (i = 1; i <= TF_COUNT; i++) {
1824 GEM_BUG_ON(sema[i] == -1);
1825 elapsed[i - 1] = sema[i] - elapsed[i];
1826 }
1827
1828 cycles = trifilter(elapsed);
1829 pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
1830 ce->engine->name, cycles >> TF_BIAS,
1831 cycles_to_ns(ce->engine, cycles));
1832
1833 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1834
1835err:
1836 intel_gt_set_wedged(ce->engine->gt);
1837 return err;
1838}
1839
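/*
 * Park the engine on a semaphore wait submitted from the kernel context;
 * anything queued afterwards cannot execute until the CPU releases the
 * semaphore via semaphore_set().
 */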
1840static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
1841{
1842 const u32 offset =
1843 i915_ggtt_offset(engine->status_page.vma) +
1844 offset_in_page(sema);
1845 struct i915_request *rq;
1846 u32 *cs;
1847
1848 rq = i915_request_create(engine->kernel_context);
1849 if (IS_ERR(rq))
1850 return PTR_ERR(rq);
1851
1852 cs = intel_ring_begin(rq, 4);
1853 if (IS_ERR(cs)) {
1854 i915_request_add(rq);
1855 return PTR_ERR(cs);
1856 }
1857
1858 cs = emit_semaphore_poll(cs, mode, value, offset);
1859
1860 intel_ring_advance(rq, cs);
1861 i915_request_add(rq);
1862
1863 return 0;
1864}
1865
1866static int measure_inter_request(struct intel_context *ce)
1867{
1868 u32 *sema = hwsp_scratch(ce);
1869 const u32 offset = hwsp_offset(ce, sema);
1870 u32 elapsed[TF_COUNT + 1], cycles;
1871 struct i915_sw_fence *submit;
1872 int i, err;
1873
1874 /*
1875 * Measure how long it takes to advance from one request into the
1876 * next. Between each request we flush the GPU caches to memory,
1877 * update the breadcrumbs, and then invalidate those caches.
1878 * We queue up all the requests to be submitted in one batch so
1879 * it should be one set of contiguous measurements.
1880 *
1881 * A: read CS_TIMESTAMP on GPU
1882 * advance request
1883 * B: read CS_TIMESTAMP on GPU
1884 *
1885 * Request latency: B - A
1886 */
1887
1888 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1889 if (err)
1890 return err;
1891
1892 submit = heap_fence_create(GFP_KERNEL);
1893 if (!submit) {
1894 semaphore_set(sema, 1);
1895 return -ENOMEM;
1896 }
1897
1898 intel_engine_flush_submission(ce->engine);
1899 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1900 struct i915_request *rq;
1901 u32 *cs;
1902
1903 rq = i915_request_create(ce);
1904 if (IS_ERR(rq)) {
1905 err = PTR_ERR(rq);
1906 goto err_submit;
1907 }
1908
1909 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
1910 submit,
1911 GFP_KERNEL);
1912 if (err < 0) {
1913 i915_request_add(rq);
1914 goto err_submit;
1915 }
1916
1917 cs = intel_ring_begin(rq, 4);
1918 if (IS_ERR(cs)) {
1919 i915_request_add(rq);
1920 err = PTR_ERR(cs);
1921 goto err_submit;
1922 }
1923
1924 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1925
1926 intel_ring_advance(rq, cs);
1927 i915_request_add(rq);
1928 }
1929 local_bh_disable();
1930 i915_sw_fence_commit(submit);
1931 local_bh_enable();
1932 intel_engine_flush_submission(ce->engine);
1933 heap_fence_put(submit);
1934
1935 semaphore_set(sema, 1);
1936 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1937 if (err)
1938 goto err;
1939
1940 for (i = 1; i <= TF_COUNT; i++)
1941 elapsed[i - 1] = sema[i + 1] - sema[i];
1942
1943 cycles = trifilter(elapsed);
1944 pr_info("%s: inter-request latency %d cycles, %lluns\n",
1945 ce->engine->name, cycles >> TF_BIAS,
1946 cycles_to_ns(ce->engine, cycles));
1947
1948 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1949
1950err_submit:
1951 i915_sw_fence_commit(submit);
1952 heap_fence_put(submit);
1953 semaphore_set(sema, 1);
1954err:
1955 intel_gt_set_wedged(ce->engine->gt);
1956 return err;
1957}
1958
1959static int measure_context_switch(struct intel_context *ce)
1960{
1961 u32 *sema = hwsp_scratch(ce);
1962 const u32 offset = hwsp_offset(ce, sema);
1963 struct i915_request *fence = NULL;
1964 u32 elapsed[TF_COUNT + 1], cycles;
1965 int i, j, err;
1966 u32 *cs;
1967
1968 /*
1969 * Measure how long it takes to advance from one request in one
1970 * context to a request in another context. This allows us to
1971 * measure how long the context save/restore take, along with all
1972 * the inter-context setup we require.
1973 *
1974 * A: read CS_TIMESTAMP on GPU
1975 * switch context
1976 * B: read CS_TIMESTAMP on GPU
1977 *
1978 * Context switch latency: B - A
1979 */
1980
1981 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1982 if (err)
1983 return err;
1984
1985 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1986 struct intel_context *arr[] = {
1987 ce, ce->engine->kernel_context
1988 };
1989 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
1990
1991 for (j = 0; j < ARRAY_SIZE(arr); j++) {
1992 struct i915_request *rq;
1993
1994 rq = i915_request_create(arr[j]);
1995 if (IS_ERR(rq)) {
1996 err = PTR_ERR(rq);
1997 goto err_fence;
1998 }
1999
2000 if (fence) {
2001 err = i915_request_await_dma_fence(rq,
2002 &fence->fence);
2003 if (err) {
2004 i915_request_add(rq);
2005 goto err_fence;
2006 }
2007 }
2008
2009 cs = intel_ring_begin(rq, 4);
2010 if (IS_ERR(cs)) {
2011 i915_request_add(rq);
2012 err = PTR_ERR(cs);
2013 goto err_fence;
2014 }
2015
2016 cs = emit_timestamp_store(cs, ce, addr);
2017 addr += sizeof(u32);
2018
2019 intel_ring_advance(rq, cs);
2020
2021 i915_request_put(fence);
2022 fence = i915_request_get(rq);
2023
2024 i915_request_add(rq);
2025 }
2026 }
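	/*
	 * sema[2*i] holds ce's timestamp and sema[2*i + 1] the kernel
	 * context's for round i, so each sample computed below spans the
	 * switch from the kernel context back into ce.
	 */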
2027 i915_request_put(fence);
2028 intel_engine_flush_submission(ce->engine);
2029
2030 semaphore_set(sema, 1);
2031 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2032 if (err)
2033 goto err;
2034
2035 for (i = 1; i <= TF_COUNT; i++)
2036 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2037
2038 cycles = trifilter(elapsed);
2039 pr_info("%s: context switch latency %d cycles, %lluns\n",
2040 ce->engine->name, cycles >> TF_BIAS,
2041 cycles_to_ns(ce->engine, cycles));
2042
2043 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2044
2045err_fence:
2046 i915_request_put(fence);
2047 semaphore_set(sema, 1);
2048err:
2049 intel_gt_set_wedged(ce->engine->gt);
2050 return err;
2051}
2052
2053static int measure_preemption(struct intel_context *ce)
2054{
2055 u32 *sema = hwsp_scratch(ce);
2056 const u32 offset = hwsp_offset(ce, sema);
2057 u32 elapsed[TF_COUNT], cycles;
2058 u32 *cs;
2059 int err;
2060 int i;
2061
2062 /*
2063 * We measure two latencies while triggering preemption. The first
2064 * latency is how long it takes for us to submit a preempting request.
 2065 * The second latency is how long it takes for us to return from the
2066 * preemption back to the original context.
2067 *
2068 * A: read CS_TIMESTAMP from CPU
2069 * submit preemption
2070 * B: read CS_TIMESTAMP on GPU (in preempting context)
2071 * context switch
2072 * C: read CS_TIMESTAMP on GPU (in original context)
2073 *
2074 * Preemption dispatch latency: B - A
2075 * Preemption switch latency: C - B
2076 */
2077
2078 if (!intel_engine_has_preemption(ce->engine))
2079 return 0;
2080
2081 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2082 u32 addr = offset + 2 * i * sizeof(u32);
2083 struct i915_request *rq;
2084
2085 rq = i915_request_create(ce);
2086 if (IS_ERR(rq)) {
2087 err = PTR_ERR(rq);
2088 goto err;
2089 }
2090
2091 cs = intel_ring_begin(rq, 12);
2092 if (IS_ERR(cs)) {
2093 i915_request_add(rq);
2094 err = PTR_ERR(cs);
2095 goto err;
2096 }
2097
2098 cs = emit_store_dw(cs, addr, -1);
2099 cs = emit_semaphore_poll_until(cs, offset, i);
2100 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2101
2102 intel_ring_advance(rq, cs);
2103 i915_request_add(rq);
2104
2105 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2106 err = -EIO;
2107 goto err;
2108 }
2109
2110 rq = i915_request_create(ce->engine->kernel_context);
2111 if (IS_ERR(rq)) {
2112 err = PTR_ERR(rq);
2113 goto err;
2114 }
2115
2116 cs = intel_ring_begin(rq, 8);
2117 if (IS_ERR(cs)) {
2118 i915_request_add(rq);
2119 err = PTR_ERR(cs);
2120 goto err;
2121 }
2122
2123 cs = emit_timestamp_store(cs, ce, addr);
2124 cs = emit_store_dw(cs, offset, i);
2125
2126 intel_ring_advance(rq, cs);
2127 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2128
2129 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2130 i915_request_add(rq);
2131 }
2132
2133 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2134 err = -EIO;
2135 goto err;
2136 }
2137
2138 for (i = 1; i <= TF_COUNT; i++)
2139 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2140
2141 cycles = trifilter(elapsed);
2142 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2143 ce->engine->name, cycles >> TF_BIAS,
2144 cycles_to_ns(ce->engine, cycles));
2145
2146 for (i = 1; i <= TF_COUNT; i++)
2147 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2148
2149 cycles = trifilter(elapsed);
2150 pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2151 ce->engine->name, cycles >> TF_BIAS,
2152 cycles_to_ns(ce->engine, cycles));
2153
2154 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2155
2156err:
2157 intel_gt_set_wedged(ce->engine->gt);
2158 return err;
2159}
2160
2161struct signal_cb {
2162 struct dma_fence_cb base;
2163 bool seen;
2164};
2165
2166static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2167{
2168 struct signal_cb *s = container_of(cb, typeof(*s), base);
2169
2170 smp_store_mb(s->seen, true); /* be safe, be strong */
2171}
2172
2173static int measure_completion(struct intel_context *ce)
2174{
2175 u32 *sema = hwsp_scratch(ce);
2176 const u32 offset = hwsp_offset(ce, sema);
2177 u32 elapsed[TF_COUNT], cycles;
2178 u32 *cs;
2179 int err;
2180 int i;
2181
2182 /*
 2183 * Measure how long it takes for the signal (interrupt) sent by the
 2184 * GPU to be processed by the CPU.
2185 *
2186 * A: read CS_TIMESTAMP on GPU
2187 * signal
2188 * B: read CS_TIMESTAMP from CPU
2189 *
2190 * Completion latency: B - A
2191 */
2192
2193 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2194 struct signal_cb cb = { .seen = false };
2195 struct i915_request *rq;
2196
2197 rq = i915_request_create(ce);
2198 if (IS_ERR(rq)) {
2199 err = PTR_ERR(rq);
2200 goto err;
2201 }
2202
2203 cs = intel_ring_begin(rq, 12);
2204 if (IS_ERR(cs)) {
2205 i915_request_add(rq);
2206 err = PTR_ERR(cs);
2207 goto err;
2208 }
2209
2210 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2211 cs = emit_semaphore_poll_until(cs, offset, i);
2212 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2213
2214 intel_ring_advance(rq, cs);
2215
2216 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2217
2218 local_bh_disable();
2219 i915_request_add(rq);
2220 local_bh_enable();
2221
2222 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2223 err = -EIO;
2224 goto err;
2225 }
2226
2227 preempt_disable();
2228 semaphore_set(sema, i);
2229 while (!READ_ONCE(cb.seen))
2230 cpu_relax();
2231
2232 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2233 preempt_enable();
2234 }
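	/*
	 * sema[i] holds the GPU timestamp written just before request i
	 * signalled, and elapsed[i - 1] the CPU timestamp taken once its
	 * fence callback had run; the difference is the signalling path.
	 */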
2235
2236 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2237 if (err)
2238 goto err;
2239
2240 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2241 GEM_BUG_ON(sema[i + 1] == -1);
2242 elapsed[i] = elapsed[i] - sema[i + 1];
2243 }
2244
2245 cycles = trifilter(elapsed);
2246 pr_info("%s: completion latency %d cycles, %lluns\n",
2247 ce->engine->name, cycles >> TF_BIAS,
2248 cycles_to_ns(ce->engine, cycles));
2249
2250 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2251
2252err:
2253 intel_gt_set_wedged(ce->engine->gt);
2254 return err;
2255}
2256
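/*
 * Hold forcewake and keep the GPU pinned at its maximum frequency while
 * measuring, so the latency samples are not distorted by RPS ramping up
 * mid-test.
 */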
static void rps_pin(struct intel_gt *gt)
{
	/* Pin the frequency to max */
	atomic_inc(&gt->rps.num_waiters);
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

	mutex_lock(&gt->rps.lock);
	intel_rps_set(&gt->rps, gt->rps.max_freq);
	mutex_unlock(&gt->rps.lock);
}

static void rps_unpin(struct intel_gt *gt)
{
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
	atomic_dec(&gt->rps.num_waiters);
}

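/*
 * Run the full set of latency probes (semaphore response, idle/busy
 * dispatch, inter-request, context switch, preemption and completion)
 * once per uabi engine, with the heartbeat disabled and the frequency
 * pinned so the samples are comparable.
 */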
static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if (igt_flush_test(i915))
		err = -EIO;

	cpu_latency_qos_remove_request(&qos);
	return err;
}

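/*
 * Strictly synchronous: submit one request at a time, round-robin across
 * the engines, and wait for each to complete before sending the next.
 */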
static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

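/*
 * Pipelined by one: keep a single request in flight, waiting on the
 * previous submission only after the next has been queued.
 */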
static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	return err;
}

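/*
 * Fire-and-forget: submit requests across the engines as fast as possible
 * without ever waiting for completion.
 */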
static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}

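/*
 * Drive each of the submission schemes above (s_sync0, s_sync1, s_many)
 * from a single thread across all engines, reporting per-engine busyness,
 * context runtime and walltime for comparison.
 */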
static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}

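/*
 * Per-engine thread body: same synchronous submit-and-wait pattern as
 * s_sync0(), but on a private context, counting how many requests complete
 * before the timeout.
 */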
static int p_sync0(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

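/*
 * Per-engine thread body: pipelined by one request, as in s_sync1(), so a
 * new submission is always queued before we wait on its predecessor.
 */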
static int p_sync1(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

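/*
 * Per-engine thread body: flood the engine with requests without waiting,
 * mirroring s_many(), to gauge maximum submission throughput.
 */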
static int p_many(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

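/*
 * Run the per-engine submission schemes (p_sync0, p_sync1, p_many) with a
 * dedicated kthread per engine, so every engine is driven concurrently
 * across the whole device.
 */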
static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct {
		struct perf_stats p;
		struct task_struct *tsk;
	} *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
			engines[idx].p.engine = engine;

			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
						       "igt:%s", engine->name);
			if (IS_ERR(engines[idx].tsk)) {
				err = PTR_ERR(engines[idx].tsk);
				intel_engine_pm_put(engine);
				break;
			}
			get_task_struct(engines[idx++].tsk);
		}

		yield(); /* start all threads before we kthread_stop() */

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(engines[idx].tsk))
				break;

			status = kthread_stop(engines[idx].tsk);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);
			put_task_struct(engines[idx++].tsk);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}

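/*
 * Entry point for the request perf selftests; skip them entirely if the GT
 * is already wedged.
 */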
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return i915_subtests(tests, i915);
}