1/*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Ben Widawsky <ben@bwidawsk.net>
25 * Michel Thierry <michel.thierry@intel.com>
26 * Thomas Daniel <thomas.daniel@intel.com>
27 * Oscar Mateo <oscar.mateo@intel.com>
28 *
29 */
30
31/**
32 * DOC: Logical Rings, Logical Ring Contexts and Execlists
33 *
34 * Motivation:
35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36 * These expanded contexts enable a number of new abilities, especially
37 * "Execlists" (also implemented in this file).
38 *
39 * One of the main differences with the legacy HW contexts is that logical
40 * ring contexts incorporate many more things into the context's state, like
41 * PDPs or ringbuffer control registers:
42 *
43 * The reason why PDPs are included in the context is straightforward: as
44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45 * contained there means you don't need to do a ppgtt->switch_mm yourself;
46 * instead, the GPU will do it for you on the context switch.
47 *
48 * But what about the ringbuffer control registers (head, tail, etc.)?
49 * Shouldn't we just need a set of those per engine command streamer? This is
50 * where the name "Logical Rings" starts to make sense: by virtualizing the
51 * rings, the engine cs shifts to a new "ring buffer" with every context
52 * switch. When you want to submit a workload to the GPU you: A) choose your
53 * context, B) find its appropriate virtualized ring, C) write commands to it
54 * and then, finally, D) tell the GPU to switch to that context.
55 *
56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57 * to a context is via a context execution list, ergo "Execlists".
58 *
59 * LRC implementation:
60 * Regarding the creation of contexts, we have:
61 *
62 * - One global default context.
63 * - One local default context for each opened fd.
64 * - One local extra context for each context create ioctl call.
65 *
66 * Now that ringbuffers belong per-context (and not per-engine, like before)
67 * and that contexts are uniquely tied to a given engine (and not reusable,
68 * like before) we need:
69 *
70 * - One ringbuffer per-engine inside each context.
71 * - One backing object per-engine inside each context.
72 *
73 * The global default context starts its life with these new objects fully
74 * allocated and populated. The local default context for each opened fd is
75 * more complex, because we don't know at creation time which engine is going
76 * to use them. To handle this, we have implemented a deferred creation of LR
77 * contexts:
78 *
79 * The local context starts its life as a hollow or blank holder, that only
80 * gets populated for a given engine once we receive an execbuffer. If later
81 * on we receive another execbuffer ioctl for the same context but a different
82 * engine, we allocate/populate a new ringbuffer and context backing object and
83 * so on.
84 *
85 * Finally, regarding local contexts created using the ioctl call: as they are
86 * only allowed with the render ring, we can allocate & populate them right
87 * away (no need to defer anything, at least for now).
88 *
89 * Execlists implementation:
90 * Execlists are the new method by which, on gen8+ hardware, workloads are
91 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 * This method works as follows:
93 *
94 * When a request is committed, its commands (the BB start and any leading or
95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96 * for the appropriate context. The tail pointer in the hardware context is not
97 * updated at this time but is instead kept by the driver in the ringbuffer
98 * structure. A structure representing this request is added to a request queue
99 * for the appropriate engine: this structure contains a copy of the context's
100 * tail after the request was written to the ring buffer and a pointer to the
101 * context itself.
102 *
103 * If the engine's request queue was empty before the request was added, the
104 * queue is processed immediately. Otherwise the queue will be processed during
105 * a context switch interrupt. In any case, elements on the queue will get sent
106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107 * globally unique 20-bit submission ID.
108 *
109 * When execution of a request completes, the GPU updates the context status
110 * buffer with a context complete event and generates a context switch interrupt.
111 * During the interrupt handling, the driver examines the events in the buffer:
112 * for each context complete event, if the announced ID matches that on the head
113 * of the request queue, then that request is retired and removed from the queue.
114 *
115 * After processing, if any requests were retired and the queue is not empty
116 * then a new execution list can be submitted. The two requests at the front of
117 * the queue are next to be submitted but since a context may not occur twice in
118 * an execution list, if subsequent requests have the same ID as the first then
119 * the two requests must be combined. This is done simply by discarding requests
120 * at the head of the queue until either only one request is left (in which case
121 * we use a NULL second context) or the first two requests have unique IDs.
122 *
123 * By always executing the first two requests in the queue the driver ensures
124 * that the GPU is kept as busy as possible. In the case where a single context
125 * completes but a second context is still executing, the request for this second
126 * context will be at the head of the queue when we remove the first one. This
127 * request will then be resubmitted along with a new request for a different context,
128 * which will cause the hardware to continue executing the second request and queue
129 * the new request (the GPU detects the condition of a context getting preempted
130 * with the same context and optimizes the context switch flow by not doing
131 * preemption, but just sampling the new tail pointer).
132 *
133 */
134#include <linux/interrupt.h>
135
136#include "gem/i915_gem_context.h"
137
138#include "i915_drv.h"
139#include "i915_perf.h"
140#include "i915_trace.h"
141#include "i915_vgpu.h"
142#include "intel_engine_pm.h"
143#include "intel_gt.h"
144#include "intel_gt_pm.h"
145#include "intel_lrc_reg.h"
146#include "intel_mocs.h"
147#include "intel_reset.h"
148#include "intel_workarounds.h"
149
150#define RING_EXECLIST_QFULL (1 << 0x2)
151#define RING_EXECLIST1_VALID (1 << 0x3)
152#define RING_EXECLIST0_VALID (1 << 0x4)
153#define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
154#define RING_EXECLIST1_ACTIVE (1 << 0x11)
155#define RING_EXECLIST0_ACTIVE (1 << 0x12)
156
157#define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
158#define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
159#define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
160#define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
161#define GEN8_CTX_STATUS_COMPLETE (1 << 4)
162#define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
163
164#define GEN8_CTX_STATUS_COMPLETED_MASK \
165 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
166
167#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
168
169#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
170#define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
171#define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
172#define GEN12_IDLE_CTX_ID 0x7FF
173#define GEN12_CSB_CTX_VALID(csb_dw) \
174 (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
175
176/* Typical size of the average request (2 pipecontrols and a MI_BB) */
177#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
178#define WA_TAIL_DWORDS 2
179#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
180
181struct virtual_engine {
182 struct intel_engine_cs base;
183 struct intel_context context;
184
185 /*
186 * We allow only a single request through the virtual engine at a time
187 * (each request in the timeline waits for the completion fence of
188 * the previous before being submitted). By restricting ourselves to
189 * only submitting a single request, each request is placed on to a
190 * physical engine to maximise load spreading (by virtue of the late greedy
191 * scheduling -- each real engine takes the next available request
192 * upon idling).
193 */
194 struct i915_request *request;
195
196 /*
197 * We keep a rbtree of available virtual engines inside each physical
198 * engine, sorted by priority. Here we preallocate the nodes we need
199 * for the virtual engine, indexed by physical_engine->id.
200 */
201 struct ve_node {
202 struct rb_node rb;
203 int prio;
204 } nodes[I915_NUM_ENGINES];
205
206 /*
207 * Keep track of bonded pairs -- restrictions upon our selection
208 * of physical engines any particular request may be submitted to.
209 * If we receive a submit-fence from a master engine, we will only
210 * use one of sibling_mask physical engines.
211 */
212 struct ve_bond {
213 const struct intel_engine_cs *master;
214 intel_engine_mask_t sibling_mask;
215 } *bonds;
216 unsigned int num_bonds;
217
218 /* And finally, which physical engines this virtual engine maps onto. */
219 unsigned int num_siblings;
220 struct intel_engine_cs *siblings[0];
221};
222
223static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224{
225 GEM_BUG_ON(!intel_engine_is_virtual(engine));
226 return container_of(engine, struct virtual_engine, base);
227}
228
229static int __execlists_context_alloc(struct intel_context *ce,
230 struct intel_engine_cs *engine);
231
232static void execlists_init_reg_state(u32 *reg_state,
233 struct intel_context *ce,
234 struct intel_engine_cs *engine,
235 struct intel_ring *ring);
236
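/*
 * Cancel an incomplete request: set -EIO on its fence (unless it has
 * already been signaled) and then mark the request as complete.
 */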
237static void mark_eio(struct i915_request *rq)
238{
239 if (!i915_request_signaled(rq))
240 dma_fence_set_error(&rq->fence, -EIO);
241 i915_request_mark_complete(rq);
242}
243
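/*
 * GGTT address of the HWS_PREEMPT dword in this engine's status page,
 * i.e. the semaphore busywaited on by the fini breadcrumb while the
 * ring is paused (see ring_set_paused() below).
 */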
244static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
245{
246 return (i915_ggtt_offset(engine->status_page.vma) +
247 I915_GEM_HWS_PREEMPT_ADDR);
248}
249
250static inline void
251ring_set_paused(const struct intel_engine_cs *engine, int state)
252{
253 /*
254 * We inspect HWS_PREEMPT with a semaphore inside
255 * engine->emit_fini_breadcrumb. If the dword is true,
256 * the ring is paused as the semaphore will busywait
257 * until the dword is false.
258 */
259 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
260 if (state)
261 wmb();
262}
263
264static inline struct i915_priolist *to_priolist(struct rb_node *rb)
265{
266 return rb_entry(rb, struct i915_priolist, node);
267}
268
269static inline int rq_prio(const struct i915_request *rq)
270{
271 return rq->sched.attr.priority;
272}
273
274static int effective_prio(const struct i915_request *rq)
275{
276 int prio = rq_prio(rq);
277
278 /*
279 * If this request is special and must not be interrupted at any
280 * cost, so be it. Note we are only checking the most recent request
281 * in the context and so may be masking an earlier vip request. It
282 * is hoped that under the conditions where nopreempt is used, this
283 * will not matter (i.e. all requests to that context will be
284 * nopreempt for as long as desired).
285 */
286 if (i915_request_has_nopreempt(rq))
287 prio = I915_PRIORITY_UNPREEMPTABLE;
288
289 /*
290 * On unwinding the active request, we give it a priority bump
291 * if it has completed waiting on any semaphore. If we know that
292 * the request has already started, we can prevent an unwanted
293 * preempt-to-idle cycle by taking that into account now.
294 */
295 if (__i915_request_has_started(rq))
296 prio |= I915_PRIORITY_NOSEMAPHORE;
297
298 /* Restrict mere WAIT boosts from triggering preemption */
299 BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
300 return prio | __NO_PREEMPTION;
301}
302
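/*
 * Report the priority of the highest priority request still waiting in
 * the execlists queue (not yet submitted to the HW), or INT_MIN if the
 * queue is empty.
 */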
303static int queue_prio(const struct intel_engine_execlists *execlists)
304{
305 struct i915_priolist *p;
306 struct rb_node *rb;
307
308 rb = rb_first_cached(&execlists->queue);
309 if (!rb)
310 return INT_MIN;
311
312 /*
313 * As the priolist[] are inverted, with the highest priority in [0],
314 * we have to flip the index value to become priority.
315 */
316 p = to_priolist(rb);
317 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
318}
319
320static inline bool need_preempt(const struct intel_engine_cs *engine,
321 const struct i915_request *rq,
322 struct rb_node *rb)
323{
324 int last_prio;
325
326 if (!intel_engine_has_semaphores(engine))
327 return false;
328
329 /*
330 * Check if the current priority hint merits a preemption attempt.
331 *
332 * We record the highest value priority we saw during rescheduling
333 * prior to this dequeue, therefore we know that if it is strictly
334 * less than the current tail of ELSP[0], we do not need to force
335 * a preempt-to-idle cycle.
336 *
337 * However, the priority hint is a mere hint that we may need to
338 * preempt. If that hint is stale or we may be trying to preempt
339 * ourselves, ignore the request.
340 */
341 last_prio = effective_prio(rq);
342 if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
343 last_prio))
344 return false;
345
346 /*
347 * Check against the first request in ELSP[1], it will, thanks to the
348 * power of PI, be the highest priority of that context.
349 */
350 if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
351 rq_prio(list_next_entry(rq, sched.link)) > last_prio)
352 return true;
353
354 if (rb) {
355 struct virtual_engine *ve =
356 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
357 bool preempt = false;
358
359 if (engine == ve->siblings[0]) { /* only preempt one sibling */
360 struct i915_request *next;
361
362 rcu_read_lock();
363 next = READ_ONCE(ve->request);
364 if (next)
365 preempt = rq_prio(next) > last_prio;
366 rcu_read_unlock();
367 }
368
369 if (preempt)
370 return preempt;
371 }
372
373 /*
374 * If the inflight context did not trigger the preemption, then maybe
375 * it was the set of queued requests? Pick the highest priority in
376 * the queue (the first active priolist) and see if it deserves to be
377 * running instead of ELSP[0].
378 *
379 * The highest priority request in the queue cannot be either
380 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
381 * context, its priority would not exceed ELSP[0] aka last_prio.
382 */
383 return queue_prio(&engine->execlists) > last_prio;
384}
385
386__maybe_unused static inline bool
387assert_priority_queue(const struct i915_request *prev,
388 const struct i915_request *next)
389{
390 /*
391 * Without preemption, the prev may refer to the still active element
392 * which we refuse to let go.
393 *
394 * Even with preemption, there are times when we think it is better not
395 * to preempt and leave an ostensibly lower priority request in flight.
396 */
397 if (i915_request_is_active(prev))
398 return true;
399
400 return rq_prio(prev) >= rq_prio(next);
401}
402
403/*
404 * The context descriptor encodes various attributes of a context,
405 * including its GTT address and some flags. Because it's fairly
406 * expensive to calculate, we'll just do it once and cache the result,
407 * which remains valid until the context is unpinned.
408 *
409 * This is what a descriptor looks like, from LSB to MSB::
410 *
411 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
412 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
413 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
414 * bits 53-54: mbz, reserved for use by hardware
415 * bits 55-63: group ID, currently unused and set to 0
416 *
417 * Starting from Gen11, the upper dword of the descriptor has a new format:
418 *
419 * bits 32-36: reserved
420 * bits 37-47: SW context ID
421 * bits 48:53: engine instance
422 * bit 54: mbz, reserved for use by hardware
423 * bits 55-60: SW counter
424 * bits 61-63: engine class
425 *
426 * engine info, SW context ID and SW counter need to form a unique number
427 * (Context ID) per lrc.
428 */
429static u64
430lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
431{
432 struct i915_gem_context *ctx = ce->gem_context;
433 u64 desc;
434
435 BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
436 BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));
437
438 desc = INTEL_LEGACY_32B_CONTEXT;
439 if (i915_vm_is_4lvl(ce->vm))
440 desc = INTEL_LEGACY_64B_CONTEXT;
441 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
442
443 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
444 if (IS_GEN(engine->i915, 8))
445 desc |= GEN8_CTX_L3LLC_COHERENT;
446
447 desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
448 /* bits 12-31 */
449 /*
450 * The following 32bits are copied into the OA reports (dword 2).
451 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
452 * anything below.
453 */
454 if (INTEL_GEN(engine->i915) >= 11) {
455 GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
456 desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
457 /* bits 37-47 */
458
459 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
460 /* bits 48-53 */
461
462 /* TODO: decide what to do with SW counter (bits 55-60) */
463
464 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
465 /* bits 61-63 */
466 } else {
467 GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
468 desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT; /* bits 32-52 */
469 }
470
471 return desc;
472}
473
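/*
 * When unsubmitting a request, rewind its tail so that it no longer
 * includes the WaIdleLiteRestore padding placed after the breadcrumb
 * by gen8_emit_fini_breadcrumb().
 */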
474static void unwind_wa_tail(struct i915_request *rq)
475{
476 rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
477 assert_ring_tail_valid(rq->ring, rq->tail);
478}
479
480static struct i915_request *
481__unwind_incomplete_requests(struct intel_engine_cs *engine)
482{
483 struct i915_request *rq, *rn, *active = NULL;
484 struct list_head *uninitialized_var(pl);
485 int prio = I915_PRIORITY_INVALID;
486
487 lockdep_assert_held(&engine->active.lock);
488
489 list_for_each_entry_safe_reverse(rq, rn,
490 &engine->active.requests,
491 sched.link) {
492 struct intel_engine_cs *owner;
493
494 if (i915_request_completed(rq))
495 continue; /* XXX */
496
497 __i915_request_unsubmit(rq);
498 unwind_wa_tail(rq);
499
500 /*
501 * Push the request back into the queue for later resubmission.
502 * If this request is not native to this physical engine (i.e.
503 * it came from a virtual source), push it back onto the virtual
504 * engine so that it can be moved across onto another physical
505 * engine as load dictates.
506 */
507 owner = rq->hw_context->engine;
508 if (likely(owner == engine)) {
509 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
510 if (rq_prio(rq) != prio) {
511 prio = rq_prio(rq);
512 pl = i915_sched_lookup_priolist(engine, prio);
513 }
514 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
515
516 list_move(&rq->sched.link, pl);
517 active = rq;
518 } else {
519 /*
520 * Decouple the virtual breadcrumb before moving it
521 * back to the virtual engine -- we don't want the
522 * request to complete in the background and try
523 * and cancel the breadcrumb on the virtual engine
524 * (instead of the old engine where it is linked)!
525 */
526 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
527 &rq->fence.flags)) {
528 spin_lock(&rq->lock);
529 i915_request_cancel_breadcrumb(rq);
530 spin_unlock(&rq->lock);
531 }
532 rq->engine = owner;
533 owner->submit_request(rq);
534 active = NULL;
535 }
536 }
537
538 return active;
539}
540
541struct i915_request *
542execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
543{
544 struct intel_engine_cs *engine =
545 container_of(execlists, typeof(*engine), execlists);
546
547 return __unwind_incomplete_requests(engine);
548}
549
550static inline void
551execlists_context_status_change(struct i915_request *rq, unsigned long status)
552{
553 /*
554 * Only used when GVT-g is enabled now. When GVT-g is disabled,
555 * the compiler should eliminate this function as dead code.
556 */
557 if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
558 return;
559
560 atomic_notifier_call_chain(&rq->engine->context_status_notifier,
561 status, rq);
562}
563
564static inline struct intel_engine_cs *
565__execlists_schedule_in(struct i915_request *rq)
566{
567 struct intel_engine_cs * const engine = rq->engine;
568 struct intel_context * const ce = rq->hw_context;
569
570 intel_context_get(ce);
571
572 intel_gt_pm_get(engine->gt);
573 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
574 intel_engine_context_in(engine);
575
576 return engine;
577}
578
579static inline struct i915_request *
580execlists_schedule_in(struct i915_request *rq, int idx)
581{
582 struct intel_context * const ce = rq->hw_context;
583 struct intel_engine_cs *old;
584
585 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
586 trace_i915_request_in(rq, idx);
587
588 old = READ_ONCE(ce->inflight);
589 do {
590 if (!old) {
591 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
592 break;
593 }
594 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
595
596 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
597 return i915_request_get(rq);
598}
599
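/*
 * If the virtual engine has another request waiting that may run on a
 * different physical engine, kick the virtual tasklet so that request
 * can be resubmitted promptly to a suitable sibling.
 */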
600static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
601{
602 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
603 struct i915_request *next = READ_ONCE(ve->request);
604
605 if (next && next->execution_mask & ~rq->execution_mask)
606 tasklet_schedule(&ve->base.execlists.tasklet);
607}
608
609static inline void
610__execlists_schedule_out(struct i915_request *rq,
611 struct intel_engine_cs * const engine)
612{
613 struct intel_context * const ce = rq->hw_context;
614
615 intel_engine_context_out(engine);
616 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
617 intel_gt_pm_put(engine->gt);
618
619 /*
620 * If this is part of a virtual engine, its next request may
621 * have been blocked waiting for access to the active context.
622 * We have to kick all the siblings again in case we need to
623 * switch (e.g. the next request is not runnable on this
624 * engine). Hopefully, we will already have submitted the next
625 * request before the tasklet runs and do not need to rebuild
626 * each virtual tree and kick everyone again.
627 */
628 if (ce->engine != engine)
629 kick_siblings(rq, ce);
630
631 intel_context_put(ce);
632}
633
634static inline void
635execlists_schedule_out(struct i915_request *rq)
636{
637 struct intel_context * const ce = rq->hw_context;
638 struct intel_engine_cs *cur, *old;
639
640 trace_i915_request_out(rq);
641
642 old = READ_ONCE(ce->inflight);
643 do
644 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
645 while (!try_cmpxchg(&ce->inflight, &old, cur));
646 if (!cur)
647 __execlists_schedule_out(rq, old);
648
649 i915_request_put(rq);
650}
651
652static u64 execlists_update_context(const struct i915_request *rq)
653{
654 struct intel_context *ce = rq->hw_context;
655 u64 desc;
656
657 ce->lrc_reg_state[CTX_RING_TAIL + 1] =
658 intel_ring_set_tail(rq->ring, rq->tail);
659
660 /*
661 * Make sure the context image is complete before we submit it to HW.
662 *
663 * Ostensibly, writes (including the WCB) should be flushed prior to
664 * an uncached write such as our mmio register access, but the empirical
665 * evidence (esp. on Braswell) suggests that the WC write into memory
666 * may not be visible to the HW prior to the completion of the UC
667 * register write and that we may begin execution from the context
668 * before its image is complete leading to invalid PD chasing.
669 *
670 * Furthermore, Braswell, at least, wants a full mb to be sure that
671 * the writes are coherent in memory (visible to the GPU) prior to
672 * execution, and not just visible to other CPUs (as is the result of
673 * wmb).
674 */
675 mb();
676
677 desc = ce->lrc_desc;
678 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
679
680 return desc;
681}
682
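/*
 * Write one context descriptor to a submission port: via the ELSQ
 * submit queue when a control register is present, otherwise directly
 * to the ELSP register, upper dword first.
 */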
683static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
684{
685 if (execlists->ctrl_reg) {
686 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
687 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
688 } else {
689 writel(upper_32_bits(desc), execlists->submit_reg);
690 writel(lower_32_bits(desc), execlists->submit_reg);
691 }
692}
693
694static __maybe_unused void
695trace_ports(const struct intel_engine_execlists *execlists,
696 const char *msg,
697 struct i915_request * const *ports)
698{
699 const struct intel_engine_cs *engine =
700 container_of(execlists, typeof(*engine), execlists);
701
702 GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
703 engine->name, msg,
704 ports[0]->fence.context,
705 ports[0]->fence.seqno,
706 i915_request_completed(ports[0]) ? "!" :
707 i915_request_started(ports[0]) ? "*" :
708 "",
709 ports[1] ? ports[1]->fence.context : 0,
710 ports[1] ? ports[1]->fence.seqno : 0);
711}
712
713static __maybe_unused bool
714assert_pending_valid(const struct intel_engine_execlists *execlists,
715 const char *msg)
716{
717 struct i915_request * const *port, *rq;
718 struct intel_context *ce = NULL;
719
720 trace_ports(execlists, msg, execlists->pending);
721
722 if (!execlists->pending[0])
723 return false;
724
725 if (execlists->pending[execlists_num_ports(execlists)])
726 return false;
727
728 for (port = execlists->pending; (rq = *port); port++) {
729 if (ce == rq->hw_context)
730 return false;
731
732 ce = rq->hw_context;
733 if (i915_request_completed(rq))
734 continue;
735
736 if (i915_active_is_idle(&ce->active))
737 return false;
738
739 if (!i915_vma_is_pinned(ce->state))
740 return false;
741 }
742
743 return ce;
744}
745
746static void execlists_submit_ports(struct intel_engine_cs *engine)
747{
748 struct intel_engine_execlists *execlists = &engine->execlists;
749 unsigned int n;
750
751 GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
752
753 /*
754 * We can skip acquiring intel_runtime_pm_get() here as it was taken
755 * on our behalf by the request (see i915_gem_mark_busy()) and it will
756 * not be relinquished until the device is idle (see
757 * i915_gem_idle_work_handler()). As a precaution, we make sure
758 * that all ELSP are drained i.e. we have processed the CSB,
759 * before allowing ourselves to idle and calling intel_runtime_pm_put().
760 */
761 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
762
763 /*
764 * ELSQ note: the submit queue is not cleared after being submitted
765 * to the HW so we need to make sure we always clean it up. This is
766 * currently ensured by the fact that we always write the same number
767 * of elsq entries, keep this in mind before changing the loop below.
768 */
769 for (n = execlists_num_ports(execlists); n--; ) {
770 struct i915_request *rq = execlists->pending[n];
771
772 write_desc(execlists,
773 rq ? execlists_update_context(rq) : 0,
774 n);
775 }
776
777 /* we need to manually load the submit queue */
778 if (execlists->ctrl_reg)
779 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
780}
781
782static bool ctx_single_port_submission(const struct intel_context *ce)
783{
784 return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
785 i915_gem_context_force_single_submission(ce->gem_context));
786}
787
788static bool can_merge_ctx(const struct intel_context *prev,
789 const struct intel_context *next)
790{
791 if (prev != next)
792 return false;
793
794 if (ctx_single_port_submission(prev))
795 return false;
796
797 return true;
798}
799
800static bool can_merge_rq(const struct i915_request *prev,
801 const struct i915_request *next)
802{
803 GEM_BUG_ON(prev == next);
804 GEM_BUG_ON(!assert_priority_queue(prev, next));
805
806 /*
807 * We do not submit known completed requests. Therefore if the next
808 * request is already completed, we can pretend to merge it in
809 * with the previous context (and we will skip updating the ELSP
810 * and tracking). Thus hopefully keeping the ELSP full with active
811 * contexts, despite the best efforts of preempt-to-busy to confuse
812 * us.
813 */
814 if (i915_request_completed(next))
815 return true;
816
817 if (!can_merge_ctx(prev->hw_context, next->hw_context))
818 return false;
819
820 return true;
821}
822
823static void virtual_update_register_offsets(u32 *regs,
824 struct intel_engine_cs *engine)
825{
826 u32 base = engine->mmio_base;
827
828 /* Must match execlists_init_reg_state()! */
829
830 regs[CTX_CONTEXT_CONTROL] =
831 i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base));
832 regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
833 regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
834 regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
835 regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
836
837 regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
838 regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
839 regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
840 regs[CTX_SECOND_BB_HEAD_U] =
841 i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
842 regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
843 regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
844
845 regs[CTX_CTX_TIMESTAMP] =
846 i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
847 regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3));
848 regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3));
849 regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2));
850 regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2));
851 regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1));
852 regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1));
853 regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
854 regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0));
855
856 if (engine->class == RENDER_CLASS) {
857 regs[CTX_RCS_INDIRECT_CTX] =
858 i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
859 regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
860 i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
861 regs[CTX_BB_PER_CTX_PTR] =
862 i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
863
864 regs[CTX_R_PWR_CLK_STATE] =
865 i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
866 }
867}
868
869static bool virtual_matches(const struct virtual_engine *ve,
870 const struct i915_request *rq,
871 const struct intel_engine_cs *engine)
872{
873 const struct intel_engine_cs *inflight;
874
875 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
876 return false;
877
878 /*
879 * We track when the HW has completed saving the context image
880 * (i.e. when we have seen the final CS event switching out of
881 * the context) and must not overwrite the context image before
882 * then. This restricts us to only using the active engine
883 * while the previous virtualized request is inflight (so
884 * we reuse the register offsets). This is a very small
885 * hysteresis on the greedy selection algorithm.
886 */
887 inflight = intel_context_inflight(&ve->context);
888 if (inflight && inflight != engine)
889 return false;
890
891 return true;
892}
893
894static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
895 struct intel_engine_cs *engine)
896{
897 struct intel_engine_cs *old = ve->siblings[0];
898
899 /* All unattached (rq->engine == old) must already be completed */
900
901 spin_lock(&old->breadcrumbs.irq_lock);
902 if (!list_empty(&ve->context.signal_link)) {
903 list_move_tail(&ve->context.signal_link,
904 &engine->breadcrumbs.signalers);
905 intel_engine_queue_breadcrumbs(engine);
906 }
907 spin_unlock(&old->breadcrumbs.irq_lock);
908}
909
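/*
 * Peek at the oldest request still executing on the HW, skipping over
 * any entries in the active[] ports that have already completed.
 */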
910static struct i915_request *
911last_active(const struct intel_engine_execlists *execlists)
912{
913 struct i915_request * const *last = READ_ONCE(execlists->active);
914
915 while (*last && i915_request_completed(*last))
916 last++;
917
918 return *last;
919}
920
921static void defer_request(struct i915_request *rq, struct list_head * const pl)
922{
923 LIST_HEAD(list);
924
925 /*
926 * We want to move the interrupted request to the back of
927 * the round-robin list (i.e. its priority level), but
928 * in doing so, we must then move all requests that were in
929 * flight and were waiting for the interrupted request to
930 * be run after it again.
931 */
932 do {
933 struct i915_dependency *p;
934
935 GEM_BUG_ON(i915_request_is_active(rq));
936 list_move_tail(&rq->sched.link, pl);
937
938 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
939 struct i915_request *w =
940 container_of(p->waiter, typeof(*w), sched);
941
942 /* Leave semaphores spinning on the other engines */
943 if (w->engine != rq->engine)
944 continue;
945
946 /* No waiter should start before its signaler */
947 GEM_BUG_ON(i915_request_started(w) &&
948 !i915_request_completed(rq));
949
950 GEM_BUG_ON(i915_request_is_active(w));
951 if (list_empty(&w->sched.link))
952 continue; /* Not yet submitted; unready */
953
954 if (rq_prio(w) < rq_prio(rq))
955 continue;
956
957 GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
958 list_move_tail(&w->sched.link, &list);
959 }
960
961 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
962 } while (rq);
963}
964
965static void defer_active(struct intel_engine_cs *engine)
966{
967 struct i915_request *rq;
968
969 rq = __unwind_incomplete_requests(engine);
970 if (!rq)
971 return;
972
973 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
974}
975
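/*
 * Decide whether to arm the timeslicing timer: only on engines with
 * semaphore support, and only when another request (already submitted
 * behind @rq, or waiting in the queue) has priority at least equal to
 * the effective priority of @rq.
 */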
976static bool
977need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
978{
979 int hint;
980
981 if (!intel_engine_has_semaphores(engine))
982 return false;
983
984 if (list_is_last(&rq->sched.link, &engine->active.requests))
985 return false;
986
987 hint = max(rq_prio(list_next_entry(rq, sched.link)),
988 engine->execlists.queue_priority_hint);
989
990 return hint >= effective_prio(rq);
991}
992
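/*
 * Priority of the request that would run after @rq on this engine, or
 * INT_MIN if @rq is the last request in the active list.
 */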
993static int
994switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
995{
996 if (list_is_last(&rq->sched.link, &engine->active.requests))
997 return INT_MIN;
998
999 return rq_prio(list_next_entry(rq, sched.link));
1000}
1001
1002static bool
1003enable_timeslice(const struct intel_engine_execlists *execlists)
1004{
1005 const struct i915_request *rq = *execlists->active;
1006
1007 if (i915_request_completed(rq))
1008 return false;
1009
1010 return execlists->switch_priority_hint >= effective_prio(rq);
1011}
1012
1013static void record_preemption(struct intel_engine_execlists *execlists)
1014{
1015 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1016}
1017
1018static void execlists_dequeue(struct intel_engine_cs *engine)
1019{
1020 struct intel_engine_execlists * const execlists = &engine->execlists;
1021 struct i915_request **port = execlists->pending;
1022 struct i915_request ** const last_port = port + execlists->port_mask;
1023 struct i915_request *last;
1024 struct rb_node *rb;
1025 bool submit = false;
1026
1027 /*
1028 * Hardware submission is through 2 ports. Conceptually each port
1029 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1030 * static for a context, and unique to each, so we only execute
1031 * requests belonging to a single context from each ring. RING_HEAD
1032 * is maintained by the CS in the context image, it marks the place
1033 * where it got up to last time, and through RING_TAIL we tell the CS
1034 * where we want to execute up to this time.
1035 *
1036 * In this list the requests are in order of execution. Consecutive
1037 * requests from the same context are adjacent in the ringbuffer. We
1038 * can combine these requests into a single RING_TAIL update:
1039 *
1040 * RING_HEAD...req1...req2
1041 * ^- RING_TAIL
1042 * since to execute req2 the CS must first execute req1.
1043 *
1044 * Our goal then is to point each port to the end of a consecutive
1045 * sequence of requests as being the most optimal (fewest wake ups
1046 * and context switches) submission.
1047 */
1048
1049 for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1050 struct virtual_engine *ve =
1051 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1052 struct i915_request *rq = READ_ONCE(ve->request);
1053
1054 if (!rq) { /* lazily cleanup after another engine handled rq */
1055 rb_erase_cached(rb, &execlists->virtual);
1056 RB_CLEAR_NODE(rb);
1057 rb = rb_first_cached(&execlists->virtual);
1058 continue;
1059 }
1060
1061 if (!virtual_matches(ve, rq, engine)) {
1062 rb = rb_next(rb);
1063 continue;
1064 }
1065
1066 break;
1067 }
1068
1069 /*
1070 * If the queue is higher priority than the last
1071 * request in the currently active context, submit afresh.
1072 * We will resubmit again afterwards in case we need to split
1073 * the active context to interject the preemption request,
1074 * i.e. we will retrigger preemption following the ack in case
1075 * of trouble.
1076 */
1077 last = last_active(execlists);
1078 if (last) {
1079 if (need_preempt(engine, last, rb)) {
1080 GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
1081 engine->name,
1082 last->fence.context,
1083 last->fence.seqno,
1084 last->sched.attr.priority,
1085 execlists->queue_priority_hint);
1086 record_preemption(execlists);
1087
1088 /*
1089 * Don't let the RING_HEAD advance past the breadcrumb
1090 * as we unwind (and until we resubmit) so that we do
1091 * not accidentally tell it to go backwards.
1092 */
1093 ring_set_paused(engine, 1);
1094
1095 /*
1096 * Note that we have not stopped the GPU at this point,
1097 * so we are unwinding the incomplete requests as they
1098 * remain inflight and so by the time we do complete
1099 * the preemption, some of the unwound requests may
1100 * complete!
1101 */
1102 __unwind_incomplete_requests(engine);
1103
1104 /*
1105 * If we need to return to the preempted context, we
1106 * need to skip the lite-restore and force it to
1107 * reload the RING_TAIL. Otherwise, the HW has a
1108 * tendency to ignore us rewinding the TAIL to the
1109 * end of an earlier request.
1110 */
1111 last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1112 last = NULL;
1113 } else if (need_timeslice(engine, last) &&
1114 !timer_pending(&engine->execlists.timer)) {
1115 GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
1116 engine->name,
1117 last->fence.context,
1118 last->fence.seqno,
1119 last->sched.attr.priority,
1120 execlists->queue_priority_hint);
1121
1122 ring_set_paused(engine, 1);
1123 defer_active(engine);
1124
1125 /*
1126 * Unlike for preemption, if we rewind and continue
1127 * executing the same context as previously active,
1128 * the order of execution will remain the same and
1129 * the tail will only advance. We do not need to
1130 * force a full context restore, as a lite-restore
1131 * is sufficient to resample the monotonic TAIL.
1132 *
1133 * If we switch to any other context, similarly we
1134 * will not rewind TAIL of current context, and
1135 * normal save/restore will preserve state and allow
1136 * us to later continue executing the same request.
1137 */
1138 last = NULL;
1139 } else {
1140 /*
1141 * Otherwise if we already have a request pending
1142 * for execution after the current one, we can
1143 * just wait until the next CS event before
1144 * queuing more. In either case we will force a
1145 * lite-restore preemption event, but if we wait
1146 * we hopefully coalesce several updates into a single
1147 * submission.
1148 */
1149 if (!list_is_last(&last->sched.link,
1150 &engine->active.requests))
1151 return;
1152
1153 /*
1154 * WaIdleLiteRestore:bdw,skl
1155 * Apply the wa NOOPs to prevent
1156 * ring:HEAD == rq:TAIL as we resubmit the
1157 * request. See gen8_emit_fini_breadcrumb() for
1158 * where we prepare the padding after the
1159 * end of the request.
1160 */
1161 last->tail = last->wa_tail;
1162 }
1163 }
1164
1165 while (rb) { /* XXX virtual is always taking precedence */
1166 struct virtual_engine *ve =
1167 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1168 struct i915_request *rq;
1169
1170 spin_lock(&ve->base.active.lock);
1171
1172 rq = ve->request;
1173 if (unlikely(!rq)) { /* lost the race to a sibling */
1174 spin_unlock(&ve->base.active.lock);
1175 rb_erase_cached(rb, &execlists->virtual);
1176 RB_CLEAR_NODE(rb);
1177 rb = rb_first_cached(&execlists->virtual);
1178 continue;
1179 }
1180
1181 GEM_BUG_ON(rq != ve->request);
1182 GEM_BUG_ON(rq->engine != &ve->base);
1183 GEM_BUG_ON(rq->hw_context != &ve->context);
1184
1185 if (rq_prio(rq) >= queue_prio(execlists)) {
1186 if (!virtual_matches(ve, rq, engine)) {
1187 spin_unlock(&ve->base.active.lock);
1188 rb = rb_next(rb);
1189 continue;
1190 }
1191
1192 if (last && !can_merge_rq(last, rq)) {
1193 spin_unlock(&ve->base.active.lock);
1194 return; /* leave this for another */
1195 }
1196
1197 GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
1198 engine->name,
1199 rq->fence.context,
1200 rq->fence.seqno,
1201 i915_request_completed(rq) ? "!" :
1202 i915_request_started(rq) ? "*" :
1203 "",
1204 yesno(engine != ve->siblings[0]));
1205
1206 ve->request = NULL;
1207 ve->base.execlists.queue_priority_hint = INT_MIN;
1208 rb_erase_cached(rb, &execlists->virtual);
1209 RB_CLEAR_NODE(rb);
1210
1211 GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1212 rq->engine = engine;
1213
1214 if (engine != ve->siblings[0]) {
1215 u32 *regs = ve->context.lrc_reg_state;
1216 unsigned int n;
1217
1218 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1219 virtual_update_register_offsets(regs, engine);
1220
1221 if (!list_empty(&ve->context.signals))
1222 virtual_xfer_breadcrumbs(ve, engine);
1223
1224 /*
1225 * Move the bound engine to the top of the list
1226 * for future execution. We then kick this
1227 * tasklet first before checking others, so that
1228 * we preferentially reuse this set of bound
1229 * registers.
1230 */
1231 for (n = 1; n < ve->num_siblings; n++) {
1232 if (ve->siblings[n] == engine) {
1233 swap(ve->siblings[n],
1234 ve->siblings[0]);
1235 break;
1236 }
1237 }
1238
1239 GEM_BUG_ON(ve->siblings[0] != engine);
1240 }
1241
1242 if (__i915_request_submit(rq)) {
1243 submit = true;
1244 last = rq;
1245 }
1246 i915_request_put(rq);
1247
1248 /*
1249 * Hmm, we have a bunch of virtual engine requests,
1250 * but the first one was already completed (thanks
1251 * preempt-to-busy!). Keep looking at the veng queue
1252 * until we have no more relevant requests (i.e.
1253 * the normal submit queue has higher priority).
1254 */
1255 if (!submit) {
1256 spin_unlock(&ve->base.active.lock);
1257 rb = rb_first_cached(&execlists->virtual);
1258 continue;
1259 }
1260 }
1261
1262 spin_unlock(&ve->base.active.lock);
1263 break;
1264 }
1265
1266 while ((rb = rb_first_cached(&execlists->queue))) {
1267 struct i915_priolist *p = to_priolist(rb);
1268 struct i915_request *rq, *rn;
1269 int i;
1270
1271 priolist_for_each_request_consume(rq, rn, p, i) {
1272 bool merge = true;
1273
1274 /*
1275 * Can we combine this request with the current port?
1276 * It has to be the same context/ringbuffer and not
1277 * have any exceptions (e.g. GVT saying never to
1278 * combine contexts).
1279 *
1280 * If we can combine the requests, we can execute both
1281 * by updating the RING_TAIL to point to the end of the
1282 * second request, and so we never need to tell the
1283 * hardware about the first.
1284 */
1285 if (last && !can_merge_rq(last, rq)) {
1286 /*
1287 * If we are on the second port and cannot
1288 * combine this request with the last, then we
1289 * are done.
1290 */
1291 if (port == last_port)
1292 goto done;
1293
1294 /*
1295 * We must not populate both ELSP[] with the
1296 * same LRCA, i.e. we must submit 2 different
1297 * contexts if we submit 2 ELSP.
1298 */
1299 if (last->hw_context == rq->hw_context)
1300 goto done;
1301
1302 /*
1303 * If GVT overrides us we only ever submit
1304 * port[0], leaving port[1] empty. Note that we
1305 * also have to be careful that we don't queue
1306 * the same context (even though a different
1307 * request) to the second port.
1308 */
1309 if (ctx_single_port_submission(last->hw_context) ||
1310 ctx_single_port_submission(rq->hw_context))
1311 goto done;
1312
1313 merge = false;
1314 }
1315
1316 if (__i915_request_submit(rq)) {
1317 if (!merge) {
1318 *port = execlists_schedule_in(last, port - execlists->pending);
1319 port++;
1320 last = NULL;
1321 }
1322
1323 GEM_BUG_ON(last &&
1324 !can_merge_ctx(last->hw_context,
1325 rq->hw_context));
1326
1327 submit = true;
1328 last = rq;
1329 }
1330 }
1331
1332 rb_erase_cached(&p->node, &execlists->queue);
1333 i915_priolist_free(p);
1334 }
1335
1336done:
1337 /*
1338 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1339 *
1340 * We choose the priority hint such that if we add a request of greater
1341 * priority than this, we kick the submission tasklet to decide on
1342 * the right order of submitting the requests to hardware. We must
1343 * also be prepared to reorder requests as they are in-flight on the
1344 * HW. We derive the priority hint then as the first "hole" in
1345 * the HW submission ports and if there are no available slots,
1346 * the priority of the lowest executing request, i.e. last.
1347 *
1348 * When we do receive a higher priority request ready to run from the
1349 * user, see queue_request(), the priority hint is bumped to that
1350 * request triggering preemption on the next dequeue (or subsequent
1351 * interrupt for secondary ports).
1352 */
1353 execlists->queue_priority_hint = queue_prio(execlists);
1354 GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
1355 engine->name, execlists->queue_priority_hint,
1356 yesno(submit));
1357
1358 if (submit) {
1359 *port = execlists_schedule_in(last, port - execlists->pending);
1360 memset(port + 1, 0, (last_port - port) * sizeof(*port));
1361 execlists->switch_priority_hint =
1362 switch_prio(engine, *execlists->pending);
1363 execlists_submit_ports(engine);
1364 } else {
1365 ring_set_paused(engine, 0);
1366 }
1367}
1368
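/*
 * Schedule out every request still held in the pending[] and
 * inflight[] ports and clear both arrays, leaving execlists->active
 * pointing at an empty inflight[].
 */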
1369static void
1370cancel_port_requests(struct intel_engine_execlists * const execlists)
1371{
1372 struct i915_request * const *port, *rq;
1373
1374 for (port = execlists->pending; (rq = *port); port++)
1375 execlists_schedule_out(rq);
1376 memset(execlists->pending, 0, sizeof(execlists->pending));
1377
1378 for (port = execlists->active; (rq = *port); port++)
1379 execlists_schedule_out(rq);
1380 execlists->active =
1381 memset(execlists->inflight, 0, sizeof(execlists->inflight));
1382}
1383
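/*
 * Flush the cachelines containing the first and last CSB entries so
 * that the next read pulls fresh values written by the HW (see the
 * Gen11 note at the end of process_csb()).
 */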
1384static inline void
1385invalidate_csb_entries(const u32 *first, const u32 *last)
1386{
1387 clflush((void *)first);
1388 clflush((void *)last);
1389}
1390
1391static inline bool
1392reset_in_progress(const struct intel_engine_execlists *execlists)
1393{
1394 return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1395}
1396
1397enum csb_step {
1398 CSB_NOP,
1399 CSB_PROMOTE,
1400 CSB_PREEMPT,
1401 CSB_COMPLETE,
1402};
1403
1404/*
1405 * Starting with Gen12, the status has a new format:
1406 *
1407 * bit 0: switched to new queue
1408 * bit 1: reserved
1409 * bit 2: semaphore wait mode (poll or signal), only valid when
1410 * switch detail is set to "wait on semaphore"
1411 * bits 3-5: engine class
1412 * bits 6-11: engine instance
1413 * bits 12-14: reserved
1414 * bits 15-25: sw context id of the lrc the GT switched to
1415 * bits 26-31: sw counter of the lrc the GT switched to
1416 * bits 32-35: context switch detail
1417 * - 0: ctx complete
1418 * - 1: wait on sync flip
1419 * - 2: wait on vblank
1420 * - 3: wait on scanline
1421 * - 4: wait on semaphore
1422 * - 5: context preempted (not on SEMAPHORE_WAIT or
1423 * WAIT_FOR_EVENT)
1424 * bit 36: reserved
1425 * bits 37-43: wait detail (for switch detail 1 to 4)
1426 * bits 44-46: reserved
1427 * bits 47-57: sw context id of the lrc the GT switched away from
1428 * bits 58-63: sw counter of the lrc the GT switched away from
1429 */
1430static inline enum csb_step
1431gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1432{
1433 u32 lower_dw = csb[0];
1434 u32 upper_dw = csb[1];
1435 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
1436 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
1437 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
1438
1439 if (!ctx_away_valid && ctx_to_valid)
1440 return CSB_PROMOTE;
1441
1442 /*
1443 * The context switch detail is not guaranteed to be 5 when a preemption
1444 * occurs, so we can't just check for that. The check below works for
1445 * all the cases we care about, including preemptions of WAIT
1446 * instructions and lite-restore. Preempt-to-idle via the CTRL register
1447 * would require some extra handling, but we don't support that.
1448 */
1449 if (new_queue && ctx_away_valid)
1450 return CSB_PREEMPT;
1451
1452 /*
1453 * switch detail = 5 is covered by the case above and we do not expect a
1454 * context switch on an unsuccessful wait instruction since we always
1455 * use polling mode.
1456 */
1457 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
1458
1459 if (*execlists->active) {
1460 GEM_BUG_ON(!ctx_away_valid);
1461 return CSB_COMPLETE;
1462 }
1463
1464 return CSB_NOP;
1465}
1466
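/*
 * Prior to Gen12, CSB events are decoded from the GEN8_CTX_STATUS_*
 * bits in the first dword: an idle->active event promotes the
 * pending[] contexts onto the HW, a preempted event switches out the
 * inflight contexts, and any other event received while a context is
 * still active is treated as completion of the oldest inflight context.
 */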
1467static inline enum csb_step
1468gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1469{
1470 unsigned int status = *csb;
1471
1472 if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
1473 return CSB_PROMOTE;
1474
1475 if (status & GEN8_CTX_STATUS_PREEMPTED)
1476 return CSB_PREEMPT;
1477
1478 if (*execlists->active)
1479 return CSB_COMPLETE;
1480
1481 return CSB_NOP;
1482}
1483
1484static void process_csb(struct intel_engine_cs *engine)
1485{
1486 struct intel_engine_execlists * const execlists = &engine->execlists;
1487 const u32 * const buf = execlists->csb_status;
1488 const u8 num_entries = execlists->csb_size;
1489 u8 head, tail;
1490
1491 GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915));
1492
1493 /*
1494 * Note that csb_write, csb_status may be either in HWSP or mmio.
1495 * When reading from the csb_write mmio register, we have to be
1496 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
1497 * the low 4 bits. As it happens we know the next 4 bits are always
1498 * zero and so we can simply mask off the low u8 of the register
1499 * and treat it identically to reading from the HWSP (without having
1500 * to use explicit shifting and masking, and probably bifurcating
1501 * the code to handle the legacy mmio read).
1502 */
1503 head = execlists->csb_head;
1504 tail = READ_ONCE(*execlists->csb_write);
1505 GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
1506 if (unlikely(head == tail))
1507 return;
1508
1509 /*
1510 * Hopefully paired with a wmb() in HW!
1511 *
1512 * We must complete the read of the write pointer before any reads
1513 * from the CSB, so that we do not see stale values. Without an rmb
1514 * (lfence) the HW may speculatively perform the CSB[] reads *before*
1515 * we perform the READ_ONCE(*csb_write).
1516 */
1517 rmb();
1518
1519 do {
1520 enum csb_step csb_step;
1521
1522 if (++head == num_entries)
1523 head = 0;
1524
1525 /*
1526 * We are flying near dragons again.
1527 *
1528 * We hold a reference to the request in execlist_port[]
1529 * but no more than that. We are operating in softirq
1530 * context and so cannot hold any mutex or sleep. That
1531 * prevents us stopping the requests we are processing
1532 * in port[] from being retired simultaneously (the
1533 * breadcrumb will be complete before we see the
1534 * context-switch). As we only hold the reference to the
1535 * request, any pointer chasing underneath the request
1536 * is subject to a potential use-after-free. Thus we
1537 * store all of the bookkeeping within port[] as
1538 * required, and avoid using unguarded pointers beneath
1539 * request itself. The same applies to the atomic
1540 * status notifier.
1541 */
1542
1543 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
1544 engine->name, head,
1545 buf[2 * head + 0], buf[2 * head + 1]);
1546
1547 if (INTEL_GEN(engine->i915) >= 12)
1548 csb_step = gen12_csb_parse(execlists, buf + 2 * head);
1549 else
1550 csb_step = gen8_csb_parse(execlists, buf + 2 * head);
1551
1552 switch (csb_step) {
1553 case CSB_PREEMPT: /* cancel old inflight, prepare for switch */
1554 trace_ports(execlists, "preempted", execlists->active);
1555
1556 while (*execlists->active)
1557 execlists_schedule_out(*execlists->active++);
1558
1559 /* fallthrough */
1560 case CSB_PROMOTE: /* switch pending to inflight */
1561 GEM_BUG_ON(*execlists->active);
1562 GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
1563 execlists->active =
1564 memcpy(execlists->inflight,
1565 execlists->pending,
1566 execlists_num_ports(execlists) *
1567 sizeof(*execlists->pending));
1568
1569 if (enable_timeslice(execlists))
1570 mod_timer(&execlists->timer, jiffies + 1);
1571
1572 if (!inject_preempt_hang(execlists))
1573 ring_set_paused(engine, 0);
1574
1575 WRITE_ONCE(execlists->pending[0], NULL);
1576 break;
1577
1578 case CSB_COMPLETE: /* port0 completed, advanced to port1 */
1579 trace_ports(execlists, "completed", execlists->active);
1580
1581 /*
1582 * We rely on the hardware being strongly
1583 * ordered, that the breadcrumb write is
1584 * coherent (visible from the CPU) before the
1585 * user interrupt and CSB is processed.
1586 */
1587 GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
1588 !reset_in_progress(execlists));
1589 execlists_schedule_out(*execlists->active++);
1590
1591 GEM_BUG_ON(execlists->active - execlists->inflight >
1592 execlists_num_ports(execlists));
1593 break;
1594
1595 case CSB_NOP:
1596 break;
1597 }
1598 } while (head != tail);
1599
1600 execlists->csb_head = head;
1601
1602 /*
1603 * Gen11 has proven to fail wrt global observation point between
1604 * entry and tail update, failing on the ordering and thus
1605 * we see an old entry in the context status buffer.
1606 *
1607 * Forcibly evict the stale entries ahead of the next gpu csb update,
1608 * to increase the odds that we get fresh entries even with
1609 * non-working hardware. The cost of doing so comes out mostly in
1610 * the wash as hardware, working or not, will need to do the
1611 * invalidation before.
1612 */
1613 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
1614}
1615
1616static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
1617{
1618 lockdep_assert_held(&engine->active.lock);
1619 if (!engine->execlists.pending[0]) {
1620 rcu_read_lock(); /* protect peeking at execlists->active */
1621 execlists_dequeue(engine);
1622 rcu_read_unlock();
1623 }
1624}
1625
1626/*
1627 * Check the unread Context Status Buffers and manage the submission of new
1628 * contexts to the ELSP accordingly.
1629 */
1630static void execlists_submission_tasklet(unsigned long data)
1631{
1632 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
1633 unsigned long flags;
1634
1635 process_csb(engine);
1636 if (!READ_ONCE(engine->execlists.pending[0])) {
1637 spin_lock_irqsave(&engine->active.lock, flags);
1638 __execlists_submission_tasklet(engine);
1639 spin_unlock_irqrestore(&engine->active.lock, flags);
1640 }
1641}
1642
1643static void execlists_submission_timer(struct timer_list *timer)
1644{
1645 struct intel_engine_cs *engine =
1646 from_timer(engine, timer, execlists.timer);
1647
1648 /* Kick the tasklet for some interrupt coalescing and reset handling */
1649 tasklet_hi_schedule(&engine->execlists.tasklet);
1650}
1651
1652static void queue_request(struct intel_engine_cs *engine,
1653 struct i915_sched_node *node,
1654 int prio)
1655{
1656 GEM_BUG_ON(!list_empty(&node->link));
1657 list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
1658}
1659
1660static void __submit_queue_imm(struct intel_engine_cs *engine)
1661{
1662 struct intel_engine_execlists * const execlists = &engine->execlists;
1663
1664 if (reset_in_progress(execlists))
1665 return; /* defer until we restart the engine following reset */
1666
1667 if (execlists->tasklet.func == execlists_submission_tasklet)
1668 __execlists_submission_tasklet(engine);
1669 else
1670 tasklet_hi_schedule(&execlists->tasklet);
1671}
1672
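/*
 * Only kick the submission backend if this request raises the queue priority
 * hint; anything at or below the hint is already known to the tasklet.
 */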
1673static void submit_queue(struct intel_engine_cs *engine,
1674 const struct i915_request *rq)
1675{
1676 struct intel_engine_execlists *execlists = &engine->execlists;
1677
1678 if (rq_prio(rq) <= execlists->queue_priority_hint)
1679 return;
1680
1681 execlists->queue_priority_hint = rq_prio(rq);
1682 __submit_queue_imm(engine);
1683}
1684
1685static void execlists_submit_request(struct i915_request *request)
1686{
1687 struct intel_engine_cs *engine = request->engine;
1688 unsigned long flags;
1689
1690 /* Will be called from irq-context when using foreign fences. */
1691 spin_lock_irqsave(&engine->active.lock, flags);
1692
1693 queue_request(engine, &request->sched, rq_prio(request));
1694
1695 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1696 GEM_BUG_ON(list_empty(&request->sched.link));
1697
1698 submit_queue(engine, request);
1699
1700 spin_unlock_irqrestore(&engine->active.lock, flags);
1701}
1702
1703static void __execlists_context_fini(struct intel_context *ce)
1704{
1705 intel_ring_put(ce->ring);
1706 i915_vma_put(ce->state);
1707}
1708
1709static void execlists_context_destroy(struct kref *kref)
1710{
1711 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1712
1713 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1714 GEM_BUG_ON(intel_context_is_pinned(ce));
1715
1716 if (ce->state)
1717 __execlists_context_fini(ce);
1718
1719 intel_context_fini(ce);
1720 intel_context_free(ce);
1721}
1722
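/*
 * With CONFIG_DRM_I915_DEBUG_GEM we poison a guard page placed after the
 * context image so that any overrun by the HW can be detected on unpin.
 */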
1723static void
1724set_redzone(void *vaddr, const struct intel_engine_cs *engine)
1725{
1726 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1727 return;
1728
1729 vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
1730 vaddr += engine->context_size;
1731
1732 memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
1733}
1734
1735static void
1736check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
1737{
1738 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1739 return;
1740
1741 vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
1742 vaddr += engine->context_size;
1743
1744 if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
1745 dev_err_once(engine->i915->drm.dev,
1746 "%s context redzone overwritten!\n",
1747 engine->name);
1748}
1749
1750static void execlists_context_unpin(struct intel_context *ce)
1751{
1752 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
1753 ce->engine);
1754
1755 i915_gem_context_unpin_hw_id(ce->gem_context);
1756 i915_gem_object_unpin_map(ce->state->obj);
1757 intel_ring_reset(ce->ring, ce->ring->tail);
1758}
1759
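/*
 * Refresh the RING_START/HEAD/TAIL (and RPCS for the render class) values
 * stored in the context image so that the next context restore picks up our
 * current bookkeeping.
 */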
1760static void
1761__execlists_update_reg_state(struct intel_context *ce,
1762 struct intel_engine_cs *engine)
1763{
1764 struct intel_ring *ring = ce->ring;
1765 u32 *regs = ce->lrc_reg_state;
1766
1767 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
1768 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1769
1770 regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma);
1771 regs[CTX_RING_HEAD + 1] = ring->head;
1772 regs[CTX_RING_TAIL + 1] = ring->tail;
1773
1774 /* RPCS */
1775 if (engine->class == RENDER_CLASS) {
1776 regs[CTX_R_PWR_CLK_STATE + 1] =
1777 intel_sseu_make_rpcs(engine->i915, &ce->sseu);
1778
1779 i915_oa_init_reg_state(engine, ce, regs);
1780 }
1781}
1782
1783static int
1784__execlists_context_pin(struct intel_context *ce,
1785 struct intel_engine_cs *engine)
1786{
1787 void *vaddr;
1788 int ret;
1789
1790 GEM_BUG_ON(!ce->state);
1791
1792 ret = intel_context_active_acquire(ce);
1793 if (ret)
1794 goto err;
1795 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1796
1797 vaddr = i915_gem_object_pin_map(ce->state->obj,
1798 i915_coherent_map_type(engine->i915) |
1799 I915_MAP_OVERRIDE);
1800 if (IS_ERR(vaddr)) {
1801 ret = PTR_ERR(vaddr);
1802 goto unpin_active;
1803 }
1804
1805 ret = i915_gem_context_pin_hw_id(ce->gem_context);
1806 if (ret)
1807 goto unpin_map;
1808
1809 ce->lrc_desc = lrc_descriptor(ce, engine);
1810 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
1811 __execlists_update_reg_state(ce, engine);
1812
1813 return 0;
1814
1815unpin_map:
1816 i915_gem_object_unpin_map(ce->state->obj);
1817unpin_active:
1818 intel_context_active_release(ce);
1819err:
1820 return ret;
1821}
1822
1823static int execlists_context_pin(struct intel_context *ce)
1824{
1825 return __execlists_context_pin(ce, ce->engine);
1826}
1827
1828static int execlists_context_alloc(struct intel_context *ce)
1829{
1830 return __execlists_context_alloc(ce, ce->engine);
1831}
1832
1833static void execlists_context_reset(struct intel_context *ce)
1834{
1835 /*
1836 * Because we emit WA_TAIL_DWORDS there may be a disparity
1837 * between our bookkeeping in ce->ring->head and ce->ring->tail and
1838 * that stored in context. As we only write new commands from
1839 * ce->ring->tail onwards, everything before that is junk. If the GPU
	 * starts reading its RING_HEAD from the context, it may try to
	 * execute that junk and die.
	 *
	 * The contexts that are still pinned on resume belong to the
	 * kernel, and are local to each engine. All other contexts will
	 * have their head/tail sanitized upon pinning before use, so they
	 * will never see garbage.
1847 *
1848 * So to avoid that we reset the context images upon resume. For
1849 * simplicity, we just zero everything out.
1850 */
1851 intel_ring_reset(ce->ring, 0);
1852 __execlists_update_reg_state(ce, ce->engine);
1853}
1854
1855static const struct intel_context_ops execlists_context_ops = {
1856 .alloc = execlists_context_alloc,
1857
1858 .pin = execlists_context_pin,
1859 .unpin = execlists_context_unpin,
1860
1861 .enter = intel_context_enter_engine,
1862 .exit = intel_context_exit_engine,
1863
1864 .reset = execlists_context_reset,
1865 .destroy = execlists_context_destroy,
1866};
1867
1868static int gen8_emit_init_breadcrumb(struct i915_request *rq)
1869{
1870 u32 *cs;
1871
1872 GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb);
1873
1874 cs = intel_ring_begin(rq, 6);
1875 if (IS_ERR(cs))
1876 return PTR_ERR(cs);
1877
1878 /*
1879 * Check if we have been preempted before we even get started.
1880 *
1881 * After this point i915_request_started() reports true, even if
1882 * we get preempted and so are no longer running.
1883 */
1884 *cs++ = MI_ARB_CHECK;
1885 *cs++ = MI_NOOP;
1886
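	/* Mark the request as started by writing seqno-1 into its HWSP slot */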
1887 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1888 *cs++ = rq->timeline->hwsp_offset;
1889 *cs++ = 0;
1890 *cs++ = rq->fence.seqno - 1;
1891
1892 intel_ring_advance(rq, cs);
1893
1894 /* Record the updated position of the request's payload */
1895 rq->infix = intel_ring_offset(rq, cs);
1896
1897 return 0;
1898}
1899
1900static int emit_pdps(struct i915_request *rq)
1901{
1902 const struct intel_engine_cs * const engine = rq->engine;
1903 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->hw_context->vm);
1904 int err, i;
1905 u32 *cs;
1906
1907 GEM_BUG_ON(intel_vgpu_active(rq->i915));
1908
1909 /*
1910 * Beware ye of the dragons, this sequence is magic!
1911 *
1912 * Small changes to this sequence can cause anything from
1913 * GPU hangs to forcewake errors and machine lockups!
1914 */
1915
1916 /* Flush any residual operations from the context load */
1917 err = engine->emit_flush(rq, EMIT_FLUSH);
1918 if (err)
1919 return err;
1920
1921 /* Magic required to prevent forcewake errors! */
1922 err = engine->emit_flush(rq, EMIT_INVALIDATE);
1923 if (err)
1924 return err;
1925
1926 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
1927 if (IS_ERR(cs))
1928 return PTR_ERR(cs);
1929
1930 /* Ensure the LRI have landed before we invalidate & continue */
1931 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
1932 for (i = GEN8_3LVL_PDPES; i--; ) {
1933 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
1934 u32 base = engine->mmio_base;
1935
1936 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
1937 *cs++ = upper_32_bits(pd_daddr);
1938 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
1939 *cs++ = lower_32_bits(pd_daddr);
1940 }
1941 *cs++ = MI_NOOP;
1942
1943 intel_ring_advance(rq, cs);
1944
1945 /* Be doubly sure the LRI have landed before proceeding */
1946 err = engine->emit_flush(rq, EMIT_FLUSH);
1947 if (err)
1948 return err;
1949
1950 /* Re-invalidate the TLB for luck */
1951 return engine->emit_flush(rq, EMIT_INVALIDATE);
1952}
1953
1954static int execlists_request_alloc(struct i915_request *request)
1955{
1956 int ret;
1957
1958 GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1959
1960 /*
1961 * Flush enough space to reduce the likelihood of waiting after
1962 * we start building the request - in which case we will just
1963 * have to repeat work.
1964 */
1965 request->reserved_space += EXECLISTS_REQUEST_SIZE;
1966
1967 /*
1968 * Note that after this point, we have committed to using
1969 * this request as it is being used to both track the
1970 * state of engine initialisation and liveness of the
1971 * golden renderstate above. Think twice before you try
1972 * to cancel/unwind this request now.
1973 */
1974
1975 /* Unconditionally invalidate GPU caches and TLBs. */
1976 if (i915_vm_is_4lvl(request->hw_context->vm))
1977 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1978 else
1979 ret = emit_pdps(request);
1980 if (ret)
1981 return ret;
1982
1983 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
1984 return 0;
1985}
1986
1987/*
1988 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1989 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1990 * but there is a slight complication as this is applied in WA batch where the
1991 * values are only initialized once so we cannot take register value at the
1992 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then restore it with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We could of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest, but that makes the WA complicated.
1999 *
2000 * This WA is also required for Gen9 so extracting as a function avoids
2001 * code duplication.
2002 */
2003static u32 *
2004gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2005{
2006 /* NB no one else is allowed to scribble over scratch + 256! */
2007 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2008 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2009 *batch++ = intel_gt_scratch_offset(engine->gt,
2010 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2011 *batch++ = 0;
2012
2013 *batch++ = MI_LOAD_REGISTER_IMM(1);
2014 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2015 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2016
2017 batch = gen8_emit_pipe_control(batch,
2018 PIPE_CONTROL_CS_STALL |
2019 PIPE_CONTROL_DC_FLUSH_ENABLE,
2020 0);
2021
2022 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2023 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2024 *batch++ = intel_gt_scratch_offset(engine->gt,
2025 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2026 *batch++ = 0;
2027
2028 return batch;
2029}
2030
2031static u32 slm_offset(struct intel_engine_cs *engine)
2032{
2033 return intel_gt_scratch_offset(engine->gt,
2034 INTEL_GT_SCRATCH_FIELD_CLEAR_SLM_WA);
2035}
2036
2037/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on some criteria. At the moment this batch always starts at the beginning of
 * the page and we don't have multiple wa_ctx batch buffers.
 *
 * The number of WA applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the per_ctx batch and both of them
 * together make a complete batch buffer.
2051 */
2052static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2053{
2054 /* WaDisableCtxRestoreArbitration:bdw,chv */
2055 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2056
2057 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2058 if (IS_BROADWELL(engine->i915))
2059 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2060
2061 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2062 /* Actual scratch location is at 128 bytes offset */
2063 batch = gen8_emit_pipe_control(batch,
2064 PIPE_CONTROL_FLUSH_L3 |
2065 PIPE_CONTROL_GLOBAL_GTT_IVB |
2066 PIPE_CONTROL_CS_STALL |
2067 PIPE_CONTROL_QW_WRITE,
2068 slm_offset(engine));
2069
2070 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2071
2072 /* Pad to end of cacheline */
2073 while ((unsigned long)batch % CACHELINE_BYTES)
2074 *batch++ = MI_NOOP;
2075
2076 /*
2077 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2078 * execution depends on the length specified in terms of cache lines
2079 * in the register CTX_RCS_INDIRECT_CTX
2080 */
2081
2082 return batch;
2083}
2084
2085struct lri {
2086 i915_reg_t reg;
2087 u32 value;
2088};
2089
2090static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2091{
2092 GEM_BUG_ON(!count || count > 63);
2093
2094 *batch++ = MI_LOAD_REGISTER_IMM(count);
2095 do {
2096 *batch++ = i915_mmio_reg_offset(lri->reg);
2097 *batch++ = lri->value;
2098 } while (lri++, --count);
2099 *batch++ = MI_NOOP;
2100
2101 return batch;
2102}
2103
2104static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2105{
2106 static const struct lri lri[] = {
2107 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2108 {
2109 COMMON_SLICE_CHICKEN2,
2110 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2111 0),
2112 },
2113
2114 /* BSpec: 11391 */
2115 {
2116 FF_SLICE_CHICKEN,
2117 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2118 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2119 },
2120
2121 /* BSpec: 11299 */
2122 {
2123 _3D_CHICKEN3,
2124 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2125 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2126 }
2127 };
2128
2129 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2130
2131 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2132 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2133
2134 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2135
2136 /* WaMediaPoolStateCmdInWABB:bxt,glk */
2137 if (HAS_POOLED_EU(engine->i915)) {
2138 /*
2139 * EU pool configuration is setup along with golden context
2140 * during context initialization. This value depends on
2141 * device type (2x6 or 3x6) and needs to be updated based
2142 * on which subslice is disabled especially for 2x6
2143 * devices, however it is safe to load default
2144 * configuration of 3x6 device instead of masking off
2145 * corresponding bits because HW ignores bits of a disabled
2146 * subslice and drops down to appropriate config. Please
2147 * see render_state_setup() in i915_gem_render_state.c for
2148 * possible configurations, to avoid duplication they are
2149 * not shown here again.
2150 */
2151 *batch++ = GEN9_MEDIA_POOL_STATE;
2152 *batch++ = GEN9_MEDIA_POOL_ENABLE;
2153 *batch++ = 0x00777000;
2154 *batch++ = 0;
2155 *batch++ = 0;
2156 *batch++ = 0;
2157 }
2158
2159 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2160
2161 /* Pad to end of cacheline */
2162 while ((unsigned long)batch % CACHELINE_BYTES)
2163 *batch++ = MI_NOOP;
2164
2165 return batch;
2166}
2167
2168static u32 *
2169gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2170{
2171 int i;
2172
2173 /*
2174 * WaPipeControlBefore3DStateSamplePattern: cnl
2175 *
2176 * Ensure the engine is idle prior to programming a
2177 * 3DSTATE_SAMPLE_PATTERN during a context restore.
2178 */
2179 batch = gen8_emit_pipe_control(batch,
2180 PIPE_CONTROL_CS_STALL,
2181 0);
2182 /*
2183 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2184 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2185 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2186 * confusing. Since gen8_emit_pipe_control() already advances the
2187 * batch by 6 dwords, we advance the other 10 here, completing a
2188 * cacheline. It's not clear if the workaround requires this padding
2189 * before other commands, or if it's just the regular padding we would
2190 * already have for the workaround bb, so leave it here for now.
2191 */
2192 for (i = 0; i < 10; i++)
2193 *batch++ = MI_NOOP;
2194
2195 /* Pad to end of cacheline */
2196 while ((unsigned long)batch % CACHELINE_BYTES)
2197 *batch++ = MI_NOOP;
2198
2199 return batch;
2200}
2201
2202#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2203
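/*
 * Allocate a single shmem page and pin it high in the GGTT to hold the
 * per-engine indirect and per-context workaround batch buffers.
 */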
2204static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2205{
2206 struct drm_i915_gem_object *obj;
2207 struct i915_vma *vma;
2208 int err;
2209
2210 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2211 if (IS_ERR(obj))
2212 return PTR_ERR(obj);
2213
2214 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2215 if (IS_ERR(vma)) {
2216 err = PTR_ERR(vma);
2217 goto err;
2218 }
2219
2220 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2221 if (err)
2222 goto err;
2223
2224 engine->wa_ctx.vma = vma;
2225 return 0;
2226
2227err:
2228 i915_gem_object_put(obj);
2229 return err;
2230}
2231
2232static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2233{
2234 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2235}
2236
2237typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2238
2239static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2240{
2241 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2242 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2243 &wa_ctx->per_ctx };
2244 wa_bb_func_t wa_bb_fn[2];
2245 struct page *page;
2246 void *batch, *batch_ptr;
2247 unsigned int i;
2248 int ret;
2249
2250 if (engine->class != RENDER_CLASS)
2251 return 0;
2252
2253 switch (INTEL_GEN(engine->i915)) {
2254 case 12:
2255 case 11:
2256 return 0;
2257 case 10:
2258 wa_bb_fn[0] = gen10_init_indirectctx_bb;
2259 wa_bb_fn[1] = NULL;
2260 break;
2261 case 9:
2262 wa_bb_fn[0] = gen9_init_indirectctx_bb;
2263 wa_bb_fn[1] = NULL;
2264 break;
2265 case 8:
2266 wa_bb_fn[0] = gen8_init_indirectctx_bb;
2267 wa_bb_fn[1] = NULL;
2268 break;
2269 default:
2270 MISSING_CASE(INTEL_GEN(engine->i915));
2271 return 0;
2272 }
2273
2274 ret = lrc_setup_wa_ctx(engine);
2275 if (ret) {
2276 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2277 return ret;
2278 }
2279
2280 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2281 batch = batch_ptr = kmap_atomic(page);
2282
2283 /*
2284 * Emit the two workaround batch buffers, recording the offset from the
2285 * start of the workaround batch buffer object for each and their
2286 * respective sizes.
2287 */
2288 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2289 wa_bb[i]->offset = batch_ptr - batch;
2290 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2291 CACHELINE_BYTES))) {
2292 ret = -EINVAL;
2293 break;
2294 }
2295 if (wa_bb_fn[i])
2296 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2297 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2298 }
2299
2300 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2301
2302 kunmap_atomic(batch);
2303 if (ret)
2304 lrc_destroy_wa_ctx(engine);
2305
2306 return ret;
2307}
2308
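/*
 * Program RING_MODE to select execlist submission and point RING_HWS_PGA at
 * our status page; the engine must already hold forcewake at this point.
 */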
2309static void enable_execlists(struct intel_engine_cs *engine)
2310{
2311 u32 mode;
2312
2313 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2314
2315 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2316
2317 if (INTEL_GEN(engine->i915) >= 11)
2318 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2319 else
2320 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2321 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2322
2323 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2324
2325 ENGINE_WRITE_FW(engine,
2326 RING_HWS_PGA,
2327 i915_ggtt_offset(engine->status_page.vma));
2328 ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2329}
2330
2331static bool unexpected_starting_state(struct intel_engine_cs *engine)
2332{
2333 bool unexpected = false;
2334
2335 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2336 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2337 unexpected = true;
2338 }
2339
2340 return unexpected;
2341}
2342
2343static int execlists_resume(struct intel_engine_cs *engine)
2344{
2345 intel_engine_apply_workarounds(engine);
2346 intel_engine_apply_whitelist(engine);
2347
2348 intel_mocs_init_engine(engine);
2349
2350 intel_engine_reset_breadcrumbs(engine);
2351
2352 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2353 struct drm_printer p = drm_debug_printer(__func__);
2354
2355 intel_engine_dump(engine, &p, NULL);
2356 }
2357
2358 enable_execlists(engine);
2359
2360 return 0;
2361}
2362
2363static void execlists_reset_prepare(struct intel_engine_cs *engine)
2364{
2365 struct intel_engine_execlists * const execlists = &engine->execlists;
2366 unsigned long flags;
2367
2368 GEM_TRACE("%s: depth<-%d\n", engine->name,
2369 atomic_read(&execlists->tasklet.count));
2370
2371 /*
2372 * Prevent request submission to the hardware until we have
2373 * completed the reset in i915_gem_reset_finish(). If a request
2374 * is completed by one engine, it may then queue a request
2375 * to a second via its execlists->tasklet *just* as we are
2376 * calling engine->resume() and also writing the ELSP.
2377 * Turning off the execlists->tasklet until the reset is over
2378 * prevents the race.
2379 */
2380 __tasklet_disable_sync_once(&execlists->tasklet);
2381 GEM_BUG_ON(!reset_in_progress(execlists));
2382
2383 /* And flush any current direct submission. */
2384 spin_lock_irqsave(&engine->active.lock, flags);
2385 spin_unlock_irqrestore(&engine->active.lock, flags);
2386
2387 /*
	 * We stop engines, otherwise we might get failed reset and a
	 * dead gpu (on elk). Also a modern gpu as new as kbl can suffer
	 * from a system hang if a batchbuffer is progressing when
	 * the reset is issued, regardless of the READY_TO_RESET ack.
	 * Thus assume it is best to stop engines on all gens
	 * where we have a gpu reset.
2394 *
2395 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2396 *
2397 * FIXME: Wa for more modern gens needs to be validated
2398 */
2399 intel_engine_stop_cs(engine);
2400}
2401
2402static void reset_csb_pointers(struct intel_engine_cs *engine)
2403{
2404 struct intel_engine_execlists * const execlists = &engine->execlists;
2405 const unsigned int reset_value = execlists->csb_size - 1;
2406
2407 ring_set_paused(engine, 0);
2408
2409 /*
2410 * After a reset, the HW starts writing into CSB entry [0]. We
2411 * therefore have to set our HEAD pointer back one entry so that
2412 * the *first* entry we check is entry 0. To complicate this further,
2413 * as we don't wait for the first interrupt after reset, we have to
2414 * fake the HW write to point back to the last entry so that our
2415 * inline comparison of our cached head position against the last HW
2416 * write works even before the first interrupt.
2417 */
2418 execlists->csb_head = reset_value;
2419 WRITE_ONCE(*execlists->csb_write, reset_value);
2420 wmb(); /* Make sure this is visible to HW (paranoia?) */
2421
2422 invalidate_csb_entries(&execlists->csb_status[0],
2423 &execlists->csb_status[reset_value]);
2424}
2425
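/*
 * Walk back along the timeline from the given request to find the oldest
 * incomplete request belonging to the same context, i.e. the point at which
 * the hardware actually stopped executing.
 */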
2426static struct i915_request *active_request(struct i915_request *rq)
2427{
2428 const struct intel_context * const ce = rq->hw_context;
2429 struct i915_request *active = NULL;
2430 struct list_head *list;
2431
2432 if (!i915_request_is_active(rq)) /* unwound, but incomplete! */
2433 return rq;
2434
2435 list = &rq->timeline->requests;
2436 list_for_each_entry_from_reverse(rq, list, link) {
2437 if (i915_request_completed(rq))
2438 break;
2439
2440 if (rq->hw_context != ce)
2441 break;
2442
2443 active = rq;
2444 }
2445
2446 return active;
2447}
2448
2449static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2450{
2451 struct intel_engine_execlists * const execlists = &engine->execlists;
2452 struct intel_context *ce;
2453 struct i915_request *rq;
2454 u32 *regs;
2455
2456 process_csb(engine); /* drain preemption events */
2457
2458 /* Following the reset, we need to reload the CSB read/write pointers */
2459 reset_csb_pointers(engine);
2460
2461 /*
2462 * Save the currently executing context, even if we completed
2463 * its request, it was still running at the time of the
2464 * reset and will have been clobbered.
2465 */
2466 rq = execlists_active(execlists);
2467 if (!rq)
2468 goto unwind;
2469
2470 ce = rq->hw_context;
2471 GEM_BUG_ON(i915_active_is_idle(&ce->active));
2472 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2473 rq = active_request(rq);
2474 if (!rq) {
2475 ce->ring->head = ce->ring->tail;
2476 goto out_replay;
2477 }
2478
2479 ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
2480
2481 /*
2482 * If this request hasn't started yet, e.g. it is waiting on a
2483 * semaphore, we need to avoid skipping the request or else we
2484 * break the signaling chain. However, if the context is corrupt
2485 * the request will not restart and we will be stuck with a wedged
2486 * device. It is quite often the case that if we issue a reset
2487 * while the GPU is loading the context image, that the context
2488 * image becomes corrupt.
2489 *
2490 * Otherwise, if we have not started yet, the request should replay
2491 * perfectly and we do not need to flag the result as being erroneous.
2492 */
2493 if (!i915_request_started(rq))
2494 goto out_replay;
2495
2496 /*
2497 * If the request was innocent, we leave the request in the ELSP
2498 * and will try to replay it on restarting. The context image may
2499 * have been corrupted by the reset, in which case we may have
2500 * to service a new GPU hang, but more likely we can continue on
2501 * without impact.
2502 *
2503 * If the request was guilty, we presume the context is corrupt
2504 * and have to at least restore the RING register in the context
2505 * image back to the expected values to skip over the guilty request.
2506 */
2507 __i915_request_reset(rq, stalled);
2508 if (!stalled)
2509 goto out_replay;
2510
2511 /*
2512 * We want a simple context + ring to execute the breadcrumb update.
2513 * We cannot rely on the context being intact across the GPU hang,
2514 * so clear it and rebuild just what we need for the breadcrumb.
2515 * All pending requests for this context will be zapped, and any
2516 * future request will be after userspace has had the opportunity
2517 * to recreate its own state.
2518 */
2519 regs = ce->lrc_reg_state;
2520 if (engine->pinned_default_state) {
2521 memcpy(regs, /* skip restoring the vanilla PPHWSP */
2522 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
2523 engine->context_size - PAGE_SIZE);
2524 }
2525 execlists_init_reg_state(regs, ce, engine, ce->ring);
2526
2527out_replay:
	GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
2529 engine->name, ce->ring->head, ce->ring->tail);
2530 intel_ring_update_space(ce->ring);
2531 __execlists_update_reg_state(ce, engine);
2532
2533unwind:
2534 /* Push back any incomplete requests for replay after the reset. */
2535 cancel_port_requests(execlists);
2536 __unwind_incomplete_requests(engine);
2537}
2538
2539static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
2540{
2541 unsigned long flags;
2542
2543 GEM_TRACE("%s\n", engine->name);
2544
2545 spin_lock_irqsave(&engine->active.lock, flags);
2546
2547 __execlists_reset(engine, stalled);
2548
2549 spin_unlock_irqrestore(&engine->active.lock, flags);
2550}
2551
2552static void nop_submission_tasklet(unsigned long data)
2553{
2554 /* The driver is wedged; don't process any more events. */
2555}
2556
2557static void execlists_cancel_requests(struct intel_engine_cs *engine)
2558{
2559 struct intel_engine_execlists * const execlists = &engine->execlists;
2560 struct i915_request *rq, *rn;
2561 struct rb_node *rb;
2562 unsigned long flags;
2563
2564 GEM_TRACE("%s\n", engine->name);
2565
2566 /*
2567 * Before we call engine->cancel_requests(), we should have exclusive
2568 * access to the submission state. This is arranged for us by the
2569 * caller disabling the interrupt generation, the tasklet and other
2570 * threads that may then access the same state, giving us a free hand
2571 * to reset state. However, we still need to let lockdep be aware that
2572 * we know this state may be accessed in hardirq context, so we
2573 * disable the irq around this manipulation and we want to keep
2574 * the spinlock focused on its duties and not accidentally conflate
2575 * coverage to the submission's irq state. (Similarly, although we
2576 * shouldn't need to disable irq around the manipulation of the
2577 * submission's irq state, we also wish to remind ourselves that
2578 * it is irq state.)
2579 */
2580 spin_lock_irqsave(&engine->active.lock, flags);
2581
2582 __execlists_reset(engine, true);
2583
2584 /* Mark all executing requests as skipped. */
2585 list_for_each_entry(rq, &engine->active.requests, sched.link)
2586 mark_eio(rq);
2587
2588 /* Flush the queued requests to the timeline list (for retiring). */
2589 while ((rb = rb_first_cached(&execlists->queue))) {
2590 struct i915_priolist *p = to_priolist(rb);
2591 int i;
2592
2593 priolist_for_each_request_consume(rq, rn, p, i) {
2594 mark_eio(rq);
2595 __i915_request_submit(rq);
2596 }
2597
2598 rb_erase_cached(&p->node, &execlists->queue);
2599 i915_priolist_free(p);
2600 }
2601
2602 /* Cancel all attached virtual engines */
2603 while ((rb = rb_first_cached(&execlists->virtual))) {
2604 struct virtual_engine *ve =
2605 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2606
2607 rb_erase_cached(rb, &execlists->virtual);
2608 RB_CLEAR_NODE(rb);
2609
2610 spin_lock(&ve->base.active.lock);
2611 rq = fetch_and_zero(&ve->request);
2612 if (rq) {
2613 mark_eio(rq);
2614
2615 rq->engine = engine;
2616 __i915_request_submit(rq);
2617 i915_request_put(rq);
2618
2619 ve->base.execlists.queue_priority_hint = INT_MIN;
2620 }
2621 spin_unlock(&ve->base.active.lock);
2622 }
2623
2624 /* Remaining _unready_ requests will be nop'ed when submitted */
2625
2626 execlists->queue_priority_hint = INT_MIN;
2627 execlists->queue = RB_ROOT_CACHED;
2628
2629 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
2630 execlists->tasklet.func = nop_submission_tasklet;
2631
2632 spin_unlock_irqrestore(&engine->active.lock, flags);
2633}
2634
2635static void execlists_reset_finish(struct intel_engine_cs *engine)
2636{
2637 struct intel_engine_execlists * const execlists = &engine->execlists;
2638
2639 /*
2640 * After a GPU reset, we may have requests to replay. Do so now while
2641 * we still have the forcewake to be sure that the GPU is not allowed
2642 * to sleep before we restart and reload a context.
2643 */
2644 GEM_BUG_ON(!reset_in_progress(execlists));
2645 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
2646 execlists->tasklet.func(execlists->tasklet.data);
2647
2648 if (__tasklet_enable(&execlists->tasklet))
2649 /* And kick in case we missed a new request submission. */
2650 tasklet_hi_schedule(&execlists->tasklet);
2651 GEM_TRACE("%s: depth->%d\n", engine->name,
2652 atomic_read(&execlists->tasklet.count));
2653}
2654
2655static int gen8_emit_bb_start(struct i915_request *rq,
2656 u64 offset, u32 len,
2657 const unsigned int flags)
2658{
2659 u32 *cs;
2660
2661 cs = intel_ring_begin(rq, 4);
2662 if (IS_ERR(cs))
2663 return PTR_ERR(cs);
2664
2665 /*
2666 * WaDisableCtxRestoreArbitration:bdw,chv
2667 *
2668 * We don't need to perform MI_ARB_ENABLE as often as we do (in
2669 * particular all the gen that do not need the w/a at all!), if we
2670 * took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) arbitration was enabled,
2672 * we would be fine. However, for gen8 there is another w/a that
2673 * requires us to not preempt inside GPGPU execution, so we keep
2674 * arbitration disabled for gen8 batches. Arbitration will be
2675 * re-enabled before we close the request
2676 * (engine->emit_fini_breadcrumb).
2677 */
2678 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2679
2680 /* FIXME(BDW+): Address space and security selectors. */
2681 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2682 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2683 *cs++ = lower_32_bits(offset);
2684 *cs++ = upper_32_bits(offset);
2685
2686 intel_ring_advance(rq, cs);
2687
2688 return 0;
2689}
2690
2691static int gen9_emit_bb_start(struct i915_request *rq,
2692 u64 offset, u32 len,
2693 const unsigned int flags)
2694{
2695 u32 *cs;
2696
2697 cs = intel_ring_begin(rq, 6);
2698 if (IS_ERR(cs))
2699 return PTR_ERR(cs);
2700
2701 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2702
2703 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2704 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2705 *cs++ = lower_32_bits(offset);
2706 *cs++ = upper_32_bits(offset);
2707
2708 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2709 *cs++ = MI_NOOP;
2710
2711 intel_ring_advance(rq, cs);
2712
2713 return 0;
2714}
2715
2716static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
2717{
2718 ENGINE_WRITE(engine, RING_IMR,
2719 ~(engine->irq_enable_mask | engine->irq_keep_mask));
2720 ENGINE_POSTING_READ(engine, RING_IMR);
2721}
2722
2723static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
2724{
2725 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
2726}
2727
2728static int gen8_emit_flush(struct i915_request *request, u32 mode)
2729{
2730 u32 cmd, *cs;
2731
2732 cs = intel_ring_begin(request, 4);
2733 if (IS_ERR(cs))
2734 return PTR_ERR(cs);
2735
2736 cmd = MI_FLUSH_DW + 1;
2737
2738 /* We always require a command barrier so that subsequent
2739 * commands, such as breadcrumb interrupts, are strictly ordered
2740 * wrt the contents of the write cache being flushed to memory
2741 * (and thus being coherent from the CPU).
2742 */
2743 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2744
2745 if (mode & EMIT_INVALIDATE) {
2746 cmd |= MI_INVALIDATE_TLB;
2747 if (request->engine->class == VIDEO_DECODE_CLASS)
2748 cmd |= MI_INVALIDATE_BSD;
2749 }
2750
2751 *cs++ = cmd;
2752 *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2753 *cs++ = 0; /* upper addr */
2754 *cs++ = 0; /* value */
2755 intel_ring_advance(request, cs);
2756
2757 return 0;
2758}
2759
2760static int gen8_emit_flush_render(struct i915_request *request,
2761 u32 mode)
2762{
2763 struct intel_engine_cs *engine = request->engine;
2764 u32 scratch_addr =
2765 intel_gt_scratch_offset(engine->gt,
2766 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
2767 bool vf_flush_wa = false, dc_flush_wa = false;
2768 u32 *cs, flags = 0;
2769 int len;
2770
2771 flags |= PIPE_CONTROL_CS_STALL;
2772
2773 if (mode & EMIT_FLUSH) {
2774 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2775 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2776 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2777 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2778 }
2779
2780 if (mode & EMIT_INVALIDATE) {
2781 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2782 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2783 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2784 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2785 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2786 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2787 flags |= PIPE_CONTROL_QW_WRITE;
2788 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2789
2790 /*
2791 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
2792 * pipe control.
2793 */
2794 if (IS_GEN(request->i915, 9))
2795 vf_flush_wa = true;
2796
2797 /* WaForGAMHang:kbl */
2798 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
2799 dc_flush_wa = true;
2800 }
2801
2802 len = 6;
2803
2804 if (vf_flush_wa)
2805 len += 6;
2806
2807 if (dc_flush_wa)
2808 len += 12;
2809
2810 cs = intel_ring_begin(request, len);
2811 if (IS_ERR(cs))
2812 return PTR_ERR(cs);
2813
2814 if (vf_flush_wa)
2815 cs = gen8_emit_pipe_control(cs, 0, 0);
2816
2817 if (dc_flush_wa)
2818 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
2819 0);
2820
2821 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2822
2823 if (dc_flush_wa)
2824 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
2825
2826 intel_ring_advance(request, cs);
2827
2828 return 0;
2829}
2830
2831static int gen11_emit_flush_render(struct i915_request *request,
2832 u32 mode)
2833{
2834 struct intel_engine_cs *engine = request->engine;
2835 const u32 scratch_addr =
2836 intel_gt_scratch_offset(engine->gt,
2837 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
2838
2839 if (mode & EMIT_FLUSH) {
2840 u32 *cs;
2841 u32 flags = 0;
2842
2843 flags |= PIPE_CONTROL_CS_STALL;
2844
2845 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
2846 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2847 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2848 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2849 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2850 flags |= PIPE_CONTROL_QW_WRITE;
2851 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2852
2853 cs = intel_ring_begin(request, 6);
2854 if (IS_ERR(cs))
2855 return PTR_ERR(cs);
2856
2857 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2858 intel_ring_advance(request, cs);
2859 }
2860
2861 if (mode & EMIT_INVALIDATE) {
2862 u32 *cs;
2863 u32 flags = 0;
2864
2865 flags |= PIPE_CONTROL_CS_STALL;
2866
2867 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
2868 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2869 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2870 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2871 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2872 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2873 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2874 flags |= PIPE_CONTROL_QW_WRITE;
2875 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2876
2877 cs = intel_ring_begin(request, 6);
2878 if (IS_ERR(cs))
2879 return PTR_ERR(cs);
2880
2881 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2882 intel_ring_advance(request, cs);
2883 }
2884
2885 return 0;
2886}
2887
2888/*
2889 * Reserve space for 2 NOOPs at the end of each request to be
2890 * used as a workaround for not being allowed to do lite
2891 * restore with HEAD==TAIL (WaIdleLiteRestore).
2892 */
2893static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
2894{
2895 /* Ensure there's always at least one preemption point per-request. */
2896 *cs++ = MI_ARB_CHECK;
2897 *cs++ = MI_NOOP;
2898 request->wa_tail = intel_ring_offset(request, cs);
2899
2900 return cs;
2901}
2902
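/*
 * Poll the preemption semaphore in the HWSP: while ring_set_paused() holds a
 * non-zero value there, the CS spins at this point instead of proceeding,
 * giving preempt-to-busy a safe spot to swap in a new ELSP submission.
 */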
2903static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
2904{
2905 *cs++ = MI_SEMAPHORE_WAIT |
2906 MI_SEMAPHORE_GLOBAL_GTT |
2907 MI_SEMAPHORE_POLL |
2908 MI_SEMAPHORE_SAD_EQ_SDD;
2909 *cs++ = 0;
2910 *cs++ = intel_hws_preempt_address(request->engine);
2911 *cs++ = 0;
2912
2913 return cs;
2914}
2915
2916static __always_inline u32*
2917gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
2918 u32 *cs)
2919{
2920 *cs++ = MI_USER_INTERRUPT;
2921
2922 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2923 if (intel_engine_has_semaphores(request->engine))
2924 cs = emit_preempt_busywait(request, cs);
2925
2926 request->tail = intel_ring_offset(request, cs);
2927 assert_ring_tail_valid(request->ring, request->tail);
2928
2929 return gen8_emit_wa_tail(request, cs);
2930}
2931
2932static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
2933{
2934 cs = gen8_emit_ggtt_write(cs,
2935 request->fence.seqno,
2936 request->timeline->hwsp_offset,
2937 0);
2938
2939 return gen8_emit_fini_breadcrumb_footer(request, cs);
2940}
2941
2942static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
2943{
2944 cs = gen8_emit_ggtt_write_rcs(cs,
2945 request->fence.seqno,
2946 request->timeline->hwsp_offset,
2947 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2948 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2949 PIPE_CONTROL_DC_FLUSH_ENABLE);
2950
2951 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
2952 cs = gen8_emit_pipe_control(cs,
2953 PIPE_CONTROL_FLUSH_ENABLE |
2954 PIPE_CONTROL_CS_STALL,
2955 0);
2956
2957 return gen8_emit_fini_breadcrumb_footer(request, cs);
2958}
2959
2960static u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *request,
2961 u32 *cs)
2962{
2963 cs = gen8_emit_ggtt_write_rcs(cs,
2964 request->fence.seqno,
2965 request->timeline->hwsp_offset,
2966 PIPE_CONTROL_CS_STALL |
2967 PIPE_CONTROL_TILE_CACHE_FLUSH |
2968 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2969 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2970 PIPE_CONTROL_DC_FLUSH_ENABLE |
2971 PIPE_CONTROL_FLUSH_ENABLE);
2972
2973 return gen8_emit_fini_breadcrumb_footer(request, cs);
2974}
2975
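/* The engine is idling; stop the timeslicing timer until it is unparked. */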
2976static void execlists_park(struct intel_engine_cs *engine)
2977{
2978 del_timer(&engine->execlists.timer);
2979}
2980
2981void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
2982{
2983 engine->submit_request = execlists_submit_request;
2984 engine->cancel_requests = execlists_cancel_requests;
2985 engine->schedule = i915_schedule;
2986 engine->execlists.tasklet.func = execlists_submission_tasklet;
2987
2988 engine->reset.prepare = execlists_reset_prepare;
2989 engine->reset.reset = execlists_reset;
2990 engine->reset.finish = execlists_reset_finish;
2991
2992 engine->park = execlists_park;
2993 engine->unpark = NULL;
2994
2995 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
2996 if (!intel_vgpu_active(engine->i915)) {
2997 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
2998 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
2999 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3000 }
3001}
3002
3003static void execlists_destroy(struct intel_engine_cs *engine)
3004{
3005 intel_engine_cleanup_common(engine);
3006 lrc_destroy_wa_ctx(engine);
3007 kfree(engine);
3008}
3009
3010static void
3011logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3012{
3013 /* Default vfuncs which can be overriden by each engine. */
3014
3015 engine->destroy = execlists_destroy;
3016 engine->resume = execlists_resume;
3017
3018 engine->reset.prepare = execlists_reset_prepare;
3019 engine->reset.reset = execlists_reset;
3020 engine->reset.finish = execlists_reset_finish;
3021
3022 engine->cops = &execlists_context_ops;
3023 engine->request_alloc = execlists_request_alloc;
3024
3025 engine->emit_flush = gen8_emit_flush;
3026 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3027 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3028
3029 engine->set_default_submission = intel_execlists_set_default_submission;
3030
3031 if (INTEL_GEN(engine->i915) < 11) {
3032 engine->irq_enable = gen8_logical_ring_enable_irq;
3033 engine->irq_disable = gen8_logical_ring_disable_irq;
3034 } else {
3035 /*
		 * TODO: On Gen11 interrupt masks need to be clear
		 * to allow C6 entry. Keep interrupts enabled and
		 * take the hit of generating extra interrupts
		 * until a more refined solution exists.
3040 */
3041 }
3042 if (IS_GEN(engine->i915, 8))
3043 engine->emit_bb_start = gen8_emit_bb_start;
3044 else
3045 engine->emit_bb_start = gen9_emit_bb_start;
3046}
3047
3048static inline void
3049logical_ring_default_irqs(struct intel_engine_cs *engine)
3050{
3051 unsigned int shift = 0;
3052
3053 if (INTEL_GEN(engine->i915) < 11) {
3054 const u8 irq_shifts[] = {
3055 [RCS0] = GEN8_RCS_IRQ_SHIFT,
3056 [BCS0] = GEN8_BCS_IRQ_SHIFT,
3057 [VCS0] = GEN8_VCS0_IRQ_SHIFT,
3058 [VCS1] = GEN8_VCS1_IRQ_SHIFT,
3059 [VECS0] = GEN8_VECS_IRQ_SHIFT,
3060 };
3061
3062 shift = irq_shifts[engine->id];
3063 }
3064
3065 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3066 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3067}
3068
3069static void rcs_submission_override(struct intel_engine_cs *engine)
3070{
3071 switch (INTEL_GEN(engine->i915)) {
3072 case 12:
3073 case 11:
3074 engine->emit_flush = gen11_emit_flush_render;
3075 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3076 break;
3077 default:
3078 engine->emit_flush = gen8_emit_flush_render;
3079 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3080 break;
3081 }
3082}
3083
3084int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3085{
3086 tasklet_init(&engine->execlists.tasklet,
3087 execlists_submission_tasklet, (unsigned long)engine);
3088 timer_setup(&engine->execlists.timer, execlists_submission_timer, 0);
3089
3090 logical_ring_default_vfuncs(engine);
3091 logical_ring_default_irqs(engine);
3092
3093 if (engine->class == RENDER_CLASS)
3094 rcs_submission_override(engine);
3095
3096 return 0;
3097}
3098
3099int intel_execlists_submission_init(struct intel_engine_cs *engine)
3100{
3101 struct intel_engine_execlists * const execlists = &engine->execlists;
3102 struct drm_i915_private *i915 = engine->i915;
3103 struct intel_uncore *uncore = engine->uncore;
3104 u32 base = engine->mmio_base;
3105 int ret;
3106
3107 ret = intel_engine_init_common(engine);
3108 if (ret)
3109 return ret;
3110
3111 if (intel_init_workaround_bb(engine))
3112 /*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches and nothing
		 * critical enough to prevent us from using the GPU.
3116 */
3117 DRM_ERROR("WA batch buffer initialization failed\n");
3118
3119 if (HAS_LOGICAL_RING_ELSQ(i915)) {
3120 execlists->submit_reg = uncore->regs +
3121 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3122 execlists->ctrl_reg = uncore->regs +
3123 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3124 } else {
3125 execlists->submit_reg = uncore->regs +
3126 i915_mmio_reg_offset(RING_ELSP(base));
3127 }
3128
3129 execlists->csb_status =
3130 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3131
3132 execlists->csb_write =
3133 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
3134
3135 if (INTEL_GEN(i915) < 11)
3136 execlists->csb_size = GEN8_CSB_ENTRIES;
3137 else
3138 execlists->csb_size = GEN11_CSB_ENTRIES;
3139
3140 reset_csb_pointers(engine);
3141
3142 return 0;
3143}
3144
3145static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
3146{
3147 u32 indirect_ctx_offset;
3148
3149 switch (INTEL_GEN(engine->i915)) {
3150 default:
3151 MISSING_CASE(INTEL_GEN(engine->i915));
3152 /* fall through */
3153 case 12:
3154 indirect_ctx_offset =
3155 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3156 break;
3157 case 11:
3158 indirect_ctx_offset =
3159 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3160 break;
3161 case 10:
3162 indirect_ctx_offset =
3163 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3164 break;
3165 case 9:
3166 indirect_ctx_offset =
3167 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3168 break;
3169 case 8:
3170 indirect_ctx_offset =
3171 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3172 break;
3173 }
3174
3175 return indirect_ctx_offset;
3176}
3177
3178static void execlists_init_reg_state(u32 *regs,
3179 struct intel_context *ce,
3180 struct intel_engine_cs *engine,
3181 struct intel_ring *ring)
3182{
3183 struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(ce->vm);
3184 bool rcs = engine->class == RENDER_CLASS;
3185 u32 base = engine->mmio_base;
3186
3187 /*
3188 * A context is actually a big batch buffer with several
3189 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3190 * values we are setting here are only for the first context restore:
3191 * on a subsequent save, the GPU will recreate this batchbuffer with new
3192 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3193 * we are not initializing here).
3194 *
3195 * Must keep consistent with virtual_update_register_offsets().
3196 */
3197 regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
3198 MI_LRI_FORCE_POSTED;
3199
3200 CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
3201 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3202 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
3203 if (INTEL_GEN(engine->i915) < 11) {
3204 regs[CTX_CONTEXT_CONTROL + 1] |=
3205 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3206 CTX_CTRL_RS_CTX_ENABLE);
3207 }
3208 CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
3209 CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
3210 CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
3211 CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
3212 RING_CTL_SIZE(ring->size) | RING_VALID);
3213 CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
3214 CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
3215 CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
3216 CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
3217 CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
3218 CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
3219 if (rcs) {
3220 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3221
3222 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
3223 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
3224 RING_INDIRECT_CTX_OFFSET(base), 0);
3225 if (wa_ctx->indirect_ctx.size) {
3226 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3227
3228 regs[CTX_RCS_INDIRECT_CTX + 1] =
3229 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
3230 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3231
3232 regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
3233 intel_lr_indirect_ctx_offset(engine) << 6;
3234 }
3235
3236 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
3237 if (wa_ctx->per_ctx.size) {
3238 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3239
3240 regs[CTX_BB_PER_CTX_PTR + 1] =
3241 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3242 }
3243 }
3244
3245 regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
3246
3247 CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
	/* PDP values will be assigned later if needed */
3249 CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(base, 3), 0);
3250 CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(base, 3), 0);
3251 CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(base, 2), 0);
3252 CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(base, 2), 0);
3253 CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(base, 1), 0);
3254 CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(base, 1), 0);
3255 CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(base, 0), 0);
3256 CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(base, 0), 0);
3257
3258 if (i915_vm_is_4lvl(&ppgtt->vm)) {
3259 /* 64b PPGTT (48bit canonical)
3260 * PDP0_DESCRIPTOR contains the base address to PML4 and
3261 * other PDP Descriptors are ignored.
3262 */
3263 ASSIGN_CTX_PML4(ppgtt, regs);
3264 } else {
3265 ASSIGN_CTX_PDP(ppgtt, regs, 3);
3266 ASSIGN_CTX_PDP(ppgtt, regs, 2);
3267 ASSIGN_CTX_PDP(ppgtt, regs, 1);
3268 ASSIGN_CTX_PDP(ppgtt, regs, 0);
3269 }
3270
3271 if (rcs) {
3272 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
3273 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
3274 }
3275
3276 regs[CTX_END] = MI_BATCH_BUFFER_END;
3277 if (INTEL_GEN(engine->i915) >= 10)
3278 regs[CTX_END] |= BIT(0);
3279}
3280
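/*
 * Map the freshly allocated context object, seed it from the engine's default
 * (golden) context image when one exists, and then write the per-context
 * register state required for the first restore.
 */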
3281static int
3282populate_lr_context(struct intel_context *ce,
3283 struct drm_i915_gem_object *ctx_obj,
3284 struct intel_engine_cs *engine,
3285 struct intel_ring *ring)
3286{
3287 void *vaddr;
3288 u32 *regs;
3289 int ret;
3290
3291 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
3292 if (IS_ERR(vaddr)) {
3293 ret = PTR_ERR(vaddr);
3294 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
3295 return ret;
3296 }
3297
3298 set_redzone(vaddr, engine);
3299
3300 if (engine->default_state) {
3301 /*
3302 * We only want to copy over the template context state;
3303 * skipping over the headers reserved for GuC communication,
3304 * leaving those as zero.
3305 */
3306 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
3307 void *defaults;
3308
3309 defaults = i915_gem_object_pin_map(engine->default_state,
3310 I915_MAP_WB);
3311 if (IS_ERR(defaults)) {
3312 ret = PTR_ERR(defaults);
3313 goto err_unpin_ctx;
3314 }
3315
3316 memcpy(vaddr + start, defaults + start, engine->context_size);
3317 i915_gem_object_unpin_map(engine->default_state);
3318 }
3319
	/*
	 * The second page of the context object contains some fields which
	 * must be set up prior to the first execution.
	 */
3322 regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
3323 execlists_init_reg_state(regs, ce, engine, ring);
3324 if (!engine->default_state)
3325 regs[CTX_CONTEXT_CONTROL + 1] |=
3326 _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
3327
3328 ret = 0;
3329err_unpin_ctx:
3330 __i915_gem_object_flush_map(ctx_obj,
3331 LRC_HEADER_PAGES * PAGE_SIZE,
3332 engine->context_size);
3333 i915_gem_object_unpin_map(ctx_obj);
3334 return ret;
3335}
3336
3337static int __execlists_context_alloc(struct intel_context *ce,
3338 struct intel_engine_cs *engine)
3339{
3340 struct drm_i915_gem_object *ctx_obj;
3341 struct intel_ring *ring;
3342 struct i915_vma *vma;
3343 u32 context_size;
3344 int ret;
3345
3346 GEM_BUG_ON(ce->state);
3347 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
3348
3349 /*
3350 * Before the actual start of the context image, we insert a few pages
3351 * for our own use and for sharing with the GuC.
3352 */
3353 context_size += LRC_HEADER_PAGES * PAGE_SIZE;
3354 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3355 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
3356
3357 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
3358 if (IS_ERR(ctx_obj))
3359 return PTR_ERR(ctx_obj);
3360
3361 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
3362 if (IS_ERR(vma)) {
3363 ret = PTR_ERR(vma);
3364 goto error_deref_obj;
3365 }
3366
3367 if (!ce->timeline) {
3368 struct intel_timeline *tl;
3369
3370 tl = intel_timeline_create(engine->gt, NULL);
3371 if (IS_ERR(tl)) {
3372 ret = PTR_ERR(tl);
3373 goto error_deref_obj;
3374 }
3375
3376 ce->timeline = tl;
3377 }
3378
3379 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
3380 if (IS_ERR(ring)) {
3381 ret = PTR_ERR(ring);
3382 goto error_deref_obj;
3383 }
3384
3385 ret = populate_lr_context(ce, ctx_obj, engine, ring);
3386 if (ret) {
3387 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
3388 goto error_ring_free;
3389 }
3390
3391 ce->ring = ring;
3392 ce->state = vma;
3393
3394 return 0;
3395
3396error_ring_free:
3397 intel_ring_put(ring);
3398error_deref_obj:
3399 i915_gem_object_put(ctx_obj);
3400 return ret;
3401}
3402
3403static struct list_head *virtual_queue(struct virtual_engine *ve)
3404{
3405 return &ve->base.execlists.default_priolist.requests[0];
3406}
3407
3408static void virtual_context_destroy(struct kref *kref)
3409{
3410 struct virtual_engine *ve =
3411 container_of(kref, typeof(*ve), context.ref);
3412 unsigned int n;
3413
3414 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3415 GEM_BUG_ON(ve->request);
3416 GEM_BUG_ON(ve->context.inflight);
3417
3418 for (n = 0; n < ve->num_siblings; n++) {
3419 struct intel_engine_cs *sibling = ve->siblings[n];
3420 struct rb_node *node = &ve->nodes[sibling->id].rb;
3421
3422 if (RB_EMPTY_NODE(node))
3423 continue;
3424
3425 spin_lock_irq(&sibling->active.lock);
3426
3427 /* Detachment is lazily performed in the execlists tasklet */
3428 if (!RB_EMPTY_NODE(node))
3429 rb_erase_cached(node, &sibling->execlists.virtual);
3430
3431 spin_unlock_irq(&sibling->active.lock);
3432 }
3433 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
3434
3435 if (ve->context.state)
3436 __execlists_context_fini(&ve->context);
3437 intel_context_fini(&ve->context);
3438
3439 kfree(ve->bonds);
3440 kfree(ve);
3441}
3442
3443static void virtual_engine_initial_hint(struct virtual_engine *ve)
3444{
3445 int swp;
3446
3447 /*
3448 * Pick a random sibling on starting to help spread the load around.
3449 *
3450 * New contexts are typically created with exactly the same order
3451 * of siblings, and often started in batches. Due to the way we iterate
	 * the array of siblings when submitting requests, sibling[0] is
3453 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
3454 * randomised across the system, we also help spread the load by the
3455 * first engine we inspect being different each time.
3456 *
3457 * NB This does not force us to execute on this engine, it will just
3458 * typically be the first we inspect for submission.
3459 */
3460 swp = prandom_u32_max(ve->num_siblings);
3461 if (!swp)
3462 return;
3463
3464 swap(ve->siblings[swp], ve->siblings[0]);
3465 virtual_update_register_offsets(ve->context.lrc_reg_state,
3466 ve->siblings[0]);
3467}
3468
3469static int virtual_context_pin(struct intel_context *ce)
3470{
3471 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3472 int err;
3473
3474 /* Note: we must use a real engine class for setting up reg state */
3475 err = __execlists_context_pin(ce, ve->siblings[0]);
3476 if (err)
3477 return err;
3478
3479 virtual_engine_initial_hint(ve);
3480 return 0;
3481}
3482
3483static void virtual_context_enter(struct intel_context *ce)
3484{
3485 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3486 unsigned int n;
3487
3488 for (n = 0; n < ve->num_siblings; n++)
3489 intel_engine_pm_get(ve->siblings[n]);
3490
3491 intel_timeline_enter(ce->timeline);
3492}
3493
3494static void virtual_context_exit(struct intel_context *ce)
3495{
3496 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3497 unsigned int n;
3498
3499 intel_timeline_exit(ce->timeline);
3500
3501 for (n = 0; n < ve->num_siblings; n++)
3502 intel_engine_pm_put(ve->siblings[n]);
3503}
3504
3505static const struct intel_context_ops virtual_context_ops = {
3506 .pin = virtual_context_pin,
3507 .unpin = execlists_context_unpin,
3508
3509 .enter = virtual_context_enter,
3510 .exit = virtual_context_exit,
3511
3512 .destroy = virtual_context_destroy,
3513};
3514
3515static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
3516{
3517 struct i915_request *rq;
3518 intel_engine_mask_t mask;
3519
3520 rq = READ_ONCE(ve->request);
3521 if (!rq)
3522 return 0;
3523
3524 /* The rq is ready for submission; rq->execution_mask is now stable. */
3525 mask = rq->execution_mask;
3526 if (unlikely(!mask)) {
3527 /* Invalid selection, submit to a random engine in error */
3528 i915_request_skip(rq, -ENODEV);
3529 mask = ve->siblings[0]->mask;
3530 }
3531
3532 GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
3533 ve->base.name,
3534 rq->fence.context, rq->fence.seqno,
3535 mask, ve->base.execlists.queue_priority_hint);
3536
3537 return mask;
3538}
3539
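/*
 * Overview of the scheme implemented below: the virtual engine holds a single
 * pending request (ve->request) and, for each physical sibling allowed by the
 * request's execution_mask, links a ve_node into that sibling's
 * execlists.virtual rb-tree, ordered by priority. A sibling that becomes the
 * highest-priority candidate has its execlists tasklet kicked; whichever
 * sibling dequeues the request first executes it, and the stale nodes on the
 * other siblings are detached lazily.
 */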
3540static void virtual_submission_tasklet(unsigned long data)
3541{
3542 struct virtual_engine * const ve = (struct virtual_engine *)data;
3543 const int prio = ve->base.execlists.queue_priority_hint;
3544 intel_engine_mask_t mask;
3545 unsigned int n;
3546
3547 rcu_read_lock();
3548 mask = virtual_submission_mask(ve);
3549 rcu_read_unlock();
3550 if (unlikely(!mask))
3551 return;
3552
3553 local_irq_disable();
3554 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
3555 struct intel_engine_cs *sibling = ve->siblings[n];
3556 struct ve_node * const node = &ve->nodes[sibling->id];
3557 struct rb_node **parent, *rb;
3558 bool first;
3559
3560 if (unlikely(!(mask & sibling->mask))) {
3561 if (!RB_EMPTY_NODE(&node->rb)) {
3562 spin_lock(&sibling->active.lock);
3563 rb_erase_cached(&node->rb,
3564 &sibling->execlists.virtual);
3565 RB_CLEAR_NODE(&node->rb);
3566 spin_unlock(&sibling->active.lock);
3567 }
3568 continue;
3569 }
3570
3571 spin_lock(&sibling->active.lock);
3572
3573 if (!RB_EMPTY_NODE(&node->rb)) {
3574 /*
3575 * Cheat and avoid rebalancing the tree if we can
3576 * reuse this node in situ.
3577 */
3578 first = rb_first_cached(&sibling->execlists.virtual) ==
3579 &node->rb;
3580 if (prio == node->prio || (prio > node->prio && first))
3581 goto submit_engine;
3582
3583 rb_erase_cached(&node->rb, &sibling->execlists.virtual);
3584 }
3585
3586 rb = NULL;
3587 first = true;
3588 parent = &sibling->execlists.virtual.rb_root.rb_node;
3589 while (*parent) {
3590 struct ve_node *other;
3591
3592 rb = *parent;
3593 other = rb_entry(rb, typeof(*other), rb);
3594 if (prio > other->prio) {
3595 parent = &rb->rb_left;
3596 } else {
3597 parent = &rb->rb_right;
3598 first = false;
3599 }
3600 }
3601
3602 rb_link_node(&node->rb, rb, parent);
3603 rb_insert_color_cached(&node->rb,
3604 &sibling->execlists.virtual,
3605 first);
3606
3607submit_engine:
3608 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
3609 node->prio = prio;
3610 if (first && prio > sibling->execlists.queue_priority_hint) {
3611 sibling->execlists.queue_priority_hint = prio;
3612 tasklet_hi_schedule(&sibling->execlists.tasklet);
3613 }
3614
3615 spin_unlock(&sibling->active.lock);
3616 }
3617 local_irq_enable();
3618}
3619
3620static void virtual_submit_request(struct i915_request *rq)
3621{
3622 struct virtual_engine *ve = to_virtual_engine(rq->engine);
3623 struct i915_request *old;
3624 unsigned long flags;
3625
3626 GEM_TRACE("%s: rq=%llx:%lld\n",
3627 ve->base.name,
3628 rq->fence.context,
3629 rq->fence.seqno);
3630
3631 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
3632
3633 spin_lock_irqsave(&ve->base.active.lock, flags);
3634
3635 old = ve->request;
3636 if (old) { /* background completion event from preempt-to-busy */
3637 GEM_BUG_ON(!i915_request_completed(old));
3638 __i915_request_submit(old);
3639 i915_request_put(old);
3640 }
3641
3642 if (i915_request_completed(rq)) {
3643 __i915_request_submit(rq);
3644
3645 ve->base.execlists.queue_priority_hint = INT_MIN;
3646 ve->request = NULL;
3647 } else {
3648 ve->base.execlists.queue_priority_hint = rq_prio(rq);
3649 ve->request = i915_request_get(rq);
3650
3651 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3652 list_move_tail(&rq->sched.link, virtual_queue(ve));
3653
3654 tasklet_schedule(&ve->base.execlists.tasklet);
3655 }
3656
3657 spin_unlock_irqrestore(&ve->base.active.lock, flags);
3658}
3659
3660static struct ve_bond *
3661virtual_find_bond(struct virtual_engine *ve,
3662 const struct intel_engine_cs *master)
3663{
3664 int i;
3665
3666 for (i = 0; i < ve->num_bonds; i++) {
3667 if (ve->bonds[i].master == master)
3668 return &ve->bonds[i];
3669 }
3670
3671 return NULL;
3672}
3673
3674static void
3675virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
3676{
3677 struct virtual_engine *ve = to_virtual_engine(rq->engine);
3678 intel_engine_mask_t allowed, exec;
3679 struct ve_bond *bond;
3680
3681 allowed = ~to_request(signal)->engine->mask;
3682
3683 bond = virtual_find_bond(ve, to_request(signal)->engine);
3684 if (bond)
3685 allowed &= bond->sibling_mask;
3686
3687 /* Restrict the bonded request to run on only the available engines */
3688 exec = READ_ONCE(rq->execution_mask);
3689 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
3690 ;
3691
3692 /* Prevent the master from being re-run on the bonded engines */
3693 to_request(signal)->execution_mask &= ~allowed;
3694}
3695
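/*
 * A minimal usage sketch (hypothetical caller and engine pointers, shown for
 * illustration only), creating a virtual engine over two siblings and bonding
 * one of them to a master engine:
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(ctx, siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 *	err = intel_virtual_engine_attach_bond(ce->engine, rcs0, vcs0);
 *
 * Requests submitted on ce may then be executed on either sibling.
 */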
3696struct intel_context *
3697intel_execlists_create_virtual(struct i915_gem_context *ctx,
3698 struct intel_engine_cs **siblings,
3699 unsigned int count)
3700{
3701 struct virtual_engine *ve;
3702 unsigned int n;
3703 int err;
3704
3705 if (count == 0)
3706 return ERR_PTR(-EINVAL);
3707
3708 if (count == 1)
3709 return intel_context_create(ctx, siblings[0]);
3710
3711 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
3712 if (!ve)
3713 return ERR_PTR(-ENOMEM);
3714
3715 ve->base.i915 = ctx->i915;
3716 ve->base.gt = siblings[0]->gt;
3717 ve->base.id = -1;
3718 ve->base.class = OTHER_CLASS;
3719 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
3720 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
3721
3722 /*
3723 * The decision on whether to submit a request using semaphores
3724 * depends on the saturated state of the engine. We only compute
3725	 * this during HW submission of the request, and we need this
3726 * state to be globally applied to all requests being submitted
3727 * to this engine. Virtual engines encompass more than one physical
3728 * engine and so we cannot accurately tell in advance if one of those
3729 * engines is already saturated and so cannot afford to use a semaphore
3730 * and be pessimized in priority for doing so -- if we are the only
3731 * context using semaphores after all other clients have stopped, we
3732 * will be starved on the saturated system. Such a global switch for
3733 * semaphores is less than ideal, but alas is the current compromise.
3734 */
3735 ve->base.saturated = ALL_ENGINES;
3736
3737 snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
3738
3739 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
3740
3741 intel_engine_init_execlists(&ve->base);
3742
3743 ve->base.cops = &virtual_context_ops;
3744 ve->base.request_alloc = execlists_request_alloc;
3745
3746 ve->base.schedule = i915_schedule;
3747 ve->base.submit_request = virtual_submit_request;
3748 ve->base.bond_execute = virtual_bond_execute;
3749
3750 INIT_LIST_HEAD(virtual_queue(ve));
3751 ve->base.execlists.queue_priority_hint = INT_MIN;
3752 tasklet_init(&ve->base.execlists.tasklet,
3753 virtual_submission_tasklet,
3754 (unsigned long)ve);
3755
3756 intel_context_init(&ve->context, ctx, &ve->base);
3757
3758 for (n = 0; n < count; n++) {
3759 struct intel_engine_cs *sibling = siblings[n];
3760
3761 GEM_BUG_ON(!is_power_of_2(sibling->mask));
3762 if (sibling->mask & ve->base.mask) {
3763 DRM_DEBUG("duplicate %s entry in load balancer\n",
3764 sibling->name);
3765 err = -EINVAL;
3766 goto err_put;
3767 }
3768
3769 /*
3770 * The virtual engine implementation is tightly coupled to
3771		 * the execlists backend -- we push out requests directly
3772 * into a tree inside each physical engine. We could support
3773 * layering if we handle cloning of the requests and
3774 * submitting a copy into each backend.
3775 */
3776 if (sibling->execlists.tasklet.func !=
3777 execlists_submission_tasklet) {
3778 err = -ENODEV;
3779 goto err_put;
3780 }
3781
3782 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
3783 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
3784
3785 ve->siblings[ve->num_siblings++] = sibling;
3786 ve->base.mask |= sibling->mask;
3787
3788 /*
3789 * All physical engines must be compatible for their emission
3790 * functions (as we build the instructions during request
3791 * construction and do not alter them before submission
3792 * on the physical engine). We use the engine class as a guide
3793 * here, although that could be refined.
3794 */
3795 if (ve->base.class != OTHER_CLASS) {
3796 if (ve->base.class != sibling->class) {
3797 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
3798 sibling->class, ve->base.class);
3799 err = -EINVAL;
3800 goto err_put;
3801 }
3802 continue;
3803 }
3804
3805 ve->base.class = sibling->class;
3806 ve->base.uabi_class = sibling->uabi_class;
3807 snprintf(ve->base.name, sizeof(ve->base.name),
3808 "v%dx%d", ve->base.class, count);
3809 ve->base.context_size = sibling->context_size;
3810
3811 ve->base.emit_bb_start = sibling->emit_bb_start;
3812 ve->base.emit_flush = sibling->emit_flush;
3813 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
3814 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
3815 ve->base.emit_fini_breadcrumb_dw =
3816 sibling->emit_fini_breadcrumb_dw;
3817
3818 ve->base.flags = sibling->flags;
3819 }
3820
3821 ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
3822
3823 err = __execlists_context_alloc(&ve->context, siblings[0]);
3824 if (err)
3825 goto err_put;
3826
3827 __set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);
3828
3829 return &ve->context;
3830
3831err_put:
3832 intel_context_put(&ve->context);
3833 return ERR_PTR(err);
3834}
3835
3836struct intel_context *
3837intel_execlists_clone_virtual(struct i915_gem_context *ctx,
3838 struct intel_engine_cs *src)
3839{
3840 struct virtual_engine *se = to_virtual_engine(src);
3841 struct intel_context *dst;
3842
3843 dst = intel_execlists_create_virtual(ctx,
3844 se->siblings,
3845 se->num_siblings);
3846 if (IS_ERR(dst))
3847 return dst;
3848
3849 if (se->num_bonds) {
3850 struct virtual_engine *de = to_virtual_engine(dst->engine);
3851
3852 de->bonds = kmemdup(se->bonds,
3853 sizeof(*se->bonds) * se->num_bonds,
3854 GFP_KERNEL);
3855 if (!de->bonds) {
3856 intel_context_put(dst);
3857 return ERR_PTR(-ENOMEM);
3858 }
3859
3860 de->num_bonds = se->num_bonds;
3861 }
3862
3863 return dst;
3864}
3865
3866int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
3867 const struct intel_engine_cs *master,
3868 const struct intel_engine_cs *sibling)
3869{
3870 struct virtual_engine *ve = to_virtual_engine(engine);
3871 struct ve_bond *bond;
3872 int n;
3873
3874 /* Sanity check the sibling is part of the virtual engine */
3875 for (n = 0; n < ve->num_siblings; n++)
3876 if (sibling == ve->siblings[n])
3877 break;
3878 if (n == ve->num_siblings)
3879 return -EINVAL;
3880
3881 bond = virtual_find_bond(ve, master);
3882 if (bond) {
3883 bond->sibling_mask |= sibling->mask;
3884 return 0;
3885 }
3886
3887 bond = krealloc(ve->bonds,
3888 sizeof(*bond) * (ve->num_bonds + 1),
3889 GFP_KERNEL);
3890 if (!bond)
3891 return -ENOMEM;
3892
3893 bond[ve->num_bonds].master = master;
3894 bond[ve->num_bonds].sibling_mask = sibling->mask;
3895
3896 ve->bonds = bond;
3897 ve->num_bonds++;
3898
3899 return 0;
3900}
3901
3902void intel_execlists_show_requests(struct intel_engine_cs *engine,
3903 struct drm_printer *m,
3904 void (*show_request)(struct drm_printer *m,
3905 struct i915_request *rq,
3906 const char *prefix),
3907 unsigned int max)
3908{
3909 const struct intel_engine_execlists *execlists = &engine->execlists;
3910 struct i915_request *rq, *last;
3911 unsigned long flags;
3912 unsigned int count;
3913 struct rb_node *rb;
3914
3915 spin_lock_irqsave(&engine->active.lock, flags);
3916
3917 last = NULL;
3918 count = 0;
3919 list_for_each_entry(rq, &engine->active.requests, sched.link) {
3920 if (count++ < max - 1)
3921 show_request(m, rq, "\t\tE ");
3922 else
3923 last = rq;
3924 }
3925 if (last) {
3926 if (count > max) {
3927 drm_printf(m,
3928 "\t\t...skipping %d executing requests...\n",
3929 count - max);
3930 }
3931 show_request(m, last, "\t\tE ");
3932 }
3933
3934 last = NULL;
3935 count = 0;
3936 if (execlists->queue_priority_hint != INT_MIN)
3937 drm_printf(m, "\t\tQueue priority hint: %d\n",
3938 execlists->queue_priority_hint);
3939 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
3940 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
3941 int i;
3942
3943 priolist_for_each_request(rq, p, i) {
3944 if (count++ < max - 1)
3945 show_request(m, rq, "\t\tQ ");
3946 else
3947 last = rq;
3948 }
3949 }
3950 if (last) {
3951 if (count > max) {
3952 drm_printf(m,
3953 "\t\t...skipping %d queued requests...\n",
3954 count - max);
3955 }
3956 show_request(m, last, "\t\tQ ");
3957 }
3958
3959 last = NULL;
3960 count = 0;
3961 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
3962 struct virtual_engine *ve =
3963 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3964 struct i915_request *rq = READ_ONCE(ve->request);
3965
3966 if (rq) {
3967 if (count++ < max - 1)
3968 show_request(m, rq, "\t\tV ");
3969 else
3970 last = rq;
3971 }
3972 }
3973 if (last) {
3974 if (count > max) {
3975 drm_printf(m,
3976 "\t\t...skipping %d virtual requests...\n",
3977 count - max);
3978 }
3979 show_request(m, last, "\t\tV ");
3980 }
3981
3982 spin_unlock_irqrestore(&engine->active.lock, flags);
3983}
3984
3985void intel_lr_context_reset(struct intel_engine_cs *engine,
3986 struct intel_context *ce,
3987 u32 head,
3988 bool scrub)
3989{
3990 /*
3991 * We want a simple context + ring to execute the breadcrumb update.
3992 * We cannot rely on the context being intact across the GPU hang,
3993 * so clear it and rebuild just what we need for the breadcrumb.
3994 * All pending requests for this context will be zapped, and any
3995 * future request will be after userspace has had the opportunity
3996 * to recreate its own state.
3997 */
3998 if (scrub) {
3999 u32 *regs = ce->lrc_reg_state;
4000
4001 if (engine->pinned_default_state) {
4002 memcpy(regs, /* skip restoring the vanilla PPHWSP */
4003 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
4004 engine->context_size - PAGE_SIZE);
4005 }
4006 execlists_init_reg_state(regs, ce, engine, ce->ring);
4007 }
4008
4009 /* Rerun the request; its payload has been neutered (if guilty). */
4010 ce->ring->head = head;
4011 intel_ring_update_space(ce->ring);
4012
4013 __execlists_update_reg_state(ce, engine);
4014}
4015
4016#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4017#include "selftest_lrc.c"
4018#endif
1// SPDX-License-Identifier: MIT
2/*
3 * Copyright © 2014 Intel Corporation
4 */
5
6#include "gem/i915_gem_lmem.h"
7
8#include "gen8_engine_cs.h"
9#include "i915_drv.h"
10#include "i915_perf.h"
11#include "i915_reg.h"
12#include "intel_context.h"
13#include "intel_engine.h"
14#include "intel_engine_regs.h"
15#include "intel_gpu_commands.h"
16#include "intel_gt.h"
17#include "intel_gt_regs.h"
18#include "intel_lrc.h"
19#include "intel_lrc_reg.h"
20#include "intel_ring.h"
21#include "shmem_utils.h"
22
23/*
24 * The per-platform tables are u8-encoded in @data. Decode @data and set the
25 * addresses' offset and commands in @regs. The following encoding is used
26 * for each byte. There are 2 steps: decoding commands and decoding addresses.
27 *
28 * Commands:
29 * [7]: create NOPs - number of NOPs are set in lower bits
30 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
31 * MI_LRI_FORCE_POSTED
32 * [5:0]: Number of NOPs or registers to set values to in case of
33 * MI_LOAD_REGISTER_IMM
34 *
35 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
36 * number of registers. They are set by using the REG/REG16 macros: the former
37 * is used for offsets smaller than 0x200 while the latter is for values bigger
38 * than that. Those macros already set all the bits documented below correctly:
39 *
40 * [7]: When a register offset needs more than 6 bits, additional bytes
41 *	follow for the lower bits
42 * [6:0]: Register offset, without considering the engine base.
43 *
44 * This function only tweaks the commands and register offsets. Values are not
45 * filled out.
46 */
47static void set_offsets(u32 *regs,
48 const u8 *data,
49 const struct intel_engine_cs *engine,
50 bool close)
51#define NOP(x) (BIT(7) | (x))
52#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
53#define POSTED BIT(0)
54#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
55#define REG16(x) \
56 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
57 (((x) >> 2) & 0x7f)
58#define END 0
59{
60 const u32 base = engine->mmio_base;
61
62 while (*data) {
63 u8 count, flags;
64
65 if (*data & BIT(7)) { /* skip */
66 count = *data++ & ~BIT(7);
67 regs += count;
68 continue;
69 }
70
71 count = *data & 0x3f;
72 flags = *data >> 6;
73 data++;
74
75 *regs = MI_LOAD_REGISTER_IMM(count);
76 if (flags & POSTED)
77 *regs |= MI_LRI_FORCE_POSTED;
78 if (GRAPHICS_VER(engine->i915) >= 11)
79 *regs |= MI_LRI_LRM_CS_MMIO;
80 regs++;
81
82 GEM_BUG_ON(!count);
83 do {
84 u32 offset = 0;
85 u8 v;
86
87 do {
88 v = *data++;
89 offset <<= 7;
90 offset |= v & ~BIT(7);
91 } while (v & BIT(7));
92
93 regs[0] = base + (offset << 2);
94 regs += 2;
95 } while (--count);
96 }
97
98 if (close) {
99 /* Close the batch; used mainly by live_lrc_layout() */
100 *regs = MI_BATCH_BUFFER_END;
101 if (GRAPHICS_VER(engine->i915) >= 11)
102 *regs |= BIT(0);
103 }
104}
105
106static const u8 gen8_xcs_offsets[] = {
107 NOP(1),
108 LRI(11, 0),
109 REG16(0x244),
110 REG(0x034),
111 REG(0x030),
112 REG(0x038),
113 REG(0x03c),
114 REG(0x168),
115 REG(0x140),
116 REG(0x110),
117 REG(0x11c),
118 REG(0x114),
119 REG(0x118),
120
121 NOP(9),
122 LRI(9, 0),
123 REG16(0x3a8),
124 REG16(0x28c),
125 REG16(0x288),
126 REG16(0x284),
127 REG16(0x280),
128 REG16(0x27c),
129 REG16(0x278),
130 REG16(0x274),
131 REG16(0x270),
132
133 NOP(13),
134 LRI(2, 0),
135 REG16(0x200),
136 REG(0x028),
137
138 END
139};
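
/*
 * Worked example (illustrative only): feeding the start of gen8_xcs_offsets
 * above through set_offsets() on a pre-gen11 engine decodes as
 *
 *	0x81       NOP(1)        skip one dword
 *	0x0b       LRI(11, 0)    write MI_LOAD_REGISTER_IMM(11)
 *	0x81 0x11  REG16(0x244)  write mmio_base + 0x244 (RING_CONTEXT_CONTROL)
 *	0x0d       REG(0x034)    write mmio_base + 0x034 (RING_HEAD)
 *	...
 *
 * Each register then occupies two dwords in the image (offset + value); only
 * the offsets are written here, the values are filled in later, e.g. by
 * __lrc_init_regs() or by the HW context save.
 */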
140
141static const u8 gen9_xcs_offsets[] = {
142 NOP(1),
143 LRI(14, POSTED),
144 REG16(0x244),
145 REG(0x034),
146 REG(0x030),
147 REG(0x038),
148 REG(0x03c),
149 REG(0x168),
150 REG(0x140),
151 REG(0x110),
152 REG(0x11c),
153 REG(0x114),
154 REG(0x118),
155 REG(0x1c0),
156 REG(0x1c4),
157 REG(0x1c8),
158
159 NOP(3),
160 LRI(9, POSTED),
161 REG16(0x3a8),
162 REG16(0x28c),
163 REG16(0x288),
164 REG16(0x284),
165 REG16(0x280),
166 REG16(0x27c),
167 REG16(0x278),
168 REG16(0x274),
169 REG16(0x270),
170
171 NOP(13),
172 LRI(1, POSTED),
173 REG16(0x200),
174
175 NOP(13),
176 LRI(44, POSTED),
177 REG(0x028),
178 REG(0x09c),
179 REG(0x0c0),
180 REG(0x178),
181 REG(0x17c),
182 REG16(0x358),
183 REG(0x170),
184 REG(0x150),
185 REG(0x154),
186 REG(0x158),
187 REG16(0x41c),
188 REG16(0x600),
189 REG16(0x604),
190 REG16(0x608),
191 REG16(0x60c),
192 REG16(0x610),
193 REG16(0x614),
194 REG16(0x618),
195 REG16(0x61c),
196 REG16(0x620),
197 REG16(0x624),
198 REG16(0x628),
199 REG16(0x62c),
200 REG16(0x630),
201 REG16(0x634),
202 REG16(0x638),
203 REG16(0x63c),
204 REG16(0x640),
205 REG16(0x644),
206 REG16(0x648),
207 REG16(0x64c),
208 REG16(0x650),
209 REG16(0x654),
210 REG16(0x658),
211 REG16(0x65c),
212 REG16(0x660),
213 REG16(0x664),
214 REG16(0x668),
215 REG16(0x66c),
216 REG16(0x670),
217 REG16(0x674),
218 REG16(0x678),
219 REG16(0x67c),
220 REG(0x068),
221
222 END
223};
224
225static const u8 gen12_xcs_offsets[] = {
226 NOP(1),
227 LRI(13, POSTED),
228 REG16(0x244),
229 REG(0x034),
230 REG(0x030),
231 REG(0x038),
232 REG(0x03c),
233 REG(0x168),
234 REG(0x140),
235 REG(0x110),
236 REG(0x1c0),
237 REG(0x1c4),
238 REG(0x1c8),
239 REG(0x180),
240 REG16(0x2b4),
241
242 NOP(5),
243 LRI(9, POSTED),
244 REG16(0x3a8),
245 REG16(0x28c),
246 REG16(0x288),
247 REG16(0x284),
248 REG16(0x280),
249 REG16(0x27c),
250 REG16(0x278),
251 REG16(0x274),
252 REG16(0x270),
253
254 END
255};
256
257static const u8 dg2_xcs_offsets[] = {
258 NOP(1),
259 LRI(15, POSTED),
260 REG16(0x244),
261 REG(0x034),
262 REG(0x030),
263 REG(0x038),
264 REG(0x03c),
265 REG(0x168),
266 REG(0x140),
267 REG(0x110),
268 REG(0x1c0),
269 REG(0x1c4),
270 REG(0x1c8),
271 REG(0x180),
272 REG16(0x2b4),
273 REG(0x120),
274 REG(0x124),
275
276 NOP(1),
277 LRI(9, POSTED),
278 REG16(0x3a8),
279 REG16(0x28c),
280 REG16(0x288),
281 REG16(0x284),
282 REG16(0x280),
283 REG16(0x27c),
284 REG16(0x278),
285 REG16(0x274),
286 REG16(0x270),
287
288 END
289};
290
291static const u8 gen8_rcs_offsets[] = {
292 NOP(1),
293 LRI(14, POSTED),
294 REG16(0x244),
295 REG(0x034),
296 REG(0x030),
297 REG(0x038),
298 REG(0x03c),
299 REG(0x168),
300 REG(0x140),
301 REG(0x110),
302 REG(0x11c),
303 REG(0x114),
304 REG(0x118),
305 REG(0x1c0),
306 REG(0x1c4),
307 REG(0x1c8),
308
309 NOP(3),
310 LRI(9, POSTED),
311 REG16(0x3a8),
312 REG16(0x28c),
313 REG16(0x288),
314 REG16(0x284),
315 REG16(0x280),
316 REG16(0x27c),
317 REG16(0x278),
318 REG16(0x274),
319 REG16(0x270),
320
321 NOP(13),
322 LRI(1, 0),
323 REG(0x0c8),
324
325 END
326};
327
328static const u8 gen9_rcs_offsets[] = {
329 NOP(1),
330 LRI(14, POSTED),
331 REG16(0x244),
332 REG(0x34),
333 REG(0x30),
334 REG(0x38),
335 REG(0x3c),
336 REG(0x168),
337 REG(0x140),
338 REG(0x110),
339 REG(0x11c),
340 REG(0x114),
341 REG(0x118),
342 REG(0x1c0),
343 REG(0x1c4),
344 REG(0x1c8),
345
346 NOP(3),
347 LRI(9, POSTED),
348 REG16(0x3a8),
349 REG16(0x28c),
350 REG16(0x288),
351 REG16(0x284),
352 REG16(0x280),
353 REG16(0x27c),
354 REG16(0x278),
355 REG16(0x274),
356 REG16(0x270),
357
358 NOP(13),
359 LRI(1, 0),
360 REG(0xc8),
361
362 NOP(13),
363 LRI(44, POSTED),
364 REG(0x28),
365 REG(0x9c),
366 REG(0xc0),
367 REG(0x178),
368 REG(0x17c),
369 REG16(0x358),
370 REG(0x170),
371 REG(0x150),
372 REG(0x154),
373 REG(0x158),
374 REG16(0x41c),
375 REG16(0x600),
376 REG16(0x604),
377 REG16(0x608),
378 REG16(0x60c),
379 REG16(0x610),
380 REG16(0x614),
381 REG16(0x618),
382 REG16(0x61c),
383 REG16(0x620),
384 REG16(0x624),
385 REG16(0x628),
386 REG16(0x62c),
387 REG16(0x630),
388 REG16(0x634),
389 REG16(0x638),
390 REG16(0x63c),
391 REG16(0x640),
392 REG16(0x644),
393 REG16(0x648),
394 REG16(0x64c),
395 REG16(0x650),
396 REG16(0x654),
397 REG16(0x658),
398 REG16(0x65c),
399 REG16(0x660),
400 REG16(0x664),
401 REG16(0x668),
402 REG16(0x66c),
403 REG16(0x670),
404 REG16(0x674),
405 REG16(0x678),
406 REG16(0x67c),
407 REG(0x68),
408
409 END
410};
411
412static const u8 gen11_rcs_offsets[] = {
413 NOP(1),
414 LRI(15, POSTED),
415 REG16(0x244),
416 REG(0x034),
417 REG(0x030),
418 REG(0x038),
419 REG(0x03c),
420 REG(0x168),
421 REG(0x140),
422 REG(0x110),
423 REG(0x11c),
424 REG(0x114),
425 REG(0x118),
426 REG(0x1c0),
427 REG(0x1c4),
428 REG(0x1c8),
429 REG(0x180),
430
431 NOP(1),
432 LRI(9, POSTED),
433 REG16(0x3a8),
434 REG16(0x28c),
435 REG16(0x288),
436 REG16(0x284),
437 REG16(0x280),
438 REG16(0x27c),
439 REG16(0x278),
440 REG16(0x274),
441 REG16(0x270),
442
443 LRI(1, POSTED),
444 REG(0x1b0),
445
446 NOP(10),
447 LRI(1, 0),
448 REG(0x0c8),
449
450 END
451};
452
453static const u8 gen12_rcs_offsets[] = {
454 NOP(1),
455 LRI(13, POSTED),
456 REG16(0x244),
457 REG(0x034),
458 REG(0x030),
459 REG(0x038),
460 REG(0x03c),
461 REG(0x168),
462 REG(0x140),
463 REG(0x110),
464 REG(0x1c0),
465 REG(0x1c4),
466 REG(0x1c8),
467 REG(0x180),
468 REG16(0x2b4),
469
470 NOP(5),
471 LRI(9, POSTED),
472 REG16(0x3a8),
473 REG16(0x28c),
474 REG16(0x288),
475 REG16(0x284),
476 REG16(0x280),
477 REG16(0x27c),
478 REG16(0x278),
479 REG16(0x274),
480 REG16(0x270),
481
482 LRI(3, POSTED),
483 REG(0x1b0),
484 REG16(0x5a8),
485 REG16(0x5ac),
486
487 NOP(6),
488 LRI(1, 0),
489 REG(0x0c8),
490 NOP(3 + 9 + 1),
491
492 LRI(51, POSTED),
493 REG16(0x588),
494 REG16(0x588),
495 REG16(0x588),
496 REG16(0x588),
497 REG16(0x588),
498 REG16(0x588),
499 REG(0x028),
500 REG(0x09c),
501 REG(0x0c0),
502 REG(0x178),
503 REG(0x17c),
504 REG16(0x358),
505 REG(0x170),
506 REG(0x150),
507 REG(0x154),
508 REG(0x158),
509 REG16(0x41c),
510 REG16(0x600),
511 REG16(0x604),
512 REG16(0x608),
513 REG16(0x60c),
514 REG16(0x610),
515 REG16(0x614),
516 REG16(0x618),
517 REG16(0x61c),
518 REG16(0x620),
519 REG16(0x624),
520 REG16(0x628),
521 REG16(0x62c),
522 REG16(0x630),
523 REG16(0x634),
524 REG16(0x638),
525 REG16(0x63c),
526 REG16(0x640),
527 REG16(0x644),
528 REG16(0x648),
529 REG16(0x64c),
530 REG16(0x650),
531 REG16(0x654),
532 REG16(0x658),
533 REG16(0x65c),
534 REG16(0x660),
535 REG16(0x664),
536 REG16(0x668),
537 REG16(0x66c),
538 REG16(0x670),
539 REG16(0x674),
540 REG16(0x678),
541 REG16(0x67c),
542 REG(0x068),
543 REG(0x084),
544 NOP(1),
545
546 END
547};
548
549static const u8 xehp_rcs_offsets[] = {
550 NOP(1),
551 LRI(13, POSTED),
552 REG16(0x244),
553 REG(0x034),
554 REG(0x030),
555 REG(0x038),
556 REG(0x03c),
557 REG(0x168),
558 REG(0x140),
559 REG(0x110),
560 REG(0x1c0),
561 REG(0x1c4),
562 REG(0x1c8),
563 REG(0x180),
564 REG16(0x2b4),
565
566 NOP(5),
567 LRI(9, POSTED),
568 REG16(0x3a8),
569 REG16(0x28c),
570 REG16(0x288),
571 REG16(0x284),
572 REG16(0x280),
573 REG16(0x27c),
574 REG16(0x278),
575 REG16(0x274),
576 REG16(0x270),
577
578 LRI(3, POSTED),
579 REG(0x1b0),
580 REG16(0x5a8),
581 REG16(0x5ac),
582
583 NOP(6),
584 LRI(1, 0),
585 REG(0x0c8),
586
587 END
588};
589
590static const u8 dg2_rcs_offsets[] = {
591 NOP(1),
592 LRI(15, POSTED),
593 REG16(0x244),
594 REG(0x034),
595 REG(0x030),
596 REG(0x038),
597 REG(0x03c),
598 REG(0x168),
599 REG(0x140),
600 REG(0x110),
601 REG(0x1c0),
602 REG(0x1c4),
603 REG(0x1c8),
604 REG(0x180),
605 REG16(0x2b4),
606 REG(0x120),
607 REG(0x124),
608
609 NOP(1),
610 LRI(9, POSTED),
611 REG16(0x3a8),
612 REG16(0x28c),
613 REG16(0x288),
614 REG16(0x284),
615 REG16(0x280),
616 REG16(0x27c),
617 REG16(0x278),
618 REG16(0x274),
619 REG16(0x270),
620
621 LRI(3, POSTED),
622 REG(0x1b0),
623 REG16(0x5a8),
624 REG16(0x5ac),
625
626 NOP(6),
627 LRI(1, 0),
628 REG(0x0c8),
629
630 END
631};
632
633static const u8 mtl_rcs_offsets[] = {
634 NOP(1),
635 LRI(15, POSTED),
636 REG16(0x244),
637 REG(0x034),
638 REG(0x030),
639 REG(0x038),
640 REG(0x03c),
641 REG(0x168),
642 REG(0x140),
643 REG(0x110),
644 REG(0x1c0),
645 REG(0x1c4),
646 REG(0x1c8),
647 REG(0x180),
648 REG16(0x2b4),
649 REG(0x120),
650 REG(0x124),
651
652 NOP(1),
653 LRI(9, POSTED),
654 REG16(0x3a8),
655 REG16(0x28c),
656 REG16(0x288),
657 REG16(0x284),
658 REG16(0x280),
659 REG16(0x27c),
660 REG16(0x278),
661 REG16(0x274),
662 REG16(0x270),
663
664 NOP(2),
665 LRI(2, POSTED),
666 REG16(0x5a8),
667 REG16(0x5ac),
668
669 NOP(6),
670 LRI(1, 0),
671 REG(0x0c8),
672
673 END
674};
675
676#undef END
677#undef REG16
678#undef REG
679#undef LRI
680#undef NOP
681
682static const u8 *reg_offsets(const struct intel_engine_cs *engine)
683{
684 /*
685 * The gen12+ lists only have the registers we program in the basic
686 * default state. We rely on the context image using relative
687	 * addressing to automatically fix up the register state between the
688	 * physical engines of a virtual engine.
689 */
690 GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
691 !intel_engine_has_relative_mmio(engine));
692
693 if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
694 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
695 return mtl_rcs_offsets;
696 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
697 return dg2_rcs_offsets;
698 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
699 return xehp_rcs_offsets;
700 else if (GRAPHICS_VER(engine->i915) >= 12)
701 return gen12_rcs_offsets;
702 else if (GRAPHICS_VER(engine->i915) >= 11)
703 return gen11_rcs_offsets;
704 else if (GRAPHICS_VER(engine->i915) >= 9)
705 return gen9_rcs_offsets;
706 else
707 return gen8_rcs_offsets;
708 } else {
709 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
710 return dg2_xcs_offsets;
711 else if (GRAPHICS_VER(engine->i915) >= 12)
712 return gen12_xcs_offsets;
713 else if (GRAPHICS_VER(engine->i915) >= 9)
714 return gen9_xcs_offsets;
715 else
716 return gen8_xcs_offsets;
717 }
718}
719
720static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
721{
722 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
723 return 0x70;
724 else if (GRAPHICS_VER(engine->i915) >= 12)
725 return 0x60;
726 else if (GRAPHICS_VER(engine->i915) >= 9)
727 return 0x54;
728 else if (engine->class == RENDER_CLASS)
729 return 0x58;
730 else
731 return -1;
732}
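
/*
 * lrc_ring_mi_mode() above and the lrc_ring_*() helpers that follow return
 * the dword index, within the LRC register state, of the LRI offset slot for
 * a given register; the register's value lives at index + 1, and -1 means the
 * register is not part of this engine's context layout. Typical use,
 * mirroring __reset_stop_ring() further down:
 *
 *	x = lrc_ring_mi_mode(engine);
 *	if (x != -1) {
 *		regs[x + 1] &= ~STOP_RING;
 *		regs[x + 1] |= STOP_RING << 16;
 *	}
 */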
733
734static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
735{
736 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
737 return 0x80;
738 else if (GRAPHICS_VER(engine->i915) >= 12)
739 return 0x70;
740 else if (GRAPHICS_VER(engine->i915) >= 9)
741 return 0x64;
742 else if (GRAPHICS_VER(engine->i915) >= 8 &&
743 engine->class == RENDER_CLASS)
744 return 0xc4;
745 else
746 return -1;
747}
748
749static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
750{
751 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
752 return 0x84;
753 else if (GRAPHICS_VER(engine->i915) >= 12)
754 return 0x74;
755 else if (GRAPHICS_VER(engine->i915) >= 9)
756 return 0x68;
757 else if (engine->class == RENDER_CLASS)
758 return 0xd8;
759 else
760 return -1;
761}
762
763static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
764{
765 if (GRAPHICS_VER(engine->i915) >= 12)
766 return 0x12;
767 else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
768 return 0x18;
769 else
770 return -1;
771}
772
773static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
774{
775 int x;
776
777 x = lrc_ring_wa_bb_per_ctx(engine);
778 if (x < 0)
779 return x;
780
781 return x + 2;
782}
783
784static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
785{
786 int x;
787
788 x = lrc_ring_indirect_ptr(engine);
789 if (x < 0)
790 return x;
791
792 return x + 2;
793}
794
795static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
796{
797
798 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
799 /*
800 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
801 * simply to match the RCS context image layout.
802 */
803 return 0xc6;
804 else if (engine->class != RENDER_CLASS)
805 return -1;
806 else if (GRAPHICS_VER(engine->i915) >= 12)
807 return 0xb6;
808 else if (GRAPHICS_VER(engine->i915) >= 11)
809 return 0xaa;
810 else
811 return -1;
812}
813
814static u32
815lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
816{
817 if (GRAPHICS_VER(engine->i915) >= 12)
818 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
819 else if (GRAPHICS_VER(engine->i915) >= 11)
820 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
821 else if (GRAPHICS_VER(engine->i915) >= 9)
822 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
823 else if (GRAPHICS_VER(engine->i915) >= 8)
824 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
825
826 GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
827
828 return 0;
829}
830
831static void
832lrc_setup_indirect_ctx(u32 *regs,
833 const struct intel_engine_cs *engine,
834 u32 ctx_bb_ggtt_addr,
835 u32 size)
836{
837 GEM_BUG_ON(!size);
838 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
839 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
840 regs[lrc_ring_indirect_ptr(engine) + 1] =
841 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
842
843 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
844 regs[lrc_ring_indirect_offset(engine) + 1] =
845 lrc_ring_indirect_offset_default(engine) << 6;
846}
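
/*
 * Example of the packing above (illustrative numbers): an indirect context
 * batch of 192 bytes placed at GGTT address 0x1000 yields
 * regs[lrc_ring_indirect_ptr(engine) + 1] = 0x1000 | (192 / CACHELINE_BYTES)
 * = 0x1003, i.e. the cacheline-aligned address with the length in cachelines
 * packed into the low bits.
 */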
847
848static void init_common_regs(u32 * const regs,
849 const struct intel_context *ce,
850 const struct intel_engine_cs *engine,
851 bool inhibit)
852{
853 u32 ctl;
854 int loc;
855
856 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
857 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
858 if (inhibit)
859 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
860 if (GRAPHICS_VER(engine->i915) < 11)
861 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
862 CTX_CTRL_RS_CTX_ENABLE);
863 regs[CTX_CONTEXT_CONTROL] = ctl;
864
865 regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
866
867 loc = lrc_ring_bb_offset(engine);
868 if (loc != -1)
869 regs[loc + 1] = 0;
870}
871
872static void init_wa_bb_regs(u32 * const regs,
873 const struct intel_engine_cs *engine)
874{
875 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
876
877 if (wa_ctx->per_ctx.size) {
878 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
879
880 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
881 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
882 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
883 }
884
885 if (wa_ctx->indirect_ctx.size) {
886 lrc_setup_indirect_ctx(regs, engine,
887 i915_ggtt_offset(wa_ctx->vma) +
888 wa_ctx->indirect_ctx.offset,
889 wa_ctx->indirect_ctx.size);
890 }
891}
892
893static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
894{
895 if (i915_vm_is_4lvl(&ppgtt->vm)) {
896 /* 64b PPGTT (48bit canonical)
897 * PDP0_DESCRIPTOR contains the base address to PML4 and
898 * other PDP Descriptors are ignored.
899 */
900 ASSIGN_CTX_PML4(ppgtt, regs);
901 } else {
902 ASSIGN_CTX_PDP(ppgtt, regs, 3);
903 ASSIGN_CTX_PDP(ppgtt, regs, 2);
904 ASSIGN_CTX_PDP(ppgtt, regs, 1);
905 ASSIGN_CTX_PDP(ppgtt, regs, 0);
906 }
907}
908
909static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
910{
911 if (i915_is_ggtt(vm))
912 return i915_vm_to_ggtt(vm)->alias;
913 else
914 return i915_vm_to_ppgtt(vm);
915}
916
917static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
918{
919 int x;
920
921 x = lrc_ring_mi_mode(engine);
922 if (x != -1) {
923 regs[x + 1] &= ~STOP_RING;
924 regs[x + 1] |= STOP_RING << 16;
925 }
926}
927
928static void __lrc_init_regs(u32 *regs,
929 const struct intel_context *ce,
930 const struct intel_engine_cs *engine,
931 bool inhibit)
932{
933 /*
934 * A context is actually a big batch buffer with several
935 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
936 * values we are setting here are only for the first context restore:
937 * on a subsequent save, the GPU will recreate this batchbuffer with new
938 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
939 * we are not initializing here).
940 *
941 * Must keep consistent with virtual_update_register_offsets().
942 */
943
944 if (inhibit)
945 memset(regs, 0, PAGE_SIZE);
946
947 set_offsets(regs, reg_offsets(engine), engine, inhibit);
948
949 init_common_regs(regs, ce, engine, inhibit);
950 init_ppgtt_regs(regs, vm_alias(ce->vm));
951
952 init_wa_bb_regs(regs, engine);
953
954 __reset_stop_ring(regs, engine);
955}
956
957void lrc_init_regs(const struct intel_context *ce,
958 const struct intel_engine_cs *engine,
959 bool inhibit)
960{
961 __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
962}
963
964void lrc_reset_regs(const struct intel_context *ce,
965 const struct intel_engine_cs *engine)
966{
967 __reset_stop_ring(ce->lrc_reg_state, engine);
968}
969
970static void
971set_redzone(void *vaddr, const struct intel_engine_cs *engine)
972{
973 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
974 return;
975
976 vaddr += engine->context_size;
977
978 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
979}
980
981static void
982check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
983{
984 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
985 return;
986
987 vaddr += engine->context_size;
988
989 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
990 drm_err_once(&engine->i915->drm,
991 "%s context redzone overwritten!\n",
992 engine->name);
993}
994
995static u32 context_wa_bb_offset(const struct intel_context *ce)
996{
997 return PAGE_SIZE * ce->wa_bb_page;
998}
999
1000static u32 *context_indirect_bb(const struct intel_context *ce)
1001{
1002 void *ptr;
1003
1004 GEM_BUG_ON(!ce->wa_bb_page);
1005
1006 ptr = ce->lrc_reg_state;
1007 ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1008 ptr += context_wa_bb_offset(ce);
1009
1010 return ptr;
1011}
1012
1013void lrc_init_state(struct intel_context *ce,
1014 struct intel_engine_cs *engine,
1015 void *state)
1016{
1017 bool inhibit = true;
1018
1019 set_redzone(state, engine);
1020
1021 if (engine->default_state) {
1022 shmem_read(engine->default_state, 0,
1023 state, engine->context_size);
1024 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
1025 inhibit = false;
1026 }
1027
1028 /* Clear the ppHWSP (inc. per-context counters) */
1029 memset(state, 0, PAGE_SIZE);
1030
1031 /* Clear the indirect wa and storage */
1032 if (ce->wa_bb_page)
1033 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1034
1035 /*
1036 * The second page of the context object contains some registers which
1037 * must be set up prior to the first execution.
1038 */
1039 __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1040}
1041
1042u32 lrc_indirect_bb(const struct intel_context *ce)
1043{
1044 return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1045}
1046
1047static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1048{
1049 /* If predication is active, this will be noop'ed */
1050 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1051 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1052 *cs++ = 0;
1053 *cs++ = 0; /* No predication */
1054
1055 /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1056 *cs++ = MI_BATCH_BUFFER_END | BIT(15);
1057 *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1058
1059 /* Instructions are no longer predicated (disabled), we can proceed */
1060 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1061 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1062 *cs++ = 0;
1063 *cs++ = 1; /* enable predication before the next BB */
1064
1065 *cs++ = MI_BATCH_BUFFER_END;
1066 GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1067
1068 return cs;
1069}
1070
1071static struct i915_vma *
1072__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1073{
1074 struct drm_i915_gem_object *obj;
1075 struct i915_vma *vma;
1076 u32 context_size;
1077
1078 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1079
1080 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1081 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1082
1083 if (GRAPHICS_VER(engine->i915) >= 12) {
1084 ce->wa_bb_page = context_size / PAGE_SIZE;
1085 context_size += PAGE_SIZE;
1086 }
1087
1088 if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1089 ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1090 context_size += PARENT_SCRATCH_SIZE;
1091 }
1092
1093 obj = i915_gem_object_create_lmem(engine->i915, context_size,
1094 I915_BO_ALLOC_PM_VOLATILE);
1095 if (IS_ERR(obj))
1096 obj = i915_gem_object_create_shmem(engine->i915, context_size);
1097 if (IS_ERR(obj))
1098 return ERR_CAST(obj);
1099
1100 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1101 if (IS_ERR(vma)) {
1102 i915_gem_object_put(obj);
1103 return vma;
1104 }
1105
1106 return vma;
1107}
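
/*
 * Resulting backing-object layout, in the order the sizes are accumulated
 * above (a sketch, not authoritative): the HW context image
 * (engine->context_size, page aligned), an optional redzone page in
 * CONFIG_DRM_I915_DEBUG_GEM builds, an optional indirect wa_bb page on
 * gen12+, and an optional parent scratch area for a GuC parallel-submission
 * parent context.
 */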
1108
1109static struct intel_timeline *
1110pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1111{
1112 struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1113
1114 return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1115}
1116
1117int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1118{
1119 struct intel_ring *ring;
1120 struct i915_vma *vma;
1121 int err;
1122
1123 GEM_BUG_ON(ce->state);
1124
1125 vma = __lrc_alloc_state(ce, engine);
1126 if (IS_ERR(vma))
1127 return PTR_ERR(vma);
1128
1129 ring = intel_engine_create_ring(engine, ce->ring_size);
1130 if (IS_ERR(ring)) {
1131 err = PTR_ERR(ring);
1132 goto err_vma;
1133 }
1134
1135 if (!page_mask_bits(ce->timeline)) {
1136 struct intel_timeline *tl;
1137
1138 /*
1139 * Use the static global HWSP for the kernel context, and
1140 * a dynamically allocated cacheline for everyone else.
1141 */
1142 if (unlikely(ce->timeline))
1143 tl = pinned_timeline(ce, engine);
1144 else
1145 tl = intel_timeline_create(engine->gt);
1146 if (IS_ERR(tl)) {
1147 err = PTR_ERR(tl);
1148 goto err_ring;
1149 }
1150
1151 ce->timeline = tl;
1152 }
1153
1154 ce->ring = ring;
1155 ce->state = vma;
1156
1157 return 0;
1158
1159err_ring:
1160 intel_ring_put(ring);
1161err_vma:
1162 i915_vma_put(vma);
1163 return err;
1164}
1165
1166void lrc_reset(struct intel_context *ce)
1167{
1168 GEM_BUG_ON(!intel_context_is_pinned(ce));
1169
1170 intel_ring_reset(ce->ring, ce->ring->emit);
1171
1172 /* Scrub away the garbage */
1173 lrc_init_regs(ce, ce->engine, true);
1174 ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1175}
1176
1177int
1178lrc_pre_pin(struct intel_context *ce,
1179 struct intel_engine_cs *engine,
1180 struct i915_gem_ww_ctx *ww,
1181 void **vaddr)
1182{
1183 GEM_BUG_ON(!ce->state);
1184 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1185
1186 *vaddr = i915_gem_object_pin_map(ce->state->obj,
1187 i915_coherent_map_type(ce->engine->i915,
1188 ce->state->obj,
1189 false) |
1190 I915_MAP_OVERRIDE);
1191
1192 return PTR_ERR_OR_ZERO(*vaddr);
1193}
1194
1195int
1196lrc_pin(struct intel_context *ce,
1197 struct intel_engine_cs *engine,
1198 void *vaddr)
1199{
1200 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1201
1202 if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1203 lrc_init_state(ce, engine, vaddr);
1204
1205 ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1206 return 0;
1207}
1208
1209void lrc_unpin(struct intel_context *ce)
1210{
1211 if (unlikely(ce->parallel.last_rq)) {
1212 i915_request_put(ce->parallel.last_rq);
1213 ce->parallel.last_rq = NULL;
1214 }
1215 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1216 ce->engine);
1217}
1218
1219void lrc_post_unpin(struct intel_context *ce)
1220{
1221 i915_gem_object_unpin_map(ce->state->obj);
1222}
1223
1224void lrc_fini(struct intel_context *ce)
1225{
1226 if (!ce->state)
1227 return;
1228
1229 intel_ring_put(fetch_and_zero(&ce->ring));
1230 i915_vma_put(fetch_and_zero(&ce->state));
1231}
1232
1233void lrc_destroy(struct kref *kref)
1234{
1235 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1236
1237 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1238 GEM_BUG_ON(intel_context_is_pinned(ce));
1239
1240 lrc_fini(ce);
1241
1242 intel_context_fini(ce);
1243 intel_context_free(ce);
1244}
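
/*
 * Typical lifecycle of the helpers above, as suggested by their pairing:
 * lrc_alloc() creates the state vma, ring and timeline; lrc_pre_pin() maps
 * the state object and lrc_pin() initialises the image on first use and
 * computes ce->lrc.lrca; lrc_unpin() drops any parallel request reference and
 * checks the debug redzone, while lrc_post_unpin() drops the mapping;
 * lrc_fini()/lrc_destroy() release everything once the context is idle.
 */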
1245
1246static u32 *
1247gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1248{
1249 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1250 MI_SRM_LRM_GLOBAL_GTT |
1251 MI_LRI_LRM_CS_MMIO;
1252 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1253 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1254 CTX_TIMESTAMP * sizeof(u32);
1255 *cs++ = 0;
1256
1257 *cs++ = MI_LOAD_REGISTER_REG |
1258 MI_LRR_SOURCE_CS_MMIO |
1259 MI_LRI_LRM_CS_MMIO;
1260 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1261 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1262
1263 *cs++ = MI_LOAD_REGISTER_REG |
1264 MI_LRR_SOURCE_CS_MMIO |
1265 MI_LRI_LRM_CS_MMIO;
1266 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1267 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1268
1269 return cs;
1270}
1271
1272static u32 *
1273gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1274{
1275 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1276
1277 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1278 MI_SRM_LRM_GLOBAL_GTT |
1279 MI_LRI_LRM_CS_MMIO;
1280 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1281 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1282 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1283 *cs++ = 0;
1284
1285 return cs;
1286}
1287
1288static u32 *
1289gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1290{
1291 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1292
1293 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1294 MI_SRM_LRM_GLOBAL_GTT |
1295 MI_LRI_LRM_CS_MMIO;
1296 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1297 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1298 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1299 *cs++ = 0;
1300
1301 *cs++ = MI_LOAD_REGISTER_REG |
1302 MI_LRR_SOURCE_CS_MMIO |
1303 MI_LRI_LRM_CS_MMIO;
1304 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1305 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1306
1307 return cs;
1308}
1309
1310/*
1311 * On DG2 during context restore of a preempted context in GPGPU mode,
1312 * an RCS restore hang is detected. This is extremely timing dependent.
1313 * To address this, the SW wabb below is implemented for DG2 A steppings.
1314 */
1315static u32 *
1316dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
1317{
1318 *cs++ = MI_LOAD_REGISTER_IMM(1);
1319 *cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
1320 *cs++ = 0x21;
1321
1322 *cs++ = MI_LOAD_REGISTER_REG;
1323 *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1324 *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);
1325
1326 *cs++ = MI_LOAD_REGISTER_REG;
1327 *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1328 *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);
1329
1330 return cs;
1331}
1332
1333/*
1334 * The bspec's tuning guide asks us to program a vertical watermark value of
1335 * 0x3FF. However this register is not saved/restored properly by the
1336 * hardware, so we're required to apply the desired value via INDIRECT_CTX
1337 * batch buffer to ensure the value takes effect properly. All other bits
1338 * in this register should remain at 0 (the hardware default).
1339 */
1340static u32 *
1341dg2_emit_draw_watermark_setting(u32 *cs)
1342{
1343 *cs++ = MI_LOAD_REGISTER_IMM(1);
1344 *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1345 *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1346
1347 return cs;
1348}
1349
1350static u32 *
1351gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1352{
1353 cs = gen12_emit_timestamp_wa(ce, cs);
1354 cs = gen12_emit_cmd_buf_wa(ce, cs);
1355 cs = gen12_emit_restore_scratch(ce, cs);
1356
1357 /* Wa_22011450934:dg2 */
1358 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
1359 IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
1360 cs = dg2_emit_rcs_hang_wabb(ce, cs);
1361
1362 /* Wa_16013000631:dg2 */
1363 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1364 IS_DG2_G11(ce->engine->i915))
1365 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1366
1367 /* hsdes: 1809175790 */
1368 if (!HAS_FLAT_CCS(ce->engine->i915))
1369 cs = gen12_emit_aux_table_inv(ce->engine->gt,
1370 cs, GEN12_GFX_CCS_AUX_NV);
1371
1372 /* Wa_16014892111 */
1373 if (IS_DG2(ce->engine->i915))
1374 cs = dg2_emit_draw_watermark_setting(cs);
1375
1376 return cs;
1377}
1378
1379static u32 *
1380gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1381{
1382 cs = gen12_emit_timestamp_wa(ce, cs);
1383 cs = gen12_emit_restore_scratch(ce, cs);
1384
1385 /* Wa_16013000631:dg2 */
1386 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1387 IS_DG2_G11(ce->engine->i915))
1388 if (ce->engine->class == COMPUTE_CLASS)
1389 cs = gen8_emit_pipe_control(cs,
1390 PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1391 0);
1392
1393 /* hsdes: 1809175790 */
1394 if (!HAS_FLAT_CCS(ce->engine->i915)) {
1395 if (ce->engine->class == VIDEO_DECODE_CLASS)
1396 cs = gen12_emit_aux_table_inv(ce->engine->gt,
1397 cs, GEN12_VD0_AUX_NV);
1398 else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
1399 cs = gen12_emit_aux_table_inv(ce->engine->gt,
1400 cs, GEN12_VE0_AUX_NV);
1401 }
1402
1403 return cs;
1404}
1405
1406static void
1407setup_indirect_ctx_bb(const struct intel_context *ce,
1408 const struct intel_engine_cs *engine,
1409 u32 *(*emit)(const struct intel_context *, u32 *))
1410{
1411 u32 * const start = context_indirect_bb(ce);
1412 u32 *cs;
1413
1414 cs = emit(ce, start);
1415 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1416 while ((unsigned long)cs % CACHELINE_BYTES)
1417 *cs++ = MI_NOOP;
1418
1419 GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1420 setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1421
1422 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1423 lrc_indirect_bb(ce),
1424 (cs - start) * sizeof(*cs));
1425}
1426
1427/*
1428 * The context descriptor encodes various attributes of a context,
1429 * including its GTT address and some flags. Because it's fairly
1430 * expensive to calculate, we'll just do it once and cache the result,
1431 * which remains valid until the context is unpinned.
1432 *
1433 * This is what a descriptor looks like, from LSB to MSB::
1434 *
1435 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
1436 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
1437 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
1438 * bits 53-54: mbz, reserved for use by hardware
1439 * bits 55-63: group ID, currently unused and set to 0
1440 *
1441 * Starting from Gen11, the upper dword of the descriptor has a new format:
1442 *
1443 * bits 32-36: reserved
1444 * bits 37-47: SW context ID
1445 * bits 48-53: engine instance
1446 * bit 54: mbz, reserved for use by hardware
1447 * bits 55-60: SW counter
1448 * bits 61-63: engine class
1449 *
1450 * On Xe_HP, the upper dword of the descriptor has a new format:
1451 *
1452 * bits 32-37: virtual function number
1453 * bit 38: mbz, reserved for use by hardware
1454 * bits 39-54: SW context ID
1455 * bits 55-57: reserved
1456 * bits 58-63: SW counter
1457 *
1458 * engine info, SW context ID and SW counter need to form a unique number
1459 * (Context ID) per lrc.
1460 */
1461static u32 lrc_descriptor(const struct intel_context *ce)
1462{
1463 u32 desc;
1464
1465 desc = INTEL_LEGACY_32B_CONTEXT;
1466 if (i915_vm_is_4lvl(ce->vm))
1467 desc = INTEL_LEGACY_64B_CONTEXT;
1468 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1469
1470 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1471 if (GRAPHICS_VER(ce->vm->i915) == 8)
1472 desc |= GEN8_CTX_L3LLC_COHERENT;
1473
1474 return i915_ggtt_offset(ce->state) | desc;
1475}
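
/*
 * lrc_descriptor() above only assembles the lower dword (addressing-mode
 * flags + LRCA). The gen11+ fields documented in the comment above (SW
 * context ID, engine instance, SW counter, engine class) live in the upper
 * dword and are filled in elsewhere. Purely as an illustration of that bit
 * layout (the field variables here are hypothetical):
 *
 *	u64 desc = lrc_descriptor(ce);
 *
 *	desc |= (u64)sw_ctx_id    << 37;	(bits 37-47)
 *	desc |= (u64)instance     << 48;	(bits 48-53)
 *	desc |= (u64)sw_counter   << 55;	(bits 55-60)
 *	desc |= (u64)engine_class << 61;	(bits 61-63)
 */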
1476
1477u32 lrc_update_regs(const struct intel_context *ce,
1478 const struct intel_engine_cs *engine,
1479 u32 head)
1480{
1481 struct intel_ring *ring = ce->ring;
1482 u32 *regs = ce->lrc_reg_state;
1483
1484 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1485 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1486
1487 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1488 regs[CTX_RING_HEAD] = head;
1489 regs[CTX_RING_TAIL] = ring->tail;
1490 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1491
1492 /* RPCS */
1493 if (engine->class == RENDER_CLASS) {
1494 regs[CTX_R_PWR_CLK_STATE] =
1495 intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1496
1497 i915_oa_init_reg_state(ce, engine);
1498 }
1499
1500 if (ce->wa_bb_page) {
1501 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1502
1503 fn = gen12_emit_indirect_ctx_xcs;
1504 if (ce->engine->class == RENDER_CLASS)
1505 fn = gen12_emit_indirect_ctx_rcs;
1506
1507 /* Mutually exclusive wrt to global indirect bb */
1508 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1509 setup_indirect_ctx_bb(ce, engine, fn);
1510 }
1511
1512 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1513}
1514
1515void lrc_update_offsets(struct intel_context *ce,
1516 struct intel_engine_cs *engine)
1517{
1518 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1519}
1520
1521void lrc_check_regs(const struct intel_context *ce,
1522 const struct intel_engine_cs *engine,
1523 const char *when)
1524{
1525 const struct intel_ring *ring = ce->ring;
1526 u32 *regs = ce->lrc_reg_state;
1527 bool valid = true;
1528 int x;
1529
1530 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1531 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1532 engine->name,
1533 regs[CTX_RING_START],
1534 i915_ggtt_offset(ring->vma));
1535 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1536 valid = false;
1537 }
1538
1539 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1540 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1541 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1542 engine->name,
1543 regs[CTX_RING_CTL],
1544 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1545 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1546 valid = false;
1547 }
1548
1549 x = lrc_ring_mi_mode(engine);
1550 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1551 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1552 engine->name, regs[x + 1]);
1553 regs[x + 1] &= ~STOP_RING;
1554 regs[x + 1] |= STOP_RING << 16;
1555 valid = false;
1556 }
1557
1558 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1559}
1560
1561/*
1562 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1563 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1564 * but there is a slight complication as this is applied in WA batch where the
1565 * values are only initialized once, so we cannot read the register value at the
1566 * beginning and reuse it later; hence we save its value to memory, upload a
1567 * constant value with bit21 set and then restore it from the saved value.
1568 * To simplify the WA, a constant value is formed by using the default value
1569 * of this register. This shouldn't be a problem because we are only modifying
1570 * it for a short period and this batch is non-preemptible. We can of course
1571 * use additional instructions that read the actual value of the register
1572 * at that time and set our bit of interest but it makes the WA complicated.
1573 *
1574 * This WA is also required for Gen9 so extracting as a function avoids
1575 * code duplication.
1576 */
1577static u32 *
1578gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1579{
1580 /* NB no one else is allowed to scribble over scratch + 256! */
1581 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1582 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1583 *batch++ = intel_gt_scratch_offset(engine->gt,
1584 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1585 *batch++ = 0;
1586
1587 *batch++ = MI_LOAD_REGISTER_IMM(1);
1588 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1589 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1590
1591 batch = gen8_emit_pipe_control(batch,
1592 PIPE_CONTROL_CS_STALL |
1593 PIPE_CONTROL_DC_FLUSH_ENABLE,
1594 0);
1595
1596 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1597 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1598 *batch++ = intel_gt_scratch_offset(engine->gt,
1599 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1600 *batch++ = 0;
1601
1602 return batch;
1603}
1604
/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on some criteria. At the moment this batch always starts at the beginning of
 * the page and we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END, so it adds NOOPs
 * as padding to make it cacheline aligned. MI_BATCH_BUFFER_END will be added
 * to the per-ctx batch, and both of them together make a complete batch
 * buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX.
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

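/*
 * Emit a single MI_LOAD_REGISTER_IMM packet programming @count register
 * (offset, value) pairs, followed by an MI_NOOP. For example, count == 2
 * emits the dwords:
 *
 *	MI_LOAD_REGISTER_IMM(2)
 *	i915_mmio_reg_offset(reg0), value0
 *	i915_mmio_reg_offset(reg1), value1
 *	MI_NOOP
 *
 * The GEM_BUG_ON below restricts callers to between 1 and 63 pairs.
 */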
static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * The EU pool configuration is set up along with the golden
		 * context during context initialization. The value depends on
		 * the device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled, especially for 2x6 devices.
		 * However, it is safe to load the default 3x6 configuration
		 * instead of masking off the corresponding bits, because the
		 * HW ignores bits of a disabled subslice and drops down to
		 * the appropriate config. See render_state_setup() in
		 * i915_gem_render_state.c for the possible configurations;
		 * to avoid duplication they are not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

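/*
 * Allocate the backing object and GGTT vma for the per-engine context
 * workaround batches. The buffer is only created here; it is pinned,
 * filled and flushed later from lrc_init_wa_ctx().
 */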
static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

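/*
 * Build the per-engine context workaround batches: pick the indirect-ctx
 * and per-ctx emitters for this graphics version, pin the wa_ctx buffer and
 * write the batches into it, recording each batch's offset and size for the
 * context image. Failure here is not fatal; the engine simply runs without
 * the workaround batches.
 */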
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (GRAPHICS_VER(engine->i915) >= 11 ||
	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
		return;

	if (GRAPHICS_VER(engine->i915) == 9) {
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	} else if (GRAPHICS_VER(engine->i915) == 8) {
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches, nothing critical
		 * enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

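	/*
	 * Pin and map the buffer under a ww transaction: if the object lock
	 * or the GGTT pin returns -EDEADLK, the err path below backs off and
	 * retries from here.
	 */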
	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

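/*
 * Selftest-only bookkeeping: record how often, and by how much, the sampled
 * context runtime went backwards. Compiled out unless
 * CONFIG_DRM_I915_SELFTEST is enabled.
 */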
static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	stats->runtime.num_underflow++;
	stats->runtime.max_underflow =
		max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}

static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

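/*
 * Fold the CTX_TIMESTAMP delta since the last sample into the context's
 * runtime statistics (moving average and running total). A negative delta,
 * e.g. after the saved timestamp has been reset, is dropped and only
 * recorded as an underflow.
 */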
void lrc_update_runtime(struct intel_context *ce)
{
	struct intel_context_stats *stats = &ce->stats;
	u32 old;
	s32 dt;

	old = stats->runtime.last;
	stats->runtime.last = lrc_get_runtime(ce);
	dt = stats->runtime.last - old;
	if (!dt)
		return;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, stats->runtime.last, dt);
		st_runtime_underflow(stats, dt);
		return;
	}

	ewma_runtime_add(&stats->runtime.avg, dt);
	stats->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif