   1/*
   2 * Copyright © 2014 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * Authors:
  24 *    Ben Widawsky <ben@bwidawsk.net>
  25 *    Michel Thierry <michel.thierry@intel.com>
  26 *    Thomas Daniel <thomas.daniel@intel.com>
  27 *    Oscar Mateo <oscar.mateo@intel.com>
  28 *
  29 */
  30
  31/**
  32 * DOC: Logical Rings, Logical Ring Contexts and Execlists
  33 *
  34 * Motivation:
  35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
  36 * These expanded contexts enable a number of new abilities, especially
  37 * "Execlists" (also implemented in this file).
  38 *
  39 * One of the main differences with the legacy HW contexts is that logical
   40 * ring contexts incorporate many more things into the context's state, like
  41 * PDPs or ringbuffer control registers:
  42 *
  43 * The reason why PDPs are included in the context is straightforward: as
  44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
   45 * contained there means you don't need to do a ppgtt->switch_mm yourself;
  46 * instead, the GPU will do it for you on the context switch.
  47 *
   48 * But what about the ringbuffer control registers (head, tail, etc.)?
   49 * Shouldn't we just need a set of those per engine command streamer? This is
  50 * where the name "Logical Rings" starts to make sense: by virtualizing the
  51 * rings, the engine cs shifts to a new "ring buffer" with every context
  52 * switch. When you want to submit a workload to the GPU you: A) choose your
  53 * context, B) find its appropriate virtualized ring, C) write commands to it
  54 * and then, finally, D) tell the GPU to switch to that context.
  55 *
  56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
   57 * to a context is via a context execution list, ergo "Execlists".
  58 *
  59 * LRC implementation:
  60 * Regarding the creation of contexts, we have:
  61 *
  62 * - One global default context.
  63 * - One local default context for each opened fd.
  64 * - One local extra context for each context create ioctl call.
  65 *
   66 * Now that ringbuffers belong to the context (and not to the engine, like
   67 * before) and that contexts are uniquely tied to a given engine (and not
   68 * reusable, like before), we need:
  69 *
  70 * - One ringbuffer per-engine inside each context.
  71 * - One backing object per-engine inside each context.
  72 *
  73 * The global default context starts its life with these new objects fully
  74 * allocated and populated. The local default context for each opened fd is
  75 * more complex, because we don't know at creation time which engine is going
  76 * to use them. To handle this, we have implemented a deferred creation of LR
  77 * contexts:
  78 *
   79 * The local context starts its life as a hollow or blank holder that only
  80 * gets populated for a given engine once we receive an execbuffer. If later
  81 * on we receive another execbuffer ioctl for the same context but a different
  82 * engine, we allocate/populate a new ringbuffer and context backing object and
  83 * so on.
  84 *
  85 * Finally, regarding local contexts created using the ioctl call: as they are
  86 * only allowed with the render ring, we can allocate & populate them right
  87 * away (no need to defer anything, at least for now).
  88 *
  89 * Execlists implementation:
  90 * Execlists are the new method by which, on gen8+ hardware, workloads are
   91 * submitted for execution (as opposed to the legacy, ringbuffer-based method).
  92 * This method works as follows:
  93 *
  94 * When a request is committed, its commands (the BB start and any leading or
  95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
  96 * for the appropriate context. The tail pointer in the hardware context is not
  97 * updated at this time, but instead, kept by the driver in the ringbuffer
  98 * structure. A structure representing this request is added to a request queue
  99 * for the appropriate engine: this structure contains a copy of the context's
 100 * tail after the request was written to the ring buffer and a pointer to the
 101 * context itself.
 102 *
 103 * If the engine's request queue was empty before the request was added, the
 104 * queue is processed immediately. Otherwise the queue will be processed during
 105 * a context switch interrupt. In any case, elements on the queue will get sent
 106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
  107 * globally unique 20-bit submission ID.
 108 *
 109 * When execution of a request completes, the GPU updates the context status
 110 * buffer with a context complete event and generates a context switch interrupt.
 111 * During the interrupt handling, the driver examines the events in the buffer:
 112 * for each context complete event, if the announced ID matches that on the head
 113 * of the request queue, then that request is retired and removed from the queue.
 114 *
 115 * After processing, if any requests were retired and the queue is not empty
 116 * then a new execution list can be submitted. The two requests at the front of
 117 * the queue are next to be submitted but since a context may not occur twice in
 118 * an execution list, if subsequent requests have the same ID as the first then
 119 * the two requests must be combined. This is done simply by discarding requests
  120 * at the head of the queue until either only one request is left (in which case
 121 * we use a NULL second context) or the first two requests have unique IDs.
 122 *
 123 * By always executing the first two requests in the queue the driver ensures
 124 * that the GPU is kept as busy as possible. In the case where a single context
 125 * completes but a second context is still executing, the request for this second
 126 * context will be at the head of the queue when we remove the first one. This
 127 * request will then be resubmitted along with a new request for a different context,
 128 * which will cause the hardware to continue executing the second request and queue
 129 * the new request (the GPU detects the condition of a context getting preempted
 130 * with the same context and optimizes the context switch flow by not doing
 131 * preemption, but just sampling the new tail pointer).
 132 *
 133 */
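/*
 * Illustrative sketch only (not part of the driver): a minimal model of the
 * ELSP pairing rule described above. Requests at the head of the queue that
 * share the first context are merged, and at most one request from a second,
 * different context fills the other port; if no second context is runnable,
 * the second port stays NULL. All names below are hypothetical.
 */
#include <stddef.h>

struct demo_request {
	unsigned int ctx_id;		/* stand-in for the request's context */
};

/* Fill up to two ELSP ports from a FIFO of runnable requests. */
static size_t demo_fill_elsp(struct demo_request *queue, size_t count,
			     struct demo_request *elsp[2])
{
	size_t i = 0, used = 0;

	elsp[0] = elsp[1] = NULL;
	if (!count)
		return 0;

	elsp[0] = &queue[i++];
	used = 1;

	/* Skip (i.e. merge) subsequent requests from the same context... */
	while (i < count && queue[i].ctx_id == elsp[0]->ctx_id)
		i++;

	/* ...then take the first request of a different context, if any. */
	if (i < count) {
		elsp[1] = &queue[i];
		used = 2;
	}

	return used;
}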
 134#include <linux/interrupt.h>
 135
 136#include "i915_drv.h"
 137#include "i915_perf.h"
 138#include "i915_trace.h"
 139#include "i915_vgpu.h"
 140#include "intel_context.h"
 141#include "intel_engine_pm.h"
 142#include "intel_gt.h"
 143#include "intel_gt_pm.h"
 144#include "intel_gt_requests.h"
 145#include "intel_lrc_reg.h"
 146#include "intel_mocs.h"
 147#include "intel_reset.h"
 148#include "intel_ring.h"
 149#include "intel_workarounds.h"
 150#include "shmem_utils.h"
 151
 152#define RING_EXECLIST_QFULL		(1 << 0x2)
 153#define RING_EXECLIST1_VALID		(1 << 0x3)
 154#define RING_EXECLIST0_VALID		(1 << 0x4)
 155#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
 156#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
 157#define RING_EXECLIST0_ACTIVE		(1 << 0x12)
 158
 159#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
 160#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
 161#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
 162#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
 163#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
 164#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
 165
 166#define GEN8_CTX_STATUS_COMPLETED_MASK \
 167	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
 168
 169#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
 170
 171#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
 172#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
 173#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
 174#define GEN12_IDLE_CTX_ID		0x7FF
 175#define GEN12_CSB_CTX_VALID(csb_dw) \
 176	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
 177
 178/* Typical size of the average request (2 pipecontrols and a MI_BB) */
 179#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
 180
 181struct virtual_engine {
 182	struct intel_engine_cs base;
 183	struct intel_context context;
 184
 185	/*
 186	 * We allow only a single request through the virtual engine at a time
 187	 * (each request in the timeline waits for the completion fence of
 188	 * the previous before being submitted). By restricting ourselves to
 189	 * only submitting a single request, each request is placed on to a
  190	 * physical engine to maximise load spreading (by virtue of the late greedy
 191	 * scheduling -- each real engine takes the next available request
 192	 * upon idling).
 193	 */
 194	struct i915_request *request;
 195
 196	/*
  197	 * We keep an rbtree of available virtual engines inside each physical
 198	 * engine, sorted by priority. Here we preallocate the nodes we need
 199	 * for the virtual engine, indexed by physical_engine->id.
 200	 */
 201	struct ve_node {
 202		struct rb_node rb;
 203		int prio;
 204	} nodes[I915_NUM_ENGINES];
 205
 206	/*
  207	 * Keep track of bonded pairs -- restrictions upon our selection
 208	 * of physical engines any particular request may be submitted to.
 209	 * If we receive a submit-fence from a master engine, we will only
 210	 * use one of sibling_mask physical engines.
 211	 */
 212	struct ve_bond {
 213		const struct intel_engine_cs *master;
 214		intel_engine_mask_t sibling_mask;
 215	} *bonds;
 216	unsigned int num_bonds;
 217
 218	/* And finally, which physical engines this virtual engine maps onto. */
 219	unsigned int num_siblings;
 220	struct intel_engine_cs *siblings[];
 221};
 222
 223static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
 224{
 225	GEM_BUG_ON(!intel_engine_is_virtual(engine));
 226	return container_of(engine, struct virtual_engine, base);
 227}
 228
 229static int __execlists_context_alloc(struct intel_context *ce,
 230				     struct intel_engine_cs *engine);
 231
 232static void execlists_init_reg_state(u32 *reg_state,
 233				     const struct intel_context *ce,
 234				     const struct intel_engine_cs *engine,
 235				     const struct intel_ring *ring,
 236				     bool close);
 237static void
 238__execlists_update_reg_state(const struct intel_context *ce,
 239			     const struct intel_engine_cs *engine,
 240			     u32 head);
 241
 242static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 243{
 244	if (INTEL_GEN(engine->i915) >= 12)
 245		return 0x60;
 246	else if (INTEL_GEN(engine->i915) >= 9)
 247		return 0x54;
 248	else if (engine->class == RENDER_CLASS)
 249		return 0x58;
 250	else
 251		return -1;
 252}
 253
 254static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 255{
 256	if (INTEL_GEN(engine->i915) >= 12)
 257		return 0x74;
 258	else if (INTEL_GEN(engine->i915) >= 9)
 259		return 0x68;
 260	else if (engine->class == RENDER_CLASS)
 261		return 0xd8;
 262	else
 263		return -1;
 264}
 265
 266static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 267{
 268	if (INTEL_GEN(engine->i915) >= 12)
 269		return 0x12;
 270	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 271		return 0x18;
 272	else
 273		return -1;
 274}
 275
 276static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 277{
 278	int x;
 279
 280	x = lrc_ring_wa_bb_per_ctx(engine);
 281	if (x < 0)
 282		return x;
 283
 284	return x + 2;
 285}
 286
 287static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 288{
 289	int x;
 290
 291	x = lrc_ring_indirect_ptr(engine);
 292	if (x < 0)
 293		return x;
 294
 295	return x + 2;
 296}
 297
 298static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 299{
 300	if (engine->class != RENDER_CLASS)
 301		return -1;
 302
 303	if (INTEL_GEN(engine->i915) >= 12)
 304		return 0xb6;
 305	else if (INTEL_GEN(engine->i915) >= 11)
 306		return 0xaa;
 307	else
 308		return -1;
 309}
 310
 311static u32
 312lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 313{
 314	switch (INTEL_GEN(engine->i915)) {
 315	default:
 316		MISSING_CASE(INTEL_GEN(engine->i915));
 317		fallthrough;
 318	case 12:
 319		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 320	case 11:
 321		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 322	case 10:
 323		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 324	case 9:
 325		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 326	case 8:
 327		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 328	}
 329}
 330
 331static void
 332lrc_ring_setup_indirect_ctx(u32 *regs,
 333			    const struct intel_engine_cs *engine,
 334			    u32 ctx_bb_ggtt_addr,
 335			    u32 size)
 336{
 337	GEM_BUG_ON(!size);
 338	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
 339	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
 340	regs[lrc_ring_indirect_ptr(engine) + 1] =
 341		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
 342
 343	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
 344	regs[lrc_ring_indirect_offset(engine) + 1] =
 345		lrc_ring_indirect_offset_default(engine) << 6;
 346}
 347
 348static u32 intel_context_get_runtime(const struct intel_context *ce)
 349{
 350	/*
 351	 * We can use either ppHWSP[16] which is recorded before the context
 352	 * switch (and so excludes the cost of context switches) or use the
 353	 * value from the context image itself, which is saved/restored earlier
 354	 * and so includes the cost of the save.
 355	 */
 356	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
 357}
 358
 359static void mark_eio(struct i915_request *rq)
 360{
 361	if (i915_request_completed(rq))
 362		return;
 363
 364	GEM_BUG_ON(i915_request_signaled(rq));
 365
 366	i915_request_set_error_once(rq, -EIO);
 367	i915_request_mark_complete(rq);
 368}
 369
 370static struct i915_request *
 371active_request(const struct intel_timeline * const tl, struct i915_request *rq)
 372{
 373	struct i915_request *active = rq;
 374
 375	rcu_read_lock();
 376	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
 377		if (i915_request_completed(rq))
 378			break;
 379
 380		active = rq;
 381	}
 382	rcu_read_unlock();
 383
 384	return active;
 385}
 386
 387static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
 388{
 389	return (i915_ggtt_offset(engine->status_page.vma) +
 390		I915_GEM_HWS_PREEMPT_ADDR);
 391}
 392
 393static inline void
 394ring_set_paused(const struct intel_engine_cs *engine, int state)
 395{
 396	/*
 397	 * We inspect HWS_PREEMPT with a semaphore inside
 398	 * engine->emit_fini_breadcrumb. If the dword is true,
 399	 * the ring is paused as the semaphore will busywait
 400	 * until the dword is false.
 401	 */
 402	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
 403	if (state)
 404		wmb();
 405}
 406
 407static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 408{
 409	return rb_entry(rb, struct i915_priolist, node);
 410}
 411
 412static inline int rq_prio(const struct i915_request *rq)
 413{
 414	return READ_ONCE(rq->sched.attr.priority);
 415}
 416
 417static int effective_prio(const struct i915_request *rq)
 418{
 419	int prio = rq_prio(rq);
 420
 421	/*
 422	 * If this request is special and must not be interrupted at any
 423	 * cost, so be it. Note we are only checking the most recent request
 424	 * in the context and so may be masking an earlier vip request. It
 425	 * is hoped that under the conditions where nopreempt is used, this
 426	 * will not matter (i.e. all requests to that context will be
 427	 * nopreempt for as long as desired).
 428	 */
 429	if (i915_request_has_nopreempt(rq))
 430		prio = I915_PRIORITY_UNPREEMPTABLE;
 431
 432	return prio;
 433}
 434
 435static int queue_prio(const struct intel_engine_execlists *execlists)
 436{
 437	struct i915_priolist *p;
 438	struct rb_node *rb;
 439
 440	rb = rb_first_cached(&execlists->queue);
 441	if (!rb)
 442		return INT_MIN;
 443
 444	/*
 445	 * As the priolist[] are inverted, with the highest priority in [0],
  446	 * we have to flip the index value to recover the priority.
 447	 */
 448	p = to_priolist(rb);
 449	if (!I915_USER_PRIORITY_SHIFT)
 450		return p->priority;
 451
 452	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
 453}
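/*
 * Worked example for the flattening above (illustration only; assumes a
 * user-priority shift of 2, which need not match the driver's configured
 * I915_USER_PRIORITY_SHIFT): a request with user priority 2 and the highest
 * internal bump of 3 has effective priority (2 << 2) + 3 = 11. Because the
 * buckets are inverted, that highest bump maps to the lowest bit of
 * p->used, so ffs(p->used) == 1 and the expression above recovers
 * ((2 + 1) << 2) - 1 = 11.
 */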
 454
 455static inline bool need_preempt(const struct intel_engine_cs *engine,
 456				const struct i915_request *rq,
 457				struct rb_node *rb)
 458{
 459	int last_prio;
 460
 461	if (!intel_engine_has_semaphores(engine))
 462		return false;
 463
 464	/*
 465	 * Check if the current priority hint merits a preemption attempt.
 466	 *
 467	 * We record the highest value priority we saw during rescheduling
 468	 * prior to this dequeue, therefore we know that if it is strictly
  469	 * less than the current tail of ELSP[0], we do not need to force
 470	 * a preempt-to-idle cycle.
 471	 *
 472	 * However, the priority hint is a mere hint that we may need to
 473	 * preempt. If that hint is stale or we may be trying to preempt
 474	 * ourselves, ignore the request.
 475	 *
 476	 * More naturally we would write
 477	 *      prio >= max(0, last);
 478	 * except that we wish to prevent triggering preemption at the same
 479	 * priority level: the task that is running should remain running
 480	 * to preserve FIFO ordering of dependencies.
 481	 */
 482	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
 483	if (engine->execlists.queue_priority_hint <= last_prio)
 484		return false;
 485
 486	/*
 487	 * Check against the first request in ELSP[1], it will, thanks to the
 488	 * power of PI, be the highest priority of that context.
 489	 */
 490	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
 491	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
 492		return true;
 493
 494	if (rb) {
 495		struct virtual_engine *ve =
 496			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
 497		bool preempt = false;
 498
 499		if (engine == ve->siblings[0]) { /* only preempt one sibling */
 500			struct i915_request *next;
 501
 502			rcu_read_lock();
 503			next = READ_ONCE(ve->request);
 504			if (next)
 505				preempt = rq_prio(next) > last_prio;
 506			rcu_read_unlock();
 507		}
 508
 509		if (preempt)
 510			return preempt;
 511	}
 512
 513	/*
 514	 * If the inflight context did not trigger the preemption, then maybe
 515	 * it was the set of queued requests? Pick the highest priority in
 516	 * the queue (the first active priolist) and see if it deserves to be
 517	 * running instead of ELSP[0].
 518	 *
  519	 * The highest priority request in the queue cannot be either
  520	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
  521	 * context, its priority would not exceed ELSP[0] aka last_prio.
 522	 */
 523	return queue_prio(&engine->execlists) > last_prio;
 524}
 525
 526__maybe_unused static inline bool
 527assert_priority_queue(const struct i915_request *prev,
 528		      const struct i915_request *next)
 529{
 530	/*
 531	 * Without preemption, the prev may refer to the still active element
 532	 * which we refuse to let go.
 533	 *
 534	 * Even with preemption, there are times when we think it is better not
 535	 * to preempt and leave an ostensibly lower priority request in flight.
 536	 */
 537	if (i915_request_is_active(prev))
 538		return true;
 539
 540	return rq_prio(prev) >= rq_prio(next);
 541}
 542
 543/*
 544 * The context descriptor encodes various attributes of a context,
 545 * including its GTT address and some flags. Because it's fairly
 546 * expensive to calculate, we'll just do it once and cache the result,
 547 * which remains valid until the context is unpinned.
 548 *
 549 * This is what a descriptor looks like, from LSB to MSB::
 550 *
 551 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 552 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 553 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 554 *      bits 53-54:    mbz, reserved for use by hardware
 555 *      bits 55-63:    group ID, currently unused and set to 0
 556 *
 557 * Starting from Gen11, the upper dword of the descriptor has a new format:
 558 *
 559 *      bits 32-36:    reserved
 560 *      bits 37-47:    SW context ID
  561 *      bits 48-53:    engine instance
 562 *      bit 54:        mbz, reserved for use by hardware
 563 *      bits 55-60:    SW counter
 564 *      bits 61-63:    engine class
 565 *
 566 * engine info, SW context ID and SW counter need to form a unique number
 567 * (Context ID) per lrc.
 568 */
 569static u32
 570lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
 571{
 572	u32 desc;
 573
 574	desc = INTEL_LEGACY_32B_CONTEXT;
 575	if (i915_vm_is_4lvl(ce->vm))
 576		desc = INTEL_LEGACY_64B_CONTEXT;
 577	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
 578
 579	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
 580	if (IS_GEN(engine->i915, 8))
 581		desc |= GEN8_CTX_L3LLC_COHERENT;
 582
 583	return i915_ggtt_offset(ce->state) | desc;
 584}
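/*
 * Illustrative sketch only (not part of the driver): packing the Gen11+
 * upper descriptor dword from the bit layout documented above. The shifts
 * are derived from that comment rather than taken from the driver's
 * headers, and the function name is hypothetical; in this file the upper
 * half of the descriptor is tracked separately as ce->lrc.ccid.
 */
static inline unsigned int demo_gen11_ccid(unsigned int sw_ctx_id,
					   unsigned int engine_instance,
					   unsigned int sw_counter,
					   unsigned int engine_class)
{
	/* Shifts are relative to bit 32, i.e. within the upper dword. */
	return (sw_ctx_id << (37 - 32)) |	/* bits 37-47: SW context ID */
	       (engine_instance << (48 - 32)) |	/* bits 48-53: engine instance */
	       (sw_counter << (55 - 32)) |	/* bits 55-60: SW counter */
	       (engine_class << (61 - 32));	/* bits 61-63: engine class */
}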
 585
 586static inline unsigned int dword_in_page(void *addr)
 587{
 588	return offset_in_page(addr) / sizeof(u32);
 589}
 590
 591static void set_offsets(u32 *regs,
 592			const u8 *data,
 593			const struct intel_engine_cs *engine,
 594			bool clear)
 595#define NOP(x) (BIT(7) | (x))
 596#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
 597#define POSTED BIT(0)
 598#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
 599#define REG16(x) \
 600	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
 601	(((x) >> 2) & 0x7f)
 602#define END(total_state_size) 0, (total_state_size)
 603{
 604	const u32 base = engine->mmio_base;
 605
 606	while (*data) {
 607		u8 count, flags;
 608
 609		if (*data & BIT(7)) { /* skip */
 610			count = *data++ & ~BIT(7);
 611			if (clear)
 612				memset32(regs, MI_NOOP, count);
 613			regs += count;
 614			continue;
 615		}
 616
 617		count = *data & 0x3f;
 618		flags = *data >> 6;
 619		data++;
 620
 621		*regs = MI_LOAD_REGISTER_IMM(count);
 622		if (flags & POSTED)
 623			*regs |= MI_LRI_FORCE_POSTED;
 624		if (INTEL_GEN(engine->i915) >= 11)
 625			*regs |= MI_LRI_LRM_CS_MMIO;
 626		regs++;
 627
 628		GEM_BUG_ON(!count);
 629		do {
 630			u32 offset = 0;
 631			u8 v;
 632
 633			do {
 634				v = *data++;
 635				offset <<= 7;
 636				offset |= v & ~BIT(7);
 637			} while (v & BIT(7));
 638
 639			regs[0] = base + (offset << 2);
 640			if (clear)
 641				regs[1] = 0;
 642			regs += 2;
 643		} while (--count);
 644	}
 645
 646	if (clear) {
 647		u8 count = *++data;
 648
 649		/* Clear past the tail for HW access */
 650		GEM_BUG_ON(dword_in_page(regs) > count);
 651		memset32(regs, MI_NOOP, count - dword_in_page(regs));
 652
 653		/* Close the batch; used mainly by live_lrc_layout() */
 654		*regs = MI_BATCH_BUFFER_END;
 655		if (INTEL_GEN(engine->i915) >= 10)
 656			*regs |= BIT(0);
 657	}
 658}
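/*
 * Worked example (illustration only) of the encoding consumed above:
 * REG16(0x244) in the tables below expands to the two bytes 0x81, 0x11,
 * i.e. ((0x244 >> 9) | BIT(7)) and ((0x244 >> 2) & 0x7f). The decode loop
 * in set_offsets() then reassembles the offset from the 7-bit groups:
 *
 *	offset = (0 << 7)    | (0x81 & 0x7f);	// 0x01, bit 7 set so continue
 *	offset = (0x01 << 7) | (0x11 & 0x7f);	// 0x91, bit 7 clear so stop
 *
 * and writes base + (0x91 << 2) == base + 0x244, the register's real mmio
 * offset, into the context image.
 */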
 659
 660static const u8 gen8_xcs_offsets[] = {
 661	NOP(1),
 662	LRI(11, 0),
 663	REG16(0x244),
 664	REG(0x034),
 665	REG(0x030),
 666	REG(0x038),
 667	REG(0x03c),
 668	REG(0x168),
 669	REG(0x140),
 670	REG(0x110),
 671	REG(0x11c),
 672	REG(0x114),
 673	REG(0x118),
 674
 675	NOP(9),
 676	LRI(9, 0),
 677	REG16(0x3a8),
 678	REG16(0x28c),
 679	REG16(0x288),
 680	REG16(0x284),
 681	REG16(0x280),
 682	REG16(0x27c),
 683	REG16(0x278),
 684	REG16(0x274),
 685	REG16(0x270),
 686
 687	NOP(13),
 688	LRI(2, 0),
 689	REG16(0x200),
 690	REG(0x028),
 691
 692	END(80)
 693};
 694
 695static const u8 gen9_xcs_offsets[] = {
 696	NOP(1),
 697	LRI(14, POSTED),
 698	REG16(0x244),
 699	REG(0x034),
 700	REG(0x030),
 701	REG(0x038),
 702	REG(0x03c),
 703	REG(0x168),
 704	REG(0x140),
 705	REG(0x110),
 706	REG(0x11c),
 707	REG(0x114),
 708	REG(0x118),
 709	REG(0x1c0),
 710	REG(0x1c4),
 711	REG(0x1c8),
 712
 713	NOP(3),
 714	LRI(9, POSTED),
 715	REG16(0x3a8),
 716	REG16(0x28c),
 717	REG16(0x288),
 718	REG16(0x284),
 719	REG16(0x280),
 720	REG16(0x27c),
 721	REG16(0x278),
 722	REG16(0x274),
 723	REG16(0x270),
 724
 725	NOP(13),
 726	LRI(1, POSTED),
 727	REG16(0x200),
 728
 729	NOP(13),
 730	LRI(44, POSTED),
 731	REG(0x028),
 732	REG(0x09c),
 733	REG(0x0c0),
 734	REG(0x178),
 735	REG(0x17c),
 736	REG16(0x358),
 737	REG(0x170),
 738	REG(0x150),
 739	REG(0x154),
 740	REG(0x158),
 741	REG16(0x41c),
 742	REG16(0x600),
 743	REG16(0x604),
 744	REG16(0x608),
 745	REG16(0x60c),
 746	REG16(0x610),
 747	REG16(0x614),
 748	REG16(0x618),
 749	REG16(0x61c),
 750	REG16(0x620),
 751	REG16(0x624),
 752	REG16(0x628),
 753	REG16(0x62c),
 754	REG16(0x630),
 755	REG16(0x634),
 756	REG16(0x638),
 757	REG16(0x63c),
 758	REG16(0x640),
 759	REG16(0x644),
 760	REG16(0x648),
 761	REG16(0x64c),
 762	REG16(0x650),
 763	REG16(0x654),
 764	REG16(0x658),
 765	REG16(0x65c),
 766	REG16(0x660),
 767	REG16(0x664),
 768	REG16(0x668),
 769	REG16(0x66c),
 770	REG16(0x670),
 771	REG16(0x674),
 772	REG16(0x678),
 773	REG16(0x67c),
 774	REG(0x068),
 775
 776	END(176)
 777};
 778
 779static const u8 gen12_xcs_offsets[] = {
 780	NOP(1),
 781	LRI(13, POSTED),
 782	REG16(0x244),
 783	REG(0x034),
 784	REG(0x030),
 785	REG(0x038),
 786	REG(0x03c),
 787	REG(0x168),
 788	REG(0x140),
 789	REG(0x110),
 790	REG(0x1c0),
 791	REG(0x1c4),
 792	REG(0x1c8),
 793	REG(0x180),
 794	REG16(0x2b4),
 795
 796	NOP(5),
 797	LRI(9, POSTED),
 798	REG16(0x3a8),
 799	REG16(0x28c),
 800	REG16(0x288),
 801	REG16(0x284),
 802	REG16(0x280),
 803	REG16(0x27c),
 804	REG16(0x278),
 805	REG16(0x274),
 806	REG16(0x270),
 807
 808	END(80)
 809};
 810
 811static const u8 gen8_rcs_offsets[] = {
 812	NOP(1),
 813	LRI(14, POSTED),
 814	REG16(0x244),
 815	REG(0x034),
 816	REG(0x030),
 817	REG(0x038),
 818	REG(0x03c),
 819	REG(0x168),
 820	REG(0x140),
 821	REG(0x110),
 822	REG(0x11c),
 823	REG(0x114),
 824	REG(0x118),
 825	REG(0x1c0),
 826	REG(0x1c4),
 827	REG(0x1c8),
 828
 829	NOP(3),
 830	LRI(9, POSTED),
 831	REG16(0x3a8),
 832	REG16(0x28c),
 833	REG16(0x288),
 834	REG16(0x284),
 835	REG16(0x280),
 836	REG16(0x27c),
 837	REG16(0x278),
 838	REG16(0x274),
 839	REG16(0x270),
 840
 841	NOP(13),
 842	LRI(1, 0),
 843	REG(0x0c8),
 844
 845	END(80)
 846};
 847
 848static const u8 gen9_rcs_offsets[] = {
 849	NOP(1),
 850	LRI(14, POSTED),
 851	REG16(0x244),
 852	REG(0x34),
 853	REG(0x30),
 854	REG(0x38),
 855	REG(0x3c),
 856	REG(0x168),
 857	REG(0x140),
 858	REG(0x110),
 859	REG(0x11c),
 860	REG(0x114),
 861	REG(0x118),
 862	REG(0x1c0),
 863	REG(0x1c4),
 864	REG(0x1c8),
 865
 866	NOP(3),
 867	LRI(9, POSTED),
 868	REG16(0x3a8),
 869	REG16(0x28c),
 870	REG16(0x288),
 871	REG16(0x284),
 872	REG16(0x280),
 873	REG16(0x27c),
 874	REG16(0x278),
 875	REG16(0x274),
 876	REG16(0x270),
 877
 878	NOP(13),
 879	LRI(1, 0),
 880	REG(0xc8),
 881
 882	NOP(13),
 883	LRI(44, POSTED),
 884	REG(0x28),
 885	REG(0x9c),
 886	REG(0xc0),
 887	REG(0x178),
 888	REG(0x17c),
 889	REG16(0x358),
 890	REG(0x170),
 891	REG(0x150),
 892	REG(0x154),
 893	REG(0x158),
 894	REG16(0x41c),
 895	REG16(0x600),
 896	REG16(0x604),
 897	REG16(0x608),
 898	REG16(0x60c),
 899	REG16(0x610),
 900	REG16(0x614),
 901	REG16(0x618),
 902	REG16(0x61c),
 903	REG16(0x620),
 904	REG16(0x624),
 905	REG16(0x628),
 906	REG16(0x62c),
 907	REG16(0x630),
 908	REG16(0x634),
 909	REG16(0x638),
 910	REG16(0x63c),
 911	REG16(0x640),
 912	REG16(0x644),
 913	REG16(0x648),
 914	REG16(0x64c),
 915	REG16(0x650),
 916	REG16(0x654),
 917	REG16(0x658),
 918	REG16(0x65c),
 919	REG16(0x660),
 920	REG16(0x664),
 921	REG16(0x668),
 922	REG16(0x66c),
 923	REG16(0x670),
 924	REG16(0x674),
 925	REG16(0x678),
 926	REG16(0x67c),
 927	REG(0x68),
 928
 929	END(176)
 930};
 931
 932static const u8 gen11_rcs_offsets[] = {
 933	NOP(1),
 934	LRI(15, POSTED),
 935	REG16(0x244),
 936	REG(0x034),
 937	REG(0x030),
 938	REG(0x038),
 939	REG(0x03c),
 940	REG(0x168),
 941	REG(0x140),
 942	REG(0x110),
 943	REG(0x11c),
 944	REG(0x114),
 945	REG(0x118),
 946	REG(0x1c0),
 947	REG(0x1c4),
 948	REG(0x1c8),
 949	REG(0x180),
 950
 951	NOP(1),
 952	LRI(9, POSTED),
 953	REG16(0x3a8),
 954	REG16(0x28c),
 955	REG16(0x288),
 956	REG16(0x284),
 957	REG16(0x280),
 958	REG16(0x27c),
 959	REG16(0x278),
 960	REG16(0x274),
 961	REG16(0x270),
 962
 963	LRI(1, POSTED),
 964	REG(0x1b0),
 965
 966	NOP(10),
 967	LRI(1, 0),
 968	REG(0x0c8),
 969
 970	END(80)
 971};
 972
 973static const u8 gen12_rcs_offsets[] = {
 974	NOP(1),
 975	LRI(13, POSTED),
 976	REG16(0x244),
 977	REG(0x034),
 978	REG(0x030),
 979	REG(0x038),
 980	REG(0x03c),
 981	REG(0x168),
 982	REG(0x140),
 983	REG(0x110),
 984	REG(0x1c0),
 985	REG(0x1c4),
 986	REG(0x1c8),
 987	REG(0x180),
 988	REG16(0x2b4),
 989
 990	NOP(5),
 991	LRI(9, POSTED),
 992	REG16(0x3a8),
 993	REG16(0x28c),
 994	REG16(0x288),
 995	REG16(0x284),
 996	REG16(0x280),
 997	REG16(0x27c),
 998	REG16(0x278),
 999	REG16(0x274),
1000	REG16(0x270),
1001
1002	LRI(3, POSTED),
1003	REG(0x1b0),
1004	REG16(0x5a8),
1005	REG16(0x5ac),
1006
1007	NOP(6),
1008	LRI(1, 0),
1009	REG(0x0c8),
1010	NOP(3 + 9 + 1),
1011
1012	LRI(51, POSTED),
1013	REG16(0x588),
1014	REG16(0x588),
1015	REG16(0x588),
1016	REG16(0x588),
1017	REG16(0x588),
1018	REG16(0x588),
1019	REG(0x028),
1020	REG(0x09c),
1021	REG(0x0c0),
1022	REG(0x178),
1023	REG(0x17c),
1024	REG16(0x358),
1025	REG(0x170),
1026	REG(0x150),
1027	REG(0x154),
1028	REG(0x158),
1029	REG16(0x41c),
1030	REG16(0x600),
1031	REG16(0x604),
1032	REG16(0x608),
1033	REG16(0x60c),
1034	REG16(0x610),
1035	REG16(0x614),
1036	REG16(0x618),
1037	REG16(0x61c),
1038	REG16(0x620),
1039	REG16(0x624),
1040	REG16(0x628),
1041	REG16(0x62c),
1042	REG16(0x630),
1043	REG16(0x634),
1044	REG16(0x638),
1045	REG16(0x63c),
1046	REG16(0x640),
1047	REG16(0x644),
1048	REG16(0x648),
1049	REG16(0x64c),
1050	REG16(0x650),
1051	REG16(0x654),
1052	REG16(0x658),
1053	REG16(0x65c),
1054	REG16(0x660),
1055	REG16(0x664),
1056	REG16(0x668),
1057	REG16(0x66c),
1058	REG16(0x670),
1059	REG16(0x674),
1060	REG16(0x678),
1061	REG16(0x67c),
1062	REG(0x068),
1063	REG(0x084),
1064	NOP(1),
1065
1066	END(192)
1067};
1068
1069#undef END
1070#undef REG16
1071#undef REG
1072#undef LRI
1073#undef NOP
1074
1075static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1076{
1077	/*
1078	 * The gen12+ lists only have the registers we program in the basic
1079	 * default state. We rely on the context image using relative
 1080	 * addressing to automatically fix up the register state between the
 1081	 * physical engines for virtual engines.
1082	 */
1083	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1084		   !intel_engine_has_relative_mmio(engine));
1085
1086	if (engine->class == RENDER_CLASS) {
1087		if (INTEL_GEN(engine->i915) >= 12)
1088			return gen12_rcs_offsets;
1089		else if (INTEL_GEN(engine->i915) >= 11)
1090			return gen11_rcs_offsets;
1091		else if (INTEL_GEN(engine->i915) >= 9)
1092			return gen9_rcs_offsets;
1093		else
1094			return gen8_rcs_offsets;
1095	} else {
1096		if (INTEL_GEN(engine->i915) >= 12)
1097			return gen12_xcs_offsets;
1098		else if (INTEL_GEN(engine->i915) >= 9)
1099			return gen9_xcs_offsets;
1100		else
1101			return gen8_xcs_offsets;
1102	}
1103}
1104
1105static struct i915_request *
1106__unwind_incomplete_requests(struct intel_engine_cs *engine)
1107{
1108	struct i915_request *rq, *rn, *active = NULL;
1109	struct list_head *pl;
1110	int prio = I915_PRIORITY_INVALID;
1111
1112	lockdep_assert_held(&engine->active.lock);
1113
1114	list_for_each_entry_safe_reverse(rq, rn,
1115					 &engine->active.requests,
1116					 sched.link) {
1117		if (i915_request_completed(rq))
1118			continue; /* XXX */
1119
1120		__i915_request_unsubmit(rq);
1121
1122		/*
1123		 * Push the request back into the queue for later resubmission.
1124		 * If this request is not native to this physical engine (i.e.
1125		 * it came from a virtual source), push it back onto the virtual
1126		 * engine so that it can be moved across onto another physical
1127		 * engine as load dictates.
1128		 */
1129		if (likely(rq->execution_mask == engine->mask)) {
1130			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1131			if (rq_prio(rq) != prio) {
1132				prio = rq_prio(rq);
1133				pl = i915_sched_lookup_priolist(engine, prio);
1134			}
1135			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1136
1137			list_move(&rq->sched.link, pl);
1138			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1139
1140			/* Check in case we rollback so far we wrap [size/2] */
1141			if (intel_ring_direction(rq->ring,
1142						 intel_ring_wrap(rq->ring,
1143								 rq->tail),
1144						 rq->ring->tail) > 0)
1145				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146
1147			active = rq;
1148		} else {
1149			struct intel_engine_cs *owner = rq->context->engine;
1150
1151			/*
1152			 * Decouple the virtual breadcrumb before moving it
1153			 * back to the virtual engine -- we don't want the
1154			 * request to complete in the background and try
1155			 * and cancel the breadcrumb on the virtual engine
1156			 * (instead of the old engine where it is linked)!
1157			 */
1158			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1159				     &rq->fence.flags)) {
1160				spin_lock_nested(&rq->lock,
1161						 SINGLE_DEPTH_NESTING);
1162				i915_request_cancel_breadcrumb(rq);
1163				spin_unlock(&rq->lock);
1164			}
1165			WRITE_ONCE(rq->engine, owner);
1166			owner->submit_request(rq);
1167			active = NULL;
1168		}
1169	}
1170
1171	return active;
1172}
1173
1174struct i915_request *
1175execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1176{
1177	struct intel_engine_cs *engine =
1178		container_of(execlists, typeof(*engine), execlists);
1179
1180	return __unwind_incomplete_requests(engine);
1181}
1182
1183static inline void
1184execlists_context_status_change(struct i915_request *rq, unsigned long status)
1185{
1186	/*
1187	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
 1188	 * the compiler should eliminate this function as dead-code.
1189	 */
1190	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1191		return;
1192
1193	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1194				   status, rq);
1195}
1196
1197static void intel_engine_context_in(struct intel_engine_cs *engine)
1198{
1199	unsigned long flags;
1200
1201	if (atomic_add_unless(&engine->stats.active, 1, 0))
1202		return;
1203
1204	write_seqlock_irqsave(&engine->stats.lock, flags);
1205	if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1206		engine->stats.start = ktime_get();
1207		atomic_inc(&engine->stats.active);
1208	}
1209	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1210}
1211
1212static void intel_engine_context_out(struct intel_engine_cs *engine)
1213{
1214	unsigned long flags;
1215
1216	GEM_BUG_ON(!atomic_read(&engine->stats.active));
1217
1218	if (atomic_add_unless(&engine->stats.active, -1, 1))
1219		return;
1220
1221	write_seqlock_irqsave(&engine->stats.lock, flags);
1222	if (atomic_dec_and_test(&engine->stats.active)) {
1223		engine->stats.total =
1224			ktime_add(engine->stats.total,
1225				  ktime_sub(ktime_get(), engine->stats.start));
1226	}
1227	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1228}
1229
1230static void
1231execlists_check_context(const struct intel_context *ce,
1232			const struct intel_engine_cs *engine)
1233{
1234	const struct intel_ring *ring = ce->ring;
1235	u32 *regs = ce->lrc_reg_state;
1236	bool valid = true;
1237	int x;
1238
1239	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1240		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1241		       engine->name,
1242		       regs[CTX_RING_START],
1243		       i915_ggtt_offset(ring->vma));
1244		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1245		valid = false;
1246	}
1247
1248	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1249	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1250		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1251		       engine->name,
1252		       regs[CTX_RING_CTL],
1253		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1254		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1255		valid = false;
1256	}
1257
1258	x = lrc_ring_mi_mode(engine);
1259	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1260		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1261		       engine->name, regs[x + 1]);
1262		regs[x + 1] &= ~STOP_RING;
1263		regs[x + 1] |= STOP_RING << 16;
1264		valid = false;
1265	}
1266
1267	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1268}
1269
1270static void restore_default_state(struct intel_context *ce,
1271				  struct intel_engine_cs *engine)
1272{
1273	u32 *regs;
1274
1275	regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1276	execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1277
1278	ce->runtime.last = intel_context_get_runtime(ce);
1279}
1280
1281static void reset_active(struct i915_request *rq,
1282			 struct intel_engine_cs *engine)
1283{
1284	struct intel_context * const ce = rq->context;
1285	u32 head;
1286
1287	/*
1288	 * The executing context has been cancelled. We want to prevent
1289	 * further execution along this context and propagate the error on
1290	 * to anything depending on its results.
1291	 *
1292	 * In __i915_request_submit(), we apply the -EIO and remove the
1293	 * requests' payloads for any banned requests. But first, we must
1294	 * rewind the context back to the start of the incomplete request so
1295	 * that we do not jump back into the middle of the batch.
1296	 *
1297	 * We preserve the breadcrumbs and semaphores of the incomplete
1298	 * requests so that inter-timeline dependencies (i.e other timelines)
1299	 * remain correctly ordered. And we defer to __i915_request_submit()
1300	 * so that all asynchronous waits are correctly handled.
1301	 */
1302	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1303		     rq->fence.context, rq->fence.seqno);
1304
1305	/* On resubmission of the active request, payload will be scrubbed */
1306	if (i915_request_completed(rq))
1307		head = rq->tail;
1308	else
1309		head = active_request(ce->timeline, rq)->head;
1310	head = intel_ring_wrap(ce->ring, head);
1311
1312	/* Scrub the context image to prevent replaying the previous batch */
1313	restore_default_state(ce, engine);
1314	__execlists_update_reg_state(ce, engine, head);
1315
1316	/* We've switched away, so this should be a no-op, but intent matters */
1317	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1318}
1319
1320static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1321{
1322#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1323	ce->runtime.num_underflow += dt < 0;
1324	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1325#endif
1326}
1327
1328static void intel_context_update_runtime(struct intel_context *ce)
1329{
1330	u32 old;
1331	s32 dt;
1332
1333	if (intel_context_is_barrier(ce))
1334		return;
1335
1336	old = ce->runtime.last;
1337	ce->runtime.last = intel_context_get_runtime(ce);
1338	dt = ce->runtime.last - old;
1339
1340	if (unlikely(dt <= 0)) {
1341		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1342			 old, ce->runtime.last, dt);
1343		st_update_runtime_underflow(ce, dt);
1344		return;
1345	}
1346
1347	ewma_runtime_add(&ce->runtime.avg, dt);
1348	ce->runtime.total += dt;
1349}
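/*
 * Note on the arithmetic above (illustration only): old and runtime.last
 * are both u32, so the subtraction stays correct across a timestamp wrap,
 * e.g. old = 0xfffffff0 and last = 0x00000010 gives dt = 0x20 = 32 ticks,
 * whereas a value that went backwards shows up as dt <= 0 and is reported
 * as an underflow.
 */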
1350
1351static inline struct intel_engine_cs *
1352__execlists_schedule_in(struct i915_request *rq)
1353{
1354	struct intel_engine_cs * const engine = rq->engine;
1355	struct intel_context * const ce = rq->context;
1356
1357	intel_context_get(ce);
1358
1359	if (unlikely(intel_context_is_banned(ce)))
1360		reset_active(rq, engine);
1361
1362	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1363		execlists_check_context(ce, engine);
1364
1365	if (ce->tag) {
1366		/* Use a fixed tag for OA and friends */
1367		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1368		ce->lrc.ccid = ce->tag;
1369	} else {
1370		/* We don't need a strict matching tag, just different values */
1371		unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1372
1373		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1374		clear_bit(tag - 1, &engine->context_tag);
1375		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1376
1377		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1378	}
1379
1380	ce->lrc.ccid |= engine->execlists.ccid;
1381
1382	__intel_gt_pm_get(engine->gt);
1383	if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1384		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1385	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1386	intel_engine_context_in(engine);
1387
1388	return engine;
1389}
1390
1391static inline struct i915_request *
1392execlists_schedule_in(struct i915_request *rq, int idx)
1393{
1394	struct intel_context * const ce = rq->context;
1395	struct intel_engine_cs *old;
1396
1397	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1398	trace_i915_request_in(rq, idx);
1399
1400	old = READ_ONCE(ce->inflight);
1401	do {
1402		if (!old) {
1403			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1404			break;
1405		}
1406	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1407
1408	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1409	return i915_request_get(rq);
1410}
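/*
 * Illustrative sketch only (not part of the driver): ce->inflight above is
 * a tagged pointer -- the owning engine in the high bits plus a small count
 * in the low (alignment) bits tracking how many ELSP submissions still
 * reference the context, which is what ptr_inc()/ptr_dec() and
 * ptr_unmask_bits() manipulate. A minimal model of that scheme, with
 * hypothetical names:
 */
#include <stdint.h>

#define DEMO_TAG_BITS	2
#define DEMO_TAG_MASK	((1UL << DEMO_TAG_BITS) - 1)

static inline void *demo_ptr_inc(void *tagged)
{
	/* Bump the count in the low bits; the pointer part is unchanged. */
	return (void *)((uintptr_t)tagged + 1);
}

static inline unsigned long demo_ptr_count(void *tagged)
{
	return (uintptr_t)tagged & DEMO_TAG_MASK;
}

static inline void *demo_ptr_engine(void *tagged)
{
	return (void *)((uintptr_t)tagged & ~DEMO_TAG_MASK);
}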
1411
1412static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1413{
1414	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1415	struct i915_request *next = READ_ONCE(ve->request);
1416
1417	if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1418		tasklet_hi_schedule(&ve->base.execlists.tasklet);
1419}
1420
1421static inline void
1422__execlists_schedule_out(struct i915_request *rq,
1423			 struct intel_engine_cs * const engine,
1424			 unsigned int ccid)
1425{
1426	struct intel_context * const ce = rq->context;
1427
1428	/*
1429	 * NB process_csb() is not under the engine->active.lock and hence
1430	 * schedule_out can race with schedule_in meaning that we should
1431	 * refrain from doing non-trivial work here.
1432	 */
1433
1434	/*
1435	 * If we have just completed this context, the engine may now be
1436	 * idle and we want to re-enter powersaving.
1437	 */
1438	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1439	    i915_request_completed(rq))
1440		intel_engine_add_retire(engine, ce->timeline);
1441
1442	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1443	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1444	if (ccid < BITS_PER_LONG) {
1445		GEM_BUG_ON(ccid == 0);
1446		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1447		set_bit(ccid - 1, &engine->context_tag);
1448	}
1449
1450	intel_context_update_runtime(ce);
1451	intel_engine_context_out(engine);
1452	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1453	if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1454		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1455	intel_gt_pm_put_async(engine->gt);
1456
1457	/*
1458	 * If this is part of a virtual engine, its next request may
1459	 * have been blocked waiting for access to the active context.
1460	 * We have to kick all the siblings again in case we need to
1461	 * switch (e.g. the next request is not runnable on this
1462	 * engine). Hopefully, we will already have submitted the next
1463	 * request before the tasklet runs and do not need to rebuild
1464	 * each virtual tree and kick everyone again.
1465	 */
1466	if (ce->engine != engine)
1467		kick_siblings(rq, ce);
1468
1469	intel_context_put(ce);
1470}
1471
1472static inline void
1473execlists_schedule_out(struct i915_request *rq)
1474{
1475	struct intel_context * const ce = rq->context;
1476	struct intel_engine_cs *cur, *old;
1477	u32 ccid;
1478
1479	trace_i915_request_out(rq);
1480
1481	ccid = rq->context->lrc.ccid;
1482	old = READ_ONCE(ce->inflight);
1483	do
1484		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1485	while (!try_cmpxchg(&ce->inflight, &old, cur));
1486	if (!cur)
1487		__execlists_schedule_out(rq, old, ccid);
1488
1489	i915_request_put(rq);
1490}
1491
1492static u64 execlists_update_context(struct i915_request *rq)
1493{
1494	struct intel_context *ce = rq->context;
1495	u64 desc = ce->lrc.desc;
1496	u32 tail, prev;
1497
1498	/*
1499	 * WaIdleLiteRestore:bdw,skl
1500	 *
1501	 * We should never submit the context with the same RING_TAIL twice
1502	 * just in case we submit an empty ring, which confuses the HW.
1503	 *
1504	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1505	 * the normal request to be able to always advance the RING_TAIL on
1506	 * subsequent resubmissions (for lite restore). Should that fail us,
1507	 * and we try and submit the same tail again, force the context
1508	 * reload.
1509	 *
1510	 * If we need to return to a preempted context, we need to skip the
1511	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1512	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1513	 * an earlier request.
1514	 */
1515	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1516	prev = rq->ring->tail;
1517	tail = intel_ring_set_tail(rq->ring, rq->tail);
1518	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1519		desc |= CTX_DESC_FORCE_RESTORE;
1520	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1521	rq->tail = rq->wa_tail;
1522
1523	/*
1524	 * Make sure the context image is complete before we submit it to HW.
1525	 *
1526	 * Ostensibly, writes (including the WCB) should be flushed prior to
 1527	 * an uncached write such as our mmio register access, but the empirical
1528	 * evidence (esp. on Braswell) suggests that the WC write into memory
1529	 * may not be visible to the HW prior to the completion of the UC
1530	 * register write and that we may begin execution from the context
1531	 * before its image is complete leading to invalid PD chasing.
1532	 */
1533	wmb();
1534
1535	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1536	return desc;
1537}
1538
1539static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1540{
1541	if (execlists->ctrl_reg) {
1542		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1543		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1544	} else {
1545		writel(upper_32_bits(desc), execlists->submit_reg);
1546		writel(lower_32_bits(desc), execlists->submit_reg);
1547	}
1548}
1549
1550static __maybe_unused char *
1551dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1552{
1553	if (!rq)
1554		return "";
1555
1556	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1557		 prefix,
1558		 rq->context->lrc.ccid,
1559		 rq->fence.context, rq->fence.seqno,
1560		 i915_request_completed(rq) ? "!" :
1561		 i915_request_started(rq) ? "*" :
1562		 "",
1563		 rq_prio(rq));
1564
1565	return buf;
1566}
1567
1568static __maybe_unused void
1569trace_ports(const struct intel_engine_execlists *execlists,
1570	    const char *msg,
1571	    struct i915_request * const *ports)
1572{
1573	const struct intel_engine_cs *engine =
1574		container_of(execlists, typeof(*engine), execlists);
1575	char __maybe_unused p0[40], p1[40];
1576
1577	if (!ports[0])
1578		return;
1579
1580	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1581		     dump_port(p0, sizeof(p0), "", ports[0]),
1582		     dump_port(p1, sizeof(p1), ", ", ports[1]));
1583}
1584
1585static inline bool
1586reset_in_progress(const struct intel_engine_execlists *execlists)
1587{
1588	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1589}
1590
1591static __maybe_unused bool
1592assert_pending_valid(const struct intel_engine_execlists *execlists,
1593		     const char *msg)
1594{
1595	struct intel_engine_cs *engine =
1596		container_of(execlists, typeof(*engine), execlists);
1597	struct i915_request * const *port, *rq;
1598	struct intel_context *ce = NULL;
1599	bool sentinel = false;
1600	u32 ccid = -1;
1601
1602	trace_ports(execlists, msg, execlists->pending);
1603
1604	/* We may be messing around with the lists during reset, lalala */
1605	if (reset_in_progress(execlists))
1606		return true;
1607
1608	if (!execlists->pending[0]) {
1609		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1610			      engine->name);
1611		return false;
1612	}
1613
1614	if (execlists->pending[execlists_num_ports(execlists)]) {
1615		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1616			      engine->name, execlists_num_ports(execlists));
1617		return false;
1618	}
1619
1620	for (port = execlists->pending; (rq = *port); port++) {
1621		unsigned long flags;
1622		bool ok = true;
1623
1624		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1625		GEM_BUG_ON(!i915_request_is_active(rq));
1626
1627		if (ce == rq->context) {
1628			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1629				      engine->name,
1630				      ce->timeline->fence_context,
1631				      port - execlists->pending);
1632			return false;
1633		}
1634		ce = rq->context;
1635
1636		if (ccid == ce->lrc.ccid) {
1637			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1638				      engine->name,
1639				      ccid, ce->timeline->fence_context,
1640				      port - execlists->pending);
1641			return false;
1642		}
1643		ccid = ce->lrc.ccid;
1644
1645		/*
1646		 * Sentinels are supposed to be the last request so they flush
1647		 * the current execution off the HW. Check that they are the only
1648		 * request in the pending submission.
1649		 */
1650		if (sentinel) {
1651			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1652				      engine->name,
1653				      ce->timeline->fence_context,
1654				      port - execlists->pending);
1655			return false;
1656		}
1657		sentinel = i915_request_has_sentinel(rq);
1658
1659		/* Hold tightly onto the lock to prevent concurrent retires! */
1660		if (!spin_trylock_irqsave(&rq->lock, flags))
1661			continue;
1662
1663		if (i915_request_completed(rq))
1664			goto unlock;
1665
1666		if (i915_active_is_idle(&ce->active) &&
1667		    !intel_context_is_barrier(ce)) {
1668			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1669				      engine->name,
1670				      ce->timeline->fence_context,
1671				      port - execlists->pending);
1672			ok = false;
1673			goto unlock;
1674		}
1675
1676		if (!i915_vma_is_pinned(ce->state)) {
1677			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1678				      engine->name,
1679				      ce->timeline->fence_context,
1680				      port - execlists->pending);
1681			ok = false;
1682			goto unlock;
1683		}
1684
1685		if (!i915_vma_is_pinned(ce->ring->vma)) {
1686			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1687				      engine->name,
1688				      ce->timeline->fence_context,
1689				      port - execlists->pending);
1690			ok = false;
1691			goto unlock;
1692		}
1693
1694unlock:
1695		spin_unlock_irqrestore(&rq->lock, flags);
1696		if (!ok)
1697			return false;
1698	}
1699
1700	return ce;
1701}
1702
1703static void execlists_submit_ports(struct intel_engine_cs *engine)
1704{
1705	struct intel_engine_execlists *execlists = &engine->execlists;
1706	unsigned int n;
1707
1708	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1709
1710	/*
1711	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1712	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1713	 * not be relinquished until the device is idle (see
1714	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1715	 * that all ELSP are drained i.e. we have processed the CSB,
1716	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1717	 */
1718	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1719
1720	/*
1721	 * ELSQ note: the submit queue is not cleared after being submitted
1722	 * to the HW so we need to make sure we always clean it up. This is
1723	 * currently ensured by the fact that we always write the same number
1724	 * of elsq entries, keep this in mind before changing the loop below.
1725	 */
1726	for (n = execlists_num_ports(execlists); n--; ) {
1727		struct i915_request *rq = execlists->pending[n];
1728
1729		write_desc(execlists,
1730			   rq ? execlists_update_context(rq) : 0,
1731			   n);
1732	}
1733
1734	/* we need to manually load the submit queue */
1735	if (execlists->ctrl_reg)
1736		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1737}
1738
1739static bool ctx_single_port_submission(const struct intel_context *ce)
1740{
1741	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1742		intel_context_force_single_submission(ce));
1743}
1744
1745static bool can_merge_ctx(const struct intel_context *prev,
1746			  const struct intel_context *next)
1747{
1748	if (prev != next)
1749		return false;
1750
1751	if (ctx_single_port_submission(prev))
1752		return false;
1753
1754	return true;
1755}
1756
1757static unsigned long i915_request_flags(const struct i915_request *rq)
1758{
1759	return READ_ONCE(rq->fence.flags);
1760}
1761
1762static bool can_merge_rq(const struct i915_request *prev,
1763			 const struct i915_request *next)
1764{
1765	GEM_BUG_ON(prev == next);
1766	GEM_BUG_ON(!assert_priority_queue(prev, next));
1767
1768	/*
1769	 * We do not submit known completed requests. Therefore if the next
1770	 * request is already completed, we can pretend to merge it in
1771	 * with the previous context (and we will skip updating the ELSP
1772	 * and tracking). Thus hopefully keeping the ELSP full with active
1773	 * contexts, despite the best efforts of preempt-to-busy to confuse
1774	 * us.
1775	 */
1776	if (i915_request_completed(next))
1777		return true;
1778
1779	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1780		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1781		      BIT(I915_FENCE_FLAG_SENTINEL))))
1782		return false;
1783
1784	if (!can_merge_ctx(prev->context, next->context))
1785		return false;
1786
1787	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1788	return true;
1789}
1790
1791static void virtual_update_register_offsets(u32 *regs,
1792					    struct intel_engine_cs *engine)
1793{
1794	set_offsets(regs, reg_offsets(engine), engine, false);
1795}
1796
1797static bool virtual_matches(const struct virtual_engine *ve,
1798			    const struct i915_request *rq,
1799			    const struct intel_engine_cs *engine)
1800{
1801	const struct intel_engine_cs *inflight;
1802
1803	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1804		return false;
1805
1806	/*
1807	 * We track when the HW has completed saving the context image
1808	 * (i.e. when we have seen the final CS event switching out of
1809	 * the context) and must not overwrite the context image before
1810	 * then. This restricts us to only using the active engine
1811	 * while the previous virtualized request is inflight (so
1812	 * we reuse the register offsets). This is a very small
 1813	 * hysteresis on the greedy selection algorithm.
1814	 */
1815	inflight = intel_context_inflight(&ve->context);
1816	if (inflight && inflight != engine)
1817		return false;
1818
1819	return true;
1820}
1821
1822static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
1823{
1824	/*
1825	 * All the outstanding signals on ve->siblings[0] must have
1826	 * been completed, just pending the interrupt handler. As those
1827	 * signals still refer to the old sibling (via rq->engine), we must
1828	 * transfer those to the old irq_worker to keep our locking
1829	 * consistent.
1830	 */
1831	intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context);
1832}
1833
1834#define for_each_waiter(p__, rq__) \
1835	list_for_each_entry_lockless(p__, \
1836				     &(rq__)->sched.waiters_list, \
1837				     wait_link)
1838
1839#define for_each_signaler(p__, rq__) \
1840	list_for_each_entry_rcu(p__, \
1841				&(rq__)->sched.signalers_list, \
1842				signal_link)
1843
1844static void defer_request(struct i915_request *rq, struct list_head * const pl)
1845{
1846	LIST_HEAD(list);
1847
1848	/*
1849	 * We want to move the interrupted request to the back of
1850	 * the round-robin list (i.e. its priority level), but
1851	 * in doing so, we must then move all requests that were in
1852	 * flight and were waiting for the interrupted request to
1853	 * be run after it again.
1854	 */
1855	do {
1856		struct i915_dependency *p;
1857
1858		GEM_BUG_ON(i915_request_is_active(rq));
1859		list_move_tail(&rq->sched.link, pl);
1860
1861		for_each_waiter(p, rq) {
1862			struct i915_request *w =
1863				container_of(p->waiter, typeof(*w), sched);
1864
1865			if (p->flags & I915_DEPENDENCY_WEAK)
1866				continue;
1867
1868			/* Leave semaphores spinning on the other engines */
1869			if (w->engine != rq->engine)
1870				continue;
1871
1872			/* No waiter should start before its signaler */
1873			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1874				   i915_request_started(w) &&
1875				   !i915_request_completed(rq));
1876
1877			GEM_BUG_ON(i915_request_is_active(w));
1878			if (!i915_request_is_ready(w))
1879				continue;
1880
1881			if (rq_prio(w) < rq_prio(rq))
1882				continue;
1883
1884			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1885			list_move_tail(&w->sched.link, &list);
1886		}
1887
1888		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1889	} while (rq);
1890}
1891
1892static void defer_active(struct intel_engine_cs *engine)
1893{
1894	struct i915_request *rq;
1895
1896	rq = __unwind_incomplete_requests(engine);
1897	if (!rq)
1898		return;
1899
1900	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1901}
1902
1903static bool
1904need_timeslice(const struct intel_engine_cs *engine,
1905	       const struct i915_request *rq,
1906	       const struct rb_node *rb)
1907{
1908	int hint;
1909
1910	if (!intel_engine_has_timeslices(engine))
1911		return false;
1912
1913	hint = engine->execlists.queue_priority_hint;
1914
1915	if (rb) {
1916		const struct virtual_engine *ve =
1917			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1918		const struct intel_engine_cs *inflight =
1919			intel_context_inflight(&ve->context);
1920
1921		if (!inflight || inflight == engine) {
1922			struct i915_request *next;
1923
1924			rcu_read_lock();
1925			next = READ_ONCE(ve->request);
1926			if (next)
1927				hint = max(hint, rq_prio(next));
1928			rcu_read_unlock();
1929		}
1930	}
1931
1932	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1933		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1934
1935	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1936	return hint >= effective_prio(rq);
1937}
1938
1939static bool
1940timeslice_yield(const struct intel_engine_execlists *el,
1941		const struct i915_request *rq)
1942{
1943	/*
1944	 * Once bitten, forever smitten!
1945	 *
1946	 * If the active context ever busy-waited on a semaphore,
1947	 * it will be treated as a hog until the end of its timeslice (i.e.
1948	 * until it is scheduled out and replaced by a new submission,
1949	 * possibly even its own lite-restore). The HW only sends an interrupt
1950	 * on the first miss, and we do not know if that semaphore has been
1951	 * signaled, or even if it is now stuck on another semaphore. Play
1952	 * safe, yield if it might be stuck -- it will be given a fresh
1953	 * timeslice in the near future.
1954	 */
1955	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1956}
1957
1958static bool
1959timeslice_expired(const struct intel_engine_execlists *el,
1960		  const struct i915_request *rq)
1961{
1962	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1963}
1964
1965static int
1966switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1967{
1968	if (list_is_last(&rq->sched.link, &engine->active.requests))
1969		return engine->execlists.queue_priority_hint;
1970
1971	return rq_prio(list_next_entry(rq, sched.link));
1972}
1973
1974static inline unsigned long
1975timeslice(const struct intel_engine_cs *engine)
1976{
1977	return READ_ONCE(engine->props.timeslice_duration_ms);
1978}
1979
1980static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1981{
1982	const struct intel_engine_execlists *execlists = &engine->execlists;
1983	const struct i915_request *rq = *execlists->active;
1984
1985	if (!rq || i915_request_completed(rq))
1986		return 0;
1987
1988	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1989		return 0;
1990
1991	return timeslice(engine);
1992}
1993
1994static void set_timeslice(struct intel_engine_cs *engine)
1995{
1996	unsigned long duration;
1997
1998	if (!intel_engine_has_timeslices(engine))
1999		return;
2000
2001	duration = active_timeslice(engine);
2002	ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2003
2004	set_timer_ms(&engine->execlists.timer, duration);
2005}
2006
2007static void start_timeslice(struct intel_engine_cs *engine, int prio)
2008{
2009	struct intel_engine_execlists *execlists = &engine->execlists;
2010	unsigned long duration;
2011
2012	if (!intel_engine_has_timeslices(engine))
2013		return;
2014
2015	WRITE_ONCE(execlists->switch_priority_hint, prio);
2016	if (prio == INT_MIN)
2017		return;
2018
2019	if (timer_pending(&execlists->timer))
2020		return;
2021
2022	duration = timeslice(engine);
2023	ENGINE_TRACE(engine,
2024		     "start timeslicing, prio:%d, interval:%lu",
2025		     prio, duration);
2026
2027	set_timer_ms(&execlists->timer, duration);
2028}
2029
2030static void record_preemption(struct intel_engine_execlists *execlists)
2031{
2032	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2033}
2034
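/*
 * The timeout below is in milliseconds: a banned context is given a
 * nominal 1ms so that the forced preemption (and subsequent reset) fires
 * almost immediately, regardless of the sysfs-configurable
 * preempt_timeout_ms.
 */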
2035static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2036					    const struct i915_request *rq)
2037{
2038	if (!rq)
2039		return 0;
2040
2041	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
2042	if (unlikely(intel_context_is_banned(rq->context)))
2043		return 1;
2044
2045	return READ_ONCE(engine->props.preempt_timeout_ms);
2046}
2047
2048static void set_preempt_timeout(struct intel_engine_cs *engine,
2049				const struct i915_request *rq)
2050{
2051	if (!intel_engine_has_preempt_reset(engine))
2052		return;
2053
2054	set_timer_ms(&engine->execlists.preempt,
2055		     active_preempt_timeout(engine, rq));
2056}
2057
2058static inline void clear_ports(struct i915_request **ports, int count)
2059{
2060	memset_p((void **)ports, NULL, count);
2061}
2062
2063static inline void
2064copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2065{
2066	/* A memcpy_p() would be very useful here! */
2067	while (count--)
2068		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2069}
2070
2071static void execlists_dequeue(struct intel_engine_cs *engine)
2072{
2073	struct intel_engine_execlists * const execlists = &engine->execlists;
2074	struct i915_request **port = execlists->pending;
2075	struct i915_request ** const last_port = port + execlists->port_mask;
2076	struct i915_request * const *active;
2077	struct i915_request *last;
2078	struct rb_node *rb;
2079	bool submit = false;
2080
2081	/*
2082	 * Hardware submission is through 2 ports. Conceptually each port
2083	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2084	 * static for a context, and unique to each, so we only execute
2085	 * requests belonging to a single context from each ring. RING_HEAD
2086	 * is maintained by the CS in the context image, it marks the place
2087	 * where it got up to last time, and through RING_TAIL we tell the CS
2088	 * where we want to execute up to this time.
2089	 *
2090	 * In this list the requests are in order of execution. Consecutive
2091	 * requests from the same context are adjacent in the ringbuffer. We
2092	 * can combine these requests into a single RING_TAIL update:
2093	 *
2094	 *              RING_HEAD...req1...req2
2095	 *                                    ^- RING_TAIL
2096	 * since to execute req2 the CS must first execute req1.
2097	 *
2098	 * Our goal then is to point each port to the end of a consecutive
2099	 * sequence of requests as being the optimal (fewest wake ups
2100	 * and context switches) submission.
2101	 */
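	/*
	 * For illustration only: given a ready queue of A1, A2 (both from
	 * context A) followed by B1 (context B), A1 and A2 coalesce into a
	 * single RING_TAIL update and share one port, while B1 takes the
	 * next port:
	 *
	 *     ELSP[0] <- A2 (its RING_TAIL also covers A1)
	 *     ELSP[1] <- B1
	 *
	 * Any further context must then wait for a CS event to free a port.
	 */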
2102
2103	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2104		struct virtual_engine *ve =
2105			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2106		struct i915_request *rq = READ_ONCE(ve->request);
2107
2108		if (!rq) { /* lazily cleanup after another engine handled rq */
2109			rb_erase_cached(rb, &execlists->virtual);
2110			RB_CLEAR_NODE(rb);
2111			rb = rb_first_cached(&execlists->virtual);
2112			continue;
2113		}
2114
2115		if (!virtual_matches(ve, rq, engine)) {
2116			rb = rb_next(rb);
2117			continue;
2118		}
2119
2120		break;
2121	}
2122
2123	/*
2124	 * If the queue is higher priority than the last
2125	 * request in the currently active context, submit afresh.
2126	 * We will resubmit again afterwards in case we need to split
2127	 * the active context to interject the preemption request,
2128	 * i.e. we will retrigger preemption following the ack in case
2129	 * of trouble.
2130	 */
2131	active = READ_ONCE(execlists->active);
2132
2133	/*
2134	 * In theory we can skip over completed contexts that have not
2135	 * yet been processed by events (as those events are in flight):
2136	 *
2137	 * while ((last = *active) && i915_request_completed(last))
2138	 *	active++;
2139	 *
2140	 * However, the GPU cannot handle this as it will ultimately
2141	 * find itself trying to jump back into a context it has just
2142	 * completed and barf.
2143	 */
2144
2145	if ((last = *active)) {
2146		if (need_preempt(engine, last, rb)) {
2147			if (i915_request_completed(last)) {
2148				tasklet_hi_schedule(&execlists->tasklet);
2149				return;
2150			}
2151
2152			ENGINE_TRACE(engine,
2153				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2154				     last->fence.context,
2155				     last->fence.seqno,
2156				     last->sched.attr.priority,
2157				     execlists->queue_priority_hint);
2158			record_preemption(execlists);
2159
2160			/*
2161			 * Don't let the RING_HEAD advance past the breadcrumb
2162			 * as we unwind (and until we resubmit) so that we do
2163			 * not accidentally tell it to go backwards.
2164			 */
2165			ring_set_paused(engine, 1);
2166
2167			/*
2168			 * Note that we have not stopped the GPU at this point,
2169			 * so we are unwinding the incomplete requests as they
2170			 * remain inflight and so by the time we do complete
2171			 * the preemption, some of the unwound requests may
2172			 * complete!
2173			 */
2174			__unwind_incomplete_requests(engine);
2175
2176			last = NULL;
2177		} else if (need_timeslice(engine, last, rb) &&
2178			   timeslice_expired(execlists, last)) {
2179			if (i915_request_completed(last)) {
2180				tasklet_hi_schedule(&execlists->tasklet);
2181				return;
2182			}
2183
2184			ENGINE_TRACE(engine,
2185				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2186				     last->fence.context,
2187				     last->fence.seqno,
2188				     last->sched.attr.priority,
2189				     execlists->queue_priority_hint,
2190				     yesno(timeslice_yield(execlists, last)));
2191
2192			ring_set_paused(engine, 1);
2193			defer_active(engine);
2194
2195			/*
2196			 * Unlike for preemption, if we rewind and continue
2197			 * executing the same context as previously active,
2198			 * the order of execution will remain the same and
2199			 * the tail will only advance. We do not need to
2200			 * force a full context restore, as a lite-restore
2201			 * is sufficient to resample the monotonic TAIL.
2202			 *
2203			 * If we switch to any other context, similarly we
2204			 * will not rewind TAIL of current context, and
2205			 * normal save/restore will preserve state and allow
2206			 * us to later continue executing the same request.
2207			 */
2208			last = NULL;
2209		} else {
2210			/*
2211			 * Otherwise if we already have a request pending
2212			 * for execution after the current one, we can
2213			 * just wait until the next CS event before
2214			 * queuing more. In either case we will force a
2215			 * lite-restore preemption event, but if we wait
2216			 * we hopefully coalesce several updates into a single
2217			 * submission.
2218			 */
2219			if (!list_is_last(&last->sched.link,
2220					  &engine->active.requests)) {
2221				/*
2222				 * Even if ELSP[1] is occupied and not worthy
2223				 * of timeslices, our queue might be.
2224				 */
2225				start_timeslice(engine, queue_prio(execlists));
2226				return;
2227			}
2228		}
2229	}
2230
2231	while (rb) { /* XXX virtual is always taking precedence */
2232		struct virtual_engine *ve =
2233			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2234		struct i915_request *rq;
2235
2236		spin_lock(&ve->base.active.lock);
2237
2238		rq = ve->request;
2239		if (unlikely(!rq)) { /* lost the race to a sibling */
2240			spin_unlock(&ve->base.active.lock);
2241			rb_erase_cached(rb, &execlists->virtual);
2242			RB_CLEAR_NODE(rb);
2243			rb = rb_first_cached(&execlists->virtual);
2244			continue;
2245		}
2246
2247		GEM_BUG_ON(rq != ve->request);
2248		GEM_BUG_ON(rq->engine != &ve->base);
2249		GEM_BUG_ON(rq->context != &ve->context);
2250
2251		if (rq_prio(rq) >= queue_prio(execlists)) {
2252			if (!virtual_matches(ve, rq, engine)) {
2253				spin_unlock(&ve->base.active.lock);
2254				rb = rb_next(rb);
2255				continue;
2256			}
2257
2258			if (last && !can_merge_rq(last, rq)) {
2259				spin_unlock(&ve->base.active.lock);
2260				start_timeslice(engine, rq_prio(rq));
2261				return; /* leave this for another sibling */
2262			}
2263
2264			ENGINE_TRACE(engine,
2265				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2266				     rq->fence.context,
2267				     rq->fence.seqno,
2268				     i915_request_completed(rq) ? "!" :
2269				     i915_request_started(rq) ? "*" :
2270				     "",
2271				     yesno(engine != ve->siblings[0]));
2272
2273			WRITE_ONCE(ve->request, NULL);
2274			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2275				   INT_MIN);
2276			rb_erase_cached(rb, &execlists->virtual);
2277			RB_CLEAR_NODE(rb);
2278
2279			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2280			WRITE_ONCE(rq->engine, engine);
2281
2282			if (engine != ve->siblings[0]) {
2283				u32 *regs = ve->context.lrc_reg_state;
2284				unsigned int n;
2285
2286				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2287
2288				if (!intel_engine_has_relative_mmio(engine))
2289					virtual_update_register_offsets(regs,
2290									engine);
2291
2292				if (!list_empty(&ve->context.signals))
2293					virtual_xfer_breadcrumbs(ve);
2294
2295				/*
2296				 * Move the bound engine to the top of the list
2297				 * for future execution. We then kick this
2298				 * tasklet first before checking others, so that
2299				 * we preferentially reuse this set of bound
2300				 * registers.
2301				 */
2302				for (n = 1; n < ve->num_siblings; n++) {
2303					if (ve->siblings[n] == engine) {
2304						swap(ve->siblings[n],
2305						     ve->siblings[0]);
2306						break;
2307					}
2308				}
2309
2310				GEM_BUG_ON(ve->siblings[0] != engine);
2311			}
2312
2313			if (__i915_request_submit(rq)) {
2314				submit = true;
2315				last = rq;
2316			}
2317			i915_request_put(rq);
2318
2319			/*
2320			 * Hmm, we have a bunch of virtual engine requests,
2321			 * but the first one was already completed (thanks
2322			 * preempt-to-busy!). Keep looking at the veng queue
2323			 * until we have no more relevant requests (i.e.
2324			 * the normal submit queue has higher priority).
2325			 */
2326			if (!submit) {
2327				spin_unlock(&ve->base.active.lock);
2328				rb = rb_first_cached(&execlists->virtual);
2329				continue;
2330			}
2331		}
2332
2333		spin_unlock(&ve->base.active.lock);
2334		break;
2335	}
2336
2337	while ((rb = rb_first_cached(&execlists->queue))) {
2338		struct i915_priolist *p = to_priolist(rb);
2339		struct i915_request *rq, *rn;
2340		int i;
2341
2342		priolist_for_each_request_consume(rq, rn, p, i) {
2343			bool merge = true;
2344
2345			/*
2346			 * Can we combine this request with the current port?
2347			 * It has to be the same context/ringbuffer and not
2348			 * have any exceptions (e.g. GVT saying never to
2349			 * combine contexts).
2350			 *
2351			 * If we can combine the requests, we can execute both
2352			 * by updating the RING_TAIL to point to the end of the
2353			 * second request, and so we never need to tell the
2354			 * hardware about the first.
2355			 */
2356			if (last && !can_merge_rq(last, rq)) {
2357				/*
2358				 * If we are on the second port and cannot
2359				 * combine this request with the last, then we
2360				 * are done.
2361				 */
2362				if (port == last_port)
2363					goto done;
2364
2365				/*
2366				 * We must not populate both ELSP[] with the
2367				 * same LRCA, i.e. we must submit 2 different
2368				 * contexts if we submit 2 ELSP.
2369				 */
2370				if (last->context == rq->context)
2371					goto done;
2372
2373				if (i915_request_has_sentinel(last))
2374					goto done;
2375
2376				/*
2377				 * If GVT overrides us we only ever submit
2378				 * port[0], leaving port[1] empty. Note that we
2379				 * also have to be careful that we don't queue
2380				 * the same context (even though a different
2381				 * request) to the second port.
2382				 */
2383				if (ctx_single_port_submission(last->context) ||
2384				    ctx_single_port_submission(rq->context))
2385					goto done;
2386
2387				merge = false;
2388			}
2389
2390			if (__i915_request_submit(rq)) {
2391				if (!merge) {
2392					*port = execlists_schedule_in(last, port - execlists->pending);
2393					port++;
2394					last = NULL;
2395				}
2396
2397				GEM_BUG_ON(last &&
2398					   !can_merge_ctx(last->context,
2399							  rq->context));
2400				GEM_BUG_ON(last &&
2401					   i915_seqno_passed(last->fence.seqno,
2402							     rq->fence.seqno));
2403
2404				submit = true;
2405				last = rq;
2406			}
2407		}
2408
2409		rb_erase_cached(&p->node, &execlists->queue);
2410		i915_priolist_free(p);
2411	}
2412
2413done:
2414	/*
2415	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2416	 *
2417	 * We choose the priority hint such that if we add a request of greater
2418	 * priority than this, we kick the submission tasklet to decide on
2419	 * the right order of submitting the requests to hardware. We must
2420	 * also be prepared to reorder requests as they are in-flight on the
2421	 * HW. We derive the priority hint then as the first "hole" in
2422	 * the HW submission ports and if there are no available slots,
2423	 * the priority of the lowest executing request, i.e. last.
2424	 *
2425	 * When we do receive a higher priority request ready to run from the
2426	 * user, see queue_request(), the priority hint is bumped to that
2427	 * request triggering preemption on the next dequeue (or subsequent
2428	 * interrupt for secondary ports).
2429	 */
2430	execlists->queue_priority_hint = queue_prio(execlists);
2431
2432	if (submit) {
2433		*port = execlists_schedule_in(last, port - execlists->pending);
2434		execlists->switch_priority_hint =
2435			switch_prio(engine, *execlists->pending);
2436
2437		/*
2438		 * Skip if we ended up with exactly the same set of requests,
2439		 * e.g. trying to timeslice a pair of ordered contexts
2440		 */
2441		if (!memcmp(active, execlists->pending,
2442			    (port - execlists->pending + 1) * sizeof(*port))) {
2443			do
2444				execlists_schedule_out(fetch_and_zero(port));
2445			while (port-- != execlists->pending);
2446
2447			goto skip_submit;
2448		}
2449		clear_ports(port + 1, last_port - port);
2450
2451		WRITE_ONCE(execlists->yield, -1);
2452		set_preempt_timeout(engine, *active);
2453		execlists_submit_ports(engine);
2454	} else {
2455		start_timeslice(engine, execlists->queue_priority_hint);
2456skip_submit:
2457		ring_set_paused(engine, 0);
2458	}
2459}
2460
2461static void
2462cancel_port_requests(struct intel_engine_execlists * const execlists)
2463{
2464	struct i915_request * const *port;
2465
2466	for (port = execlists->pending; *port; port++)
2467		execlists_schedule_out(*port);
2468	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2469
2470	/* Mark the end of active before we overwrite *active */
2471	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2472		execlists_schedule_out(*port);
2473	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2474
2475	smp_wmb(); /* complete the seqlock for execlists_active() */
2476	WRITE_ONCE(execlists->active, execlists->inflight);
2477}
2478
2479static inline void
2480invalidate_csb_entries(const u32 *first, const u32 *last)
2481{
2482	clflush((void *)first);
2483	clflush((void *)last);
2484}
2485
2486/*
2487 * Starting with Gen12, the status has a new format:
2488 *
2489 *     bit  0:     switched to new queue
2490 *     bit  1:     reserved
2491 *     bit  2:     semaphore wait mode (poll or signal), only valid when
2492 *                 switch detail is set to "wait on semaphore"
2493 *     bits 3-5:   engine class
2494 *     bits 6-11:  engine instance
2495 *     bits 12-14: reserved
2496 *     bits 15-25: sw context id of the lrc the GT switched to
2497 *     bits 26-31: sw counter of the lrc the GT switched to
2498 *     bits 32-35: context switch detail
2499 *                  - 0: ctx complete
2500 *                  - 1: wait on sync flip
2501 *                  - 2: wait on vblank
2502 *                  - 3: wait on scanline
2503 *                  - 4: wait on semaphore
2504 *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2505 *                       WAIT_FOR_EVENT)
2506 *     bit  36:    reserved
2507 *     bits 37-43: wait detail (for switch detail 1 to 4)
2508 *     bits 44-46: reserved
2509 *     bits 47-57: sw context id of the lrc the GT switched away from
2510 *     bits 58-63: sw counter of the lrc the GT switched away from
2511 */
2512static inline bool
2513gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2514{
2515	u32 lower_dw = csb[0];
2516	u32 upper_dw = csb[1];
2517	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2518	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2519	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2520
2521	/*
2522	 * The context switch detail is not guaranteed to be 5 when a preemption
2523	 * occurs, so we can't just check for that. The check below works for
2524	 * all the cases we care about, including preemptions of WAIT
2525	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2526	 * would require some extra handling, but we don't support that.
2527	 */
2528	if (!ctx_away_valid || new_queue) {
2529		GEM_BUG_ON(!ctx_to_valid);
2530		return true;
2531	}
2532
2533	/*
2534	 * switch detail = 5 is covered by the case above and we do not expect a
2535	 * context switch on an unsuccessful wait instruction since we always
2536	 * use polling mode.
2537	 */
2538	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2539	return false;
2540}
2541
2542static inline bool
2543gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2544{
2545	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2546}
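/*
 * Both parsers above answer the same question for process_csb(): does this
 * event promote execlists->pending[] into execlists->active[] (true), or
 * does it merely complete and schedule out the head of the currently
 * active ports (false)?
 */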
2547
2548static void process_csb(struct intel_engine_cs *engine)
2549{
2550	struct intel_engine_execlists * const execlists = &engine->execlists;
2551	const u32 * const buf = execlists->csb_status;
2552	const u8 num_entries = execlists->csb_size;
2553	u8 head, tail;
2554
2555	/*
2556	 * As we modify our execlists state tracking we require exclusive
2557	 * access. Either we are inside the tasklet, or the tasklet is disabled
2558	 * and we assume that is only inside the reset paths and so serialised.
2559	 */
2560	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2561		   !reset_in_progress(execlists));
2562	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2563
2564	/*
2565	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2566	 * When reading from the csb_write mmio register, we have to be
2567	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2568	 * the low 4bits. As it happens we know the next 4bits are always
2569	 * zero and so we can simply mask off the low u8 of the register
2570	 * and treat it identically to reading from the HWSP (without having
2571	 * to use explicit shifting and masking, and probably bifurcating
2572	 * the code to handle the legacy mmio read).
2573	 */
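	/*
	 * In other words, assuming (as noted above) that bits 7:4 of the
	 * mmio register read as zero, truncating the value to the u8 'tail'
	 * below is equivalent to an explicit 'val & GENMASK(3, 0)', so the
	 * HWSP and legacy mmio flavours can share the same code.
	 */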
2574	head = execlists->csb_head;
2575	tail = READ_ONCE(*execlists->csb_write);
2576	if (unlikely(head == tail))
2577		return;
2578
2579	/*
2580	 * We will consume all events from HW, or at least pretend to.
2581	 *
2582	 * The sequence of events from the HW is deterministic, and derived
2583	 * from our writes to the ELSP, with a smidgen of variability for
2584	 * the arrival of the asynchronous requests wrt the inflight
2585	 * execution. If the HW sends an event that does not correspond with
2586	 * the one we are expecting, we have to abandon all hope as we lose
2587	 * all tracking of what the engine is actually executing. We will
2588	 * only detect we are out of sequence with the HW when we get an
2589	 * 'impossible' event because we have already drained our own
2590	 * preemption/promotion queue. If this occurs, we know that we likely
2591	 * lost track of execution earlier and must unwind and restart; the
2592	 * simplest way is to stop processing the event queue and force the
2593	 * engine to reset.
2594	 */
2595	execlists->csb_head = tail;
2596	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2597
2598	/*
2599	 * Hopefully paired with a wmb() in HW!
2600	 *
2601	 * We must complete the read of the write pointer before any reads
2602	 * from the CSB, so that we do not see stale values. Without an rmb
2603	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2604	 * we perform the READ_ONCE(*csb_write).
2605	 */
2606	rmb();
2607	do {
2608		bool promote;
2609
2610		if (++head == num_entries)
2611			head = 0;
2612
2613		/*
2614		 * We are flying near dragons again.
2615		 *
2616		 * We hold a reference to the request in execlist_port[]
2617		 * but no more than that. We are operating in softirq
2618		 * context and so cannot hold any mutex or sleep. That
2619		 * prevents us from stopping the requests we are processing
2620		 * in port[] from being retired simultaneously (the
2621		 * breadcrumb will be complete before we see the
2622		 * context-switch). As we only hold the reference to the
2623		 * request, any pointer chasing underneath the request
2624		 * is subject to a potential use-after-free. Thus we
2625		 * store all of the bookkeeping within port[] as
2626		 * required, and avoid using unguarded pointers beneath
2627		 * request itself. The same applies to the atomic
2628		 * status notifier.
2629		 */
2630
2631		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2632			     head, buf[2 * head + 0], buf[2 * head + 1]);
2633
2634		if (INTEL_GEN(engine->i915) >= 12)
2635			promote = gen12_csb_parse(execlists, buf + 2 * head);
2636		else
2637			promote = gen8_csb_parse(execlists, buf + 2 * head);
2638		if (promote) {
2639			struct i915_request * const *old = execlists->active;
2640
2641			if (GEM_WARN_ON(!*execlists->pending)) {
2642				execlists->error_interrupt |= ERROR_CSB;
2643				break;
2644			}
2645
2646			ring_set_paused(engine, 0);
2647
2648			/* Point active to the new ELSP; prevent overwriting */
2649			WRITE_ONCE(execlists->active, execlists->pending);
2650			smp_wmb(); /* notify execlists_active() */
2651
2652			/* cancel old inflight, prepare for switch */
2653			trace_ports(execlists, "preempted", old);
2654			while (*old)
2655				execlists_schedule_out(*old++);
2656
2657			/* switch pending to inflight */
2658			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2659			copy_ports(execlists->inflight,
2660				   execlists->pending,
2661				   execlists_num_ports(execlists));
2662			smp_wmb(); /* complete the seqlock */
2663			WRITE_ONCE(execlists->active, execlists->inflight);
2664
2665			WRITE_ONCE(execlists->pending[0], NULL);
2666		} else {
2667			if (GEM_WARN_ON(!*execlists->active)) {
2668				execlists->error_interrupt |= ERROR_CSB;
2669				break;
2670			}
2671
2672			/* port0 completed, advanced to port1 */
2673			trace_ports(execlists, "completed", execlists->active);
2674
2675			/*
2676			 * We rely on the hardware being strongly
2677			 * ordered, that the breadcrumb write is
2678			 * coherent (visible from the CPU) before the
2679			 * user interrupt is processed. One might assume
2680			 * that, as the breadcrumb write precedes both the
2681			 * user interrupt and the CS event for the context
2682			 * switch, it would therefore be visible before the
2683			 * CS event itself...
2684			 */
2685			if (GEM_SHOW_DEBUG() &&
2686			    !i915_request_completed(*execlists->active)) {
2687				struct i915_request *rq = *execlists->active;
2688				const u32 *regs __maybe_unused =
2689					rq->context->lrc_reg_state;
2690
2691				ENGINE_TRACE(engine,
2692					     "context completed before request!\n");
2693				ENGINE_TRACE(engine,
2694					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2695					     ENGINE_READ(engine, RING_START),
2696					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2697					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2698					     ENGINE_READ(engine, RING_CTL),
2699					     ENGINE_READ(engine, RING_MI_MODE));
2700				ENGINE_TRACE(engine,
2701					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2702					     i915_ggtt_offset(rq->ring->vma),
2703					     rq->head, rq->tail,
2704					     rq->fence.context,
2705					     lower_32_bits(rq->fence.seqno),
2706					     hwsp_seqno(rq));
2707				ENGINE_TRACE(engine,
2708					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2709					     regs[CTX_RING_START],
2710					     regs[CTX_RING_HEAD],
2711					     regs[CTX_RING_TAIL]);
2712			}
2713
2714			execlists_schedule_out(*execlists->active++);
2715
2716			GEM_BUG_ON(execlists->active - execlists->inflight >
2717				   execlists_num_ports(execlists));
2718		}
2719	} while (head != tail);
2720
2721	set_timeslice(engine);
2722
2723	/*
2724	 * Gen11 has proven to fail wrt the global observation point
2725	 * between entry and tail update, failing on the ordering and thus
2726	 * we see an old entry in the context status buffer.
2727	 *
2728	 * Forcibly evict the entries for the next gpu csb update,
2729	 * to increase the odds that we get fresh entries even with
2730	 * non-working hardware. The cost of doing so comes out mostly
2731	 * in the wash, as hardware, working or not, will need to do the
2732	 * invalidation beforehand.
2733	 */
2734	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2735}
2736
2737static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2738{
2739	lockdep_assert_held(&engine->active.lock);
2740	if (!READ_ONCE(engine->execlists.pending[0])) {
2741		rcu_read_lock(); /* protect peeking at execlists->active */
2742		execlists_dequeue(engine);
2743		rcu_read_unlock();
2744	}
2745}
2746
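/*
 * Pull the request, and every ready waiter of it on this engine, off the
 * submission lists and onto engine->active.hold so that nothing is
 * resubmitted to the HW until the hold is released.
 */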
2747static void __execlists_hold(struct i915_request *rq)
2748{
2749	LIST_HEAD(list);
2750
2751	do {
2752		struct i915_dependency *p;
2753
2754		if (i915_request_is_active(rq))
2755			__i915_request_unsubmit(rq);
2756
2757		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2758		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2759		i915_request_set_hold(rq);
2760		RQ_TRACE(rq, "on hold\n");
2761
2762		for_each_waiter(p, rq) {
2763			struct i915_request *w =
2764				container_of(p->waiter, typeof(*w), sched);
2765
2766			/* Leave semaphores spinning on the other engines */
2767			if (w->engine != rq->engine)
2768				continue;
2769
2770			if (!i915_request_is_ready(w))
2771				continue;
2772
2773			if (i915_request_completed(w))
2774				continue;
2775
2776			if (i915_request_on_hold(w))
2777				continue;
2778
2779			list_move_tail(&w->sched.link, &list);
2780		}
2781
2782		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2783	} while (rq);
2784}
2785
2786static bool execlists_hold(struct intel_engine_cs *engine,
2787			   struct i915_request *rq)
2788{
2789	spin_lock_irq(&engine->active.lock);
2790
2791	if (i915_request_completed(rq)) { /* too late! */
2792		rq = NULL;
2793		goto unlock;
2794	}
2795
2796	if (rq->engine != engine) { /* preempted virtual engine */
2797		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2798
2799		/*
2800		 * intel_context_inflight() is only protected by virtue
2801		 * of process_csb() being called only by the tasklet (or
2802		 * directly from inside reset while the tasklet is suspended).
2803		 * Assert that neither of those are allowed to run while we
2804		 * poke at the request queues.
2805		 */
2806		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2807
2808		/*
2809		 * An unsubmitted request along a virtual engine will
2810		 * remain on the active (this) engine until we are able
2811		 * to process the context switch away (and so mark the
2812		 * context as no longer in flight). That cannot have happened
2813		 * yet, otherwise we would not be hanging!
2814		 */
2815		spin_lock(&ve->base.active.lock);
2816		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2817		GEM_BUG_ON(ve->request != rq);
2818		ve->request = NULL;
2819		spin_unlock(&ve->base.active.lock);
2820		i915_request_put(rq);
2821
2822		rq->engine = engine;
2823	}
2824
2825	/*
2826	 * Transfer this request onto the hold queue to prevent it
2827	 * being resubmitted to HW (and potentially completed) before we have
2828	 * released it. Since we may have already submitted following
2829	 * requests, we need to remove those as well.
2830	 */
2831	GEM_BUG_ON(i915_request_on_hold(rq));
2832	GEM_BUG_ON(rq->engine != engine);
2833	__execlists_hold(rq);
2834	GEM_BUG_ON(list_empty(&engine->active.hold));
2835
2836unlock:
2837	spin_unlock_irq(&engine->active.lock);
2838	return rq;
2839}
2840
2841static bool hold_request(const struct i915_request *rq)
2842{
2843	struct i915_dependency *p;
2844	bool result = false;
2845
2846	/*
2847	 * If one of our ancestors is on hold, we must also be on hold,
2848	 * otherwise we will bypass it and execute before it.
2849	 */
2850	rcu_read_lock();
2851	for_each_signaler(p, rq) {
2852		const struct i915_request *s =
2853			container_of(p->signaler, typeof(*s), sched);
2854
2855		if (s->engine != rq->engine)
2856			continue;
2857
2858		result = i915_request_on_hold(s);
2859		if (result)
2860			break;
2861	}
2862	rcu_read_unlock();
2863
2864	return result;
2865}
2866
2867static void __execlists_unhold(struct i915_request *rq)
2868{
2869	LIST_HEAD(list);
2870
2871	do {
2872		struct i915_dependency *p;
2873
2874		RQ_TRACE(rq, "hold release\n");
2875
2876		GEM_BUG_ON(!i915_request_on_hold(rq));
2877		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2878
2879		i915_request_clear_hold(rq);
2880		list_move_tail(&rq->sched.link,
2881			       i915_sched_lookup_priolist(rq->engine,
2882							  rq_prio(rq)));
2883		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2884
2885		/* Also release any children on this engine that are ready */
2886		for_each_waiter(p, rq) {
2887			struct i915_request *w =
2888				container_of(p->waiter, typeof(*w), sched);
2889
2890			/* Propagate any change in error status */
2891			if (rq->fence.error)
2892				i915_request_set_error_once(w, rq->fence.error);
2893
2894			if (w->engine != rq->engine)
2895				continue;
2896
2897			if (!i915_request_on_hold(w))
2898				continue;
2899
2900			/* Check that no other parents are also on hold */
2901			if (hold_request(w))
2902				continue;
2903
2904			list_move_tail(&w->sched.link, &list);
2905		}
2906
2907		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2908	} while (rq);
2909}
2910
2911static void execlists_unhold(struct intel_engine_cs *engine,
2912			     struct i915_request *rq)
2913{
2914	spin_lock_irq(&engine->active.lock);
2915
2916	/*
2917	 * Move this request back to the priority queue, and all of its
2918	 * children and grandchildren that were suspended along with it.
2919	 */
2920	__execlists_unhold(rq);
2921
2922	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2923		engine->execlists.queue_priority_hint = rq_prio(rq);
2924		tasklet_hi_schedule(&engine->execlists.tasklet);
2925	}
2926
2927	spin_unlock_irq(&engine->active.lock);
2928}
2929
2930struct execlists_capture {
2931	struct work_struct work;
2932	struct i915_request *rq;
2933	struct i915_gpu_coredump *error;
2934};
2935
2936static void execlists_capture_work(struct work_struct *work)
2937{
2938	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2939	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2940	struct intel_engine_cs *engine = cap->rq->engine;
2941	struct intel_gt_coredump *gt = cap->error->gt;
2942	struct intel_engine_capture_vma *vma;
2943
2944	/* Compress all the objects attached to the request, slow! */
2945	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2946	if (vma) {
2947		struct i915_vma_compress *compress =
2948			i915_vma_capture_prepare(gt);
2949
2950		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2951		i915_vma_capture_finish(gt, compress);
2952	}
2953
2954	gt->simulated = gt->engine->simulated;
2955	cap->error->simulated = gt->simulated;
2956
2957	/* Publish the error state, and announce it to the world */
2958	i915_error_state_store(cap->error);
2959	i915_gpu_coredump_put(cap->error);
2960
2961	/* Return this request and all that depend upon it for signaling */
2962	execlists_unhold(engine, cap->rq);
2963	i915_request_put(cap->rq);
2964
2965	kfree(cap);
2966}
2967
2968static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2969{
2970	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2971	struct execlists_capture *cap;
2972
2973	cap = kmalloc(sizeof(*cap), gfp);
2974	if (!cap)
2975		return NULL;
2976
2977	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2978	if (!cap->error)
2979		goto err_cap;
2980
2981	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2982	if (!cap->error->gt)
2983		goto err_gpu;
2984
2985	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2986	if (!cap->error->gt->engine)
2987		goto err_gt;
2988
2989	return cap;
2990
2991err_gt:
2992	kfree(cap->error->gt);
2993err_gpu:
2994	kfree(cap->error);
2995err_cap:
2996	kfree(cap);
2997	return NULL;
2998}
2999
3000static struct i915_request *
3001active_context(struct intel_engine_cs *engine, u32 ccid)
3002{
3003	const struct intel_engine_execlists * const el = &engine->execlists;
3004	struct i915_request * const *port, *rq;
3005
3006	/*
3007	 * Use the most recent result from process_csb(), but just in case
3008	 * we trigger an error (via interrupt) before the first CS event has
3009	 * been written, peek at the next submission.
3010	 */
3011
3012	for (port = el->active; (rq = *port); port++) {
3013		if (rq->context->lrc.ccid == ccid) {
3014			ENGINE_TRACE(engine,
3015				     "ccid found at active:%zd\n",
3016				     port - el->active);
3017			return rq;
3018		}
3019	}
3020
3021	for (port = el->pending; (rq = *port); port++) {
3022		if (rq->context->lrc.ccid == ccid) {
3023			ENGINE_TRACE(engine,
3024				     "ccid found at pending:%zd\n",
3025				     port - el->pending);
3026			return rq;
3027		}
3028	}
3029
3030	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3031	return NULL;
3032}
3033
3034static u32 active_ccid(struct intel_engine_cs *engine)
3035{
3036	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3037}
3038
3039static void execlists_capture(struct intel_engine_cs *engine)
3040{
3041	struct execlists_capture *cap;
3042
3043	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3044		return;
3045
3046	/*
3047	 * We need to _quickly_ capture the engine state before we reset.
3048	 * We are inside an atomic section (softirq) here and we are delaying
3049	 * the forced preemption event.
3050	 */
3051	cap = capture_regs(engine);
3052	if (!cap)
3053		return;
3054
3055	spin_lock_irq(&engine->active.lock);
3056	cap->rq = active_context(engine, active_ccid(engine));
3057	if (cap->rq) {
3058		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3059		cap->rq = i915_request_get_rcu(cap->rq);
3060	}
3061	spin_unlock_irq(&engine->active.lock);
3062	if (!cap->rq)
3063		goto err_free;
3064
3065	/*
3066	 * Remove the request from the execlists queue, and take ownership
3067	 * of the request. We pass it to our worker who will _slowly_ compress
3068	 * all the pages the _user_ requested for debugging their batch, after
3069	 * which we return it to the queue for signaling.
3070	 *
3071	 * By removing them from the execlists queue, we also remove the
3072	 * requests from being processed by __unwind_incomplete_requests()
3073	 * during the intel_engine_reset(), and so they will *not* be replayed
3074	 * afterwards.
3075	 *
3076	 * Note that because we have not yet reset the engine at this point,
3077	 * it is possible that the request we have identified as being
3078	 * guilty did in fact complete and we will then hit an arbitration
3079	 * point allowing the outstanding preemption to succeed. The likelihood
3080	 * of that is very low (as capturing of the engine registers should be
3081	 * fast enough to run inside an irq-off atomic section!), so we will
3082	 * simply hold that request accountable for being non-preemptible
3083	 * long enough to force the reset.
3084	 */
3085	if (!execlists_hold(engine, cap->rq))
3086		goto err_rq;
3087
3088	INIT_WORK(&cap->work, execlists_capture_work);
3089	schedule_work(&cap->work);
3090	return;
3091
3092err_rq:
3093	i915_request_put(cap->rq);
3094err_free:
3095	i915_gpu_coredump_put(cap->error);
3096	kfree(cap);
3097}
3098
3099static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3100{
3101	const unsigned int bit = I915_RESET_ENGINE + engine->id;
3102	unsigned long *lock = &engine->gt->reset.flags;
3103
3104	if (!intel_has_reset_engine(engine->gt))
3105		return;
3106
3107	if (test_and_set_bit(bit, lock))
3108		return;
3109
3110	ENGINE_TRACE(engine, "reset for %s\n", msg);
3111
3112	/* Mark this tasklet as disabled to avoid waiting for it to complete */
3113	tasklet_disable_nosync(&engine->execlists.tasklet);
3114
3115	ring_set_paused(engine, 1); /* Freeze the current request in place */
3116	execlists_capture(engine);
3117	intel_engine_reset(engine, msg);
3118
3119	tasklet_enable(&engine->execlists.tasklet);
3120	clear_and_wake_up_bit(bit, lock);
3121}
3122
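/*
 * A preemption is only considered timed out if its ELSP write is still
 * pending (i.e. the HW has not yet acked the preemption request) when the
 * preempt timer expires.
 */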
3123static bool preempt_timeout(const struct intel_engine_cs *const engine)
3124{
3125	const struct timer_list *t = &engine->execlists.preempt;
3126
3127	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3128		return false;
3129
3130	if (!timer_expired(t))
3131		return false;
3132
3133	return READ_ONCE(engine->execlists.pending[0]);
3134}
3135
3136/*
3137 * Check the unread Context Status Buffers and manage the submission of new
3138 * contexts to the ELSP accordingly.
3139 */
3140static void execlists_submission_tasklet(unsigned long data)
3141{
3142	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3143	bool timeout = preempt_timeout(engine);
3144
3145	process_csb(engine);
3146
3147	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3148		const char *msg;
3149
3150		/* Generate the error message in priority wrt the user! */
3151		if (engine->execlists.error_interrupt & GENMASK(15, 0))
3152			msg = "CS error"; /* thrown by a user payload */
3153		else if (engine->execlists.error_interrupt & ERROR_CSB)
3154			msg = "invalid CSB event";
3155		else
3156			msg = "internal error";
3157
3158		engine->execlists.error_interrupt = 0;
3159		execlists_reset(engine, msg);
3160	}
3161
3162	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3163		unsigned long flags;
3164
3165		spin_lock_irqsave(&engine->active.lock, flags);
3166		__execlists_submission_tasklet(engine);
3167		spin_unlock_irqrestore(&engine->active.lock, flags);
3168
3169		/* Recheck after serialising with direct-submission */
3170		if (unlikely(timeout && preempt_timeout(engine)))
3171			execlists_reset(engine, "preemption time out");
3172	}
3173}
3174
3175static void __execlists_kick(struct intel_engine_execlists *execlists)
3176{
3177	/* Kick the tasklet for some interrupt coalescing and reset handling */
3178	tasklet_hi_schedule(&execlists->tasklet);
3179}
3180
3181#define execlists_kick(t, member) \
3182	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
3183
3184static void execlists_timeslice(struct timer_list *timer)
3185{
3186	execlists_kick(timer, timer);
3187}
3188
3189static void execlists_preempt(struct timer_list *timer)
3190{
3191	execlists_kick(timer, preempt);
3192}
3193
3194static void queue_request(struct intel_engine_cs *engine,
3195			  struct i915_request *rq)
3196{
3197	GEM_BUG_ON(!list_empty(&rq->sched.link));
3198	list_add_tail(&rq->sched.link,
3199		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
3200	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3201}
3202
3203static void __submit_queue_imm(struct intel_engine_cs *engine)
3204{
3205	struct intel_engine_execlists * const execlists = &engine->execlists;
3206
3207	if (reset_in_progress(execlists))
3208		return; /* defer until we restart the engine following reset */
3209
3210	__execlists_submission_tasklet(engine);
3211}
3212
3213static void submit_queue(struct intel_engine_cs *engine,
3214			 const struct i915_request *rq)
3215{
3216	struct intel_engine_execlists *execlists = &engine->execlists;
3217
3218	if (rq_prio(rq) <= execlists->queue_priority_hint)
3219		return;
3220
3221	execlists->queue_priority_hint = rq_prio(rq);
3222	__submit_queue_imm(engine);
3223}
3224
3225static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3226			     const struct i915_request *rq)
3227{
3228	GEM_BUG_ON(i915_request_on_hold(rq));
3229	return !list_empty(&engine->active.hold) && hold_request(rq);
3230}
3231
3232static void flush_csb(struct intel_engine_cs *engine)
3233{
3234	struct intel_engine_execlists *el = &engine->execlists;
3235
3236	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3237		if (!reset_in_progress(el))
3238			process_csb(engine);
3239		tasklet_unlock(&el->tasklet);
3240	}
3241}
3242
3243static void execlists_submit_request(struct i915_request *request)
3244{
3245	struct intel_engine_cs *engine = request->engine;
3246	unsigned long flags;
3247
3248	/* Hopefully we clear execlists->pending[] to let us through */
3249	flush_csb(engine);
3250
3251	/* Will be called from irq-context when using foreign fences. */
3252	spin_lock_irqsave(&engine->active.lock, flags);
3253
3254	if (unlikely(ancestor_on_hold(engine, request))) {
3255		RQ_TRACE(request, "ancestor on hold\n");
3256		list_add_tail(&request->sched.link, &engine->active.hold);
3257		i915_request_set_hold(request);
3258	} else {
3259		queue_request(engine, request);
3260
3261		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3262		GEM_BUG_ON(list_empty(&request->sched.link));
3263
3264		submit_queue(engine, request);
3265	}
3266
3267	spin_unlock_irqrestore(&engine->active.lock, flags);
3268}
3269
3270static void __execlists_context_fini(struct intel_context *ce)
3271{
3272	intel_ring_put(ce->ring);
3273	i915_vma_put(ce->state);
3274}
3275
3276static void execlists_context_destroy(struct kref *kref)
3277{
3278	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3279
3280	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3281	GEM_BUG_ON(intel_context_is_pinned(ce));
3282
3283	if (ce->state)
3284		__execlists_context_fini(ce);
3285
3286	intel_context_fini(ce);
3287	intel_context_free(ce);
3288}
3289
3290static void
3291set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3292{
3293	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3294		return;
3295
3296	vaddr += engine->context_size;
3297
3298	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3299}
3300
3301static void
3302check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3303{
3304	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3305		return;
3306
3307	vaddr += engine->context_size;
3308
3309	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3310		drm_err_once(&engine->i915->drm,
3311			     "%s context redzone overwritten!\n",
3312			     engine->name);
3313}
3314
3315static void execlists_context_unpin(struct intel_context *ce)
3316{
3317	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3318		      ce->engine);
3319
3320	i915_gem_object_unpin_map(ce->state->obj);
3321}
3322
3323static u32 *
3324gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3325{
3326	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3327		MI_SRM_LRM_GLOBAL_GTT |
3328		MI_LRI_LRM_CS_MMIO;
3329	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3330	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3331		CTX_TIMESTAMP * sizeof(u32);
3332	*cs++ = 0;
3333
3334	*cs++ = MI_LOAD_REGISTER_REG |
3335		MI_LRR_SOURCE_CS_MMIO |
3336		MI_LRI_LRM_CS_MMIO;
3337	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3338	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3339
3340	*cs++ = MI_LOAD_REGISTER_REG |
3341		MI_LRR_SOURCE_CS_MMIO |
3342		MI_LRI_LRM_CS_MMIO;
3343	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3344	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3345
3346	return cs;
3347}
3348
3349static u32 *
3350gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3351{
3352	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3353
3354	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3355		MI_SRM_LRM_GLOBAL_GTT |
3356		MI_LRI_LRM_CS_MMIO;
3357	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3358	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3359		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3360	*cs++ = 0;
3361
3362	return cs;
3363}
3364
3365static u32 *
3366gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3367{
3368	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3369
3370	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3371		MI_SRM_LRM_GLOBAL_GTT |
3372		MI_LRI_LRM_CS_MMIO;
3373	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3374	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3375		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3376	*cs++ = 0;
3377
3378	*cs++ = MI_LOAD_REGISTER_REG |
3379		MI_LRR_SOURCE_CS_MMIO |
3380		MI_LRI_LRM_CS_MMIO;
3381	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3382	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3383
3384	return cs;
3385}
3386
3387static u32 *
3388gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3389{
3390	cs = gen12_emit_timestamp_wa(ce, cs);
3391	cs = gen12_emit_cmd_buf_wa(ce, cs);
3392	cs = gen12_emit_restore_scratch(ce, cs);
3393
3394	return cs;
3395}
3396
3397static u32 *
3398gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3399{
3400	cs = gen12_emit_timestamp_wa(ce, cs);
3401	cs = gen12_emit_restore_scratch(ce, cs);
3402
3403	return cs;
3404}
3405
3406static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3407{
3408	return PAGE_SIZE * ce->wa_bb_page;
3409}
3410
3411static u32 *context_indirect_bb(const struct intel_context *ce)
3412{
3413	void *ptr;
3414
3415	GEM_BUG_ON(!ce->wa_bb_page);
3416
3417	ptr = ce->lrc_reg_state;
3418	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3419	ptr += context_wa_bb_offset(ce);
3420
3421	return ptr;
3422}
3423
3424static void
3425setup_indirect_ctx_bb(const struct intel_context *ce,
3426		      const struct intel_engine_cs *engine,
3427		      u32 *(*emit)(const struct intel_context *, u32 *))
3428{
3429	u32 * const start = context_indirect_bb(ce);
3430	u32 *cs;
3431
3432	cs = emit(ce, start);
3433	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3434	while ((unsigned long)cs % CACHELINE_BYTES)
3435		*cs++ = MI_NOOP;
3436
3437	lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3438				    i915_ggtt_offset(ce->state) +
3439				    context_wa_bb_offset(ce),
3440				    (cs - start) * sizeof(*cs));
3441}
3442
3443static void
3444__execlists_update_reg_state(const struct intel_context *ce,
3445			     const struct intel_engine_cs *engine,
3446			     u32 head)
3447{
3448	struct intel_ring *ring = ce->ring;
3449	u32 *regs = ce->lrc_reg_state;
3450
3451	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3452	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3453
3454	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3455	regs[CTX_RING_HEAD] = head;
3456	regs[CTX_RING_TAIL] = ring->tail;
3457	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3458
3459	/* RPCS */
3460	if (engine->class == RENDER_CLASS) {
3461		regs[CTX_R_PWR_CLK_STATE] =
3462			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3463
3464		i915_oa_init_reg_state(ce, engine);
3465	}
3466
3467	if (ce->wa_bb_page) {
3468		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3469
3470		fn = gen12_emit_indirect_ctx_xcs;
3471		if (ce->engine->class == RENDER_CLASS)
3472			fn = gen12_emit_indirect_ctx_rcs;
3473
3474		/* Mutually exclusive wrt the global indirect bb */
3475		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3476		setup_indirect_ctx_bb(ce, engine, fn);
3477	}
3478}
3479
3480static int
3481__execlists_context_pin(struct intel_context *ce,
3482			struct intel_engine_cs *engine)
3483{
3484	void *vaddr;
3485
3486	GEM_BUG_ON(!ce->state);
3487	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3488
3489	vaddr = i915_gem_object_pin_map(ce->state->obj,
3490					i915_coherent_map_type(engine->i915) |
3491					I915_MAP_OVERRIDE);
3492	if (IS_ERR(vaddr))
3493		return PTR_ERR(vaddr);
3494
3495	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3496	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3497	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3498
3499	return 0;
3500}
3501
3502static int execlists_context_pin(struct intel_context *ce)
3503{
3504	return __execlists_context_pin(ce, ce->engine);
3505}
3506
3507static int execlists_context_alloc(struct intel_context *ce)
3508{
3509	return __execlists_context_alloc(ce, ce->engine);
3510}
3511
3512static void execlists_context_reset(struct intel_context *ce)
3513{
3514	CE_TRACE(ce, "reset\n");
3515	GEM_BUG_ON(!intel_context_is_pinned(ce));
3516
3517	intel_ring_reset(ce->ring, ce->ring->emit);
3518
3519	/* Scrub away the garbage */
3520	execlists_init_reg_state(ce->lrc_reg_state,
3521				 ce, ce->engine, ce->ring, true);
3522	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3523
3524	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3525}
3526
3527static const struct intel_context_ops execlists_context_ops = {
3528	.alloc = execlists_context_alloc,
3529
3530	.pin = execlists_context_pin,
3531	.unpin = execlists_context_unpin,
3532
3533	.enter = intel_context_enter_engine,
3534	.exit = intel_context_exit_engine,
3535
3536	.reset = execlists_context_reset,
3537	.destroy = execlists_context_destroy,
3538};
3539
3540static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3541{
3542	u32 *cs;
3543
3544	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3545	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3546		return 0;
3547
3548	cs = intel_ring_begin(rq, 6);
3549	if (IS_ERR(cs))
3550		return PTR_ERR(cs);
3551
3552	/*
3553	 * Check if we have been preempted before we even get started.
3554	 *
3555	 * After this point i915_request_started() reports true, even if
3556	 * we get preempted and so are no longer running.
3557	 */
3558	*cs++ = MI_ARB_CHECK;
3559	*cs++ = MI_NOOP;
3560
3561	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3562	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3563	*cs++ = 0;
3564	*cs++ = rq->fence.seqno - 1;
3565
3566	intel_ring_advance(rq, cs);
3567
3568	/* Record the updated position of the request's payload */
3569	rq->infix = intel_ring_offset(rq, cs);
3570
3571	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3572
3573	return 0;
3574}
3575
3576static int emit_pdps(struct i915_request *rq)
3577{
3578	const struct intel_engine_cs * const engine = rq->engine;
3579	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3580	int err, i;
3581	u32 *cs;
3582
3583	GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3584
3585	/*
3586	 * Beware ye of the dragons, this sequence is magic!
3587	 *
3588	 * Small changes to this sequence can cause anything from
3589	 * GPU hangs to forcewake errors and machine lockups!
3590	 */
3591
3592	/* Flush any residual operations from the context load */
3593	err = engine->emit_flush(rq, EMIT_FLUSH);
3594	if (err)
3595		return err;
3596
3597	/* Magic required to prevent forcewake errors! */
3598	err = engine->emit_flush(rq, EMIT_INVALIDATE);
3599	if (err)
3600		return err;
3601
3602	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3603	if (IS_ERR(cs))
3604		return PTR_ERR(cs);
3605
3606	/* Ensure the LRIs have landed before we invalidate & continue */
3607	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3608	for (i = GEN8_3LVL_PDPES; i--; ) {
3609		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3610		u32 base = engine->mmio_base;
3611
3612		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3613		*cs++ = upper_32_bits(pd_daddr);
3614		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3615		*cs++ = lower_32_bits(pd_daddr);
3616	}
3617	*cs++ = MI_NOOP;
3618
3619	intel_ring_advance(rq, cs);
3620
3621	return 0;
3622}
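
/*
 * A rough sketch of what the loop above emits, assuming GEN8_3LVL_PDPES
 * (nominally 4) page directory pointers: a single MI_LOAD_REGISTER_IMM
 * with 2 * GEN8_3LVL_PDPES register/value pairs, walking the PDPs from
 * highest to lowest,
 *
 *	GEN8_RING_PDP_UDW(base, i) <- upper_32_bits(pd_daddr)
 *	GEN8_RING_PDP_LDW(base, i) <- lower_32_bits(pd_daddr)
 *
 * followed by one MI_NOOP, which accounts for the 4 * GEN8_3LVL_PDPES + 2
 * dwords reserved by intel_ring_begin() above.
 */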
3623
3624static int execlists_request_alloc(struct i915_request *request)
3625{
3626	int ret;
3627
3628	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3629
3630	/*
3631	 * Flush enough space to reduce the likelihood of waiting after
3632	 * we start building the request - in which case we will just
3633	 * have to repeat work.
3634	 */
3635	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3636
3637	/*
3638	 * Note that after this point, we have committed to using
3639	 * this request as it is being used to both track the
3640	 * state of engine initialisation and liveness of the
3641	 * golden renderstate above. Think twice before you try
3642	 * to cancel/unwind this request now.
3643	 */
3644
3645	if (!i915_vm_is_4lvl(request->context->vm)) {
3646		ret = emit_pdps(request);
3647		if (ret)
3648			return ret;
3649	}
3650
3651	/* Unconditionally invalidate GPU caches and TLBs. */
3652	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3653	if (ret)
3654		return ret;
3655
3656	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3657	return 0;
3658}
3659
3660/*
3661 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3662 * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
3663 * but there is a slight complication: this is applied in the WA batch, where the
3664 * values are only initialized once, so we cannot read the register value at the
3665 * beginning and reuse it later; hence we save its value to memory, upload a
3666 * constant value with bit21 set and then restore the saved value afterwards.
3667 * To simplify the WA, the constant value is formed from the default value
3668 * of this register. This shouldn't be a problem because we only modify
3669 * it for a short period and this batch is non-preemptible. We could of course
3670 * use additional instructions that read the actual value of the register
3671 * at that time and set our bit of interest, but that makes the WA more complicated.
3672 *
3673 * This WA is also required for Gen9 so extracting as a function avoids
3674 * code duplication.
3675 */
3676static u32 *
3677gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3678{
3679	/* NB no one else is allowed to scribble over scratch + 256! */
3680	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3681	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3682	*batch++ = intel_gt_scratch_offset(engine->gt,
3683					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3684	*batch++ = 0;
3685
3686	*batch++ = MI_LOAD_REGISTER_IMM(1);
3687	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3688	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3689
3690	batch = gen8_emit_pipe_control(batch,
3691				       PIPE_CONTROL_CS_STALL |
3692				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3693				       0);
3694
3695	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3696	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3697	*batch++ = intel_gt_scratch_offset(engine->gt,
3698					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3699	*batch++ = 0;
3700
3701	return batch;
3702}
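
/*
 * Summarising the helper above as a save/modify/restore sketch:
 *
 *	SRM  GEN8_L3SQCREG4 -> scratch              (save the current value)
 *	LRI  GEN8_L3SQCREG4 <- default | bit21      (constant with flush bit set)
 *	PIPE_CONTROL(CS_STALL | DC_FLUSH_ENABLE)    (perform the flush)
 *	LRM  GEN8_L3SQCREG4 <- scratch              (restore the saved value)
 *
 * The 0x40400000 constant is the assumed default value of the register,
 * as explained in the comment block preceding this function.
 */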
3703
3704/*
3705 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3706 * initialized at the beginning and shared across all contexts, but this field
3707 * helps us to have multiple batches at different offsets and select them based
3708 * on a criterion. At the moment this batch always starts at the beginning of
3709 * the page and at this point we don't have multiple wa_ctx batch buffers.
3710 *
3711 * The number of WAs applied is not known at the beginning; we use this field
3712 * to return the number of DWORDS written.
3713 *
3714 * Note that this batch does not contain MI_BATCH_BUFFER_END,
3715 * so it adds NOOPs as padding to make it cacheline aligned.
3716 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
3717 * together make a complete batch buffer.
3718 */
3719static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3720{
3721	/* WaDisableCtxRestoreArbitration:bdw,chv */
3722	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3723
3724	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3725	if (IS_BROADWELL(engine->i915))
3726		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3727
3728	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3729	/* Actual scratch location is at 128 bytes offset */
3730	batch = gen8_emit_pipe_control(batch,
3731				       PIPE_CONTROL_FLUSH_L3 |
3732				       PIPE_CONTROL_STORE_DATA_INDEX |
3733				       PIPE_CONTROL_CS_STALL |
3734				       PIPE_CONTROL_QW_WRITE,
3735				       LRC_PPHWSP_SCRATCH_ADDR);
3736
3737	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3738
3739	/* Pad to end of cacheline */
3740	while ((unsigned long)batch % CACHELINE_BYTES)
3741		*batch++ = MI_NOOP;
3742
3743	/*
3744	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
3745	 * execution depends on the length specified, in terms of cache lines,
3746	 * in the CTX_RCS_INDIRECT_CTX register.
3747	 */
3748
3749	return batch;
3750}
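
/*
 * Worked example, assuming CACHELINE_BYTES is 64: if the workarounds above
 * amount to, say, 20 dwords (80 bytes), the padding loop rounds the batch up
 * to 128 bytes, i.e. 2 cachelines. That cacheline-aligned size is what is
 * later handed to lrc_ring_setup_indirect_ctx() via init_wa_bb_reg_state(),
 * and, as noted above, CTX_RCS_INDIRECT_CTX describes the batch length in
 * cache lines rather than relying on MI_BATCH_BUFFER_END.
 */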
3751
3752struct lri {
3753	i915_reg_t reg;
3754	u32 value;
3755};
3756
3757static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3758{
3759	GEM_BUG_ON(!count || count > 63);
3760
3761	*batch++ = MI_LOAD_REGISTER_IMM(count);
3762	do {
3763		*batch++ = i915_mmio_reg_offset(lri->reg);
3764		*batch++ = lri->value;
3765	} while (lri++, --count);
3766	*batch++ = MI_NOOP;
3767
3768	return batch;
3769}
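
/*
 * Minimal usage sketch for emit_lri(), mirroring the gen9 caller below;
 * the table contents here are purely illustrative:
 *
 *	static const struct lri fix[] = {
 *		{ COMMON_SLICE_CHICKEN2, __MASKED_FIELD(bit, 0) },
 *	};
 *	batch = emit_lri(batch, fix, ARRAY_SIZE(fix));
 *
 * This emits MI_LOAD_REGISTER_IMM(count) followed by one offset/value pair
 * per table entry and a trailing MI_NOOP; the count must be between 1 and
 * 63, as enforced by the GEM_BUG_ON above.
 */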
3770
3771static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3772{
3773	static const struct lri lri[] = {
3774		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3775		{
3776			COMMON_SLICE_CHICKEN2,
3777			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3778				       0),
3779		},
3780
3781		/* BSpec: 11391 */
3782		{
3783			FF_SLICE_CHICKEN,
3784			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3785				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3786		},
3787
3788		/* BSpec: 11299 */
3789		{
3790			_3D_CHICKEN3,
3791			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3792				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3793		}
3794	};
3795
3796	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3797
3798	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3799	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3800
3801	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3802	batch = gen8_emit_pipe_control(batch,
3803				       PIPE_CONTROL_FLUSH_L3 |
3804				       PIPE_CONTROL_STORE_DATA_INDEX |
3805				       PIPE_CONTROL_CS_STALL |
3806				       PIPE_CONTROL_QW_WRITE,
3807				       LRC_PPHWSP_SCRATCH_ADDR);
3808
3809	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3810
3811	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3812	if (HAS_POOLED_EU(engine->i915)) {
3813		/*
3814		 * The EU pool configuration is set up along with the golden context
3815		 * during context initialization. This value depends on the
3816		 * device type (2x6 or 3x6) and needs to be updated based
3817		 * on which subslice is disabled, especially for 2x6
3818		 * devices. However, it is safe to load the default
3819		 * configuration of a 3x6 device instead of masking off the
3820		 * corresponding bits, because the HW ignores bits of a disabled
3821		 * subslice and drops down to the appropriate config. Please
3822		 * see render_state_setup() in i915_gem_render_state.c for
3823		 * possible configurations; to avoid duplication they are
3824		 * not shown here again.
3825		 */
3826		*batch++ = GEN9_MEDIA_POOL_STATE;
3827		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3828		*batch++ = 0x00777000;
3829		*batch++ = 0;
3830		*batch++ = 0;
3831		*batch++ = 0;
3832	}
3833
3834	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3835
3836	/* Pad to end of cacheline */
3837	while ((unsigned long)batch % CACHELINE_BYTES)
3838		*batch++ = MI_NOOP;
3839
3840	return batch;
3841}
3842
3843static u32 *
3844gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3845{
3846	int i;
3847
3848	/*
3849	 * WaPipeControlBefore3DStateSamplePattern: cnl
3850	 *
3851	 * Ensure the engine is idle prior to programming a
3852	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3853	 */
3854	batch = gen8_emit_pipe_control(batch,
3855				       PIPE_CONTROL_CS_STALL,
3856				       0);
3857	/*
3858	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3859	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3860	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3861	 * confusing. Since gen8_emit_pipe_control() already advances the
3862	 * batch by 6 dwords, we advance the other 10 here, completing a
3863	 * cacheline. It's not clear if the workaround requires this padding
3864	 * before other commands, or if it's just the regular padding we would
3865	 * already have for the workaround bb, so leave it here for now.
3866	 */
3867	for (i = 0; i < 10; i++)
3868		*batch++ = MI_NOOP;
3869
3870	/* Pad to end of cacheline */
3871	while ((unsigned long)batch % CACHELINE_BYTES)
3872		*batch++ = MI_NOOP;
3873
3874	return batch;
3875}
3876
3877#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3878
3879static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3880{
3881	struct drm_i915_gem_object *obj;
3882	struct i915_vma *vma;
3883	int err;
3884
3885	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3886	if (IS_ERR(obj))
3887		return PTR_ERR(obj);
3888
3889	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3890	if (IS_ERR(vma)) {
3891		err = PTR_ERR(vma);
3892		goto err;
3893	}
3894
3895	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3896	if (err)
3897		goto err;
3898
3899	engine->wa_ctx.vma = vma;
3900	return 0;
3901
3902err:
3903	i915_gem_object_put(obj);
3904	return err;
3905}
3906
3907static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3908{
3909	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3910}
3911
3912typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3913
3914static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3915{
3916	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3917	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3918					    &wa_ctx->per_ctx };
3919	wa_bb_func_t wa_bb_fn[2];
3920	void *batch, *batch_ptr;
3921	unsigned int i;
3922	int ret;
3923
3924	if (engine->class != RENDER_CLASS)
3925		return 0;
3926
3927	switch (INTEL_GEN(engine->i915)) {
3928	case 12:
3929	case 11:
3930		return 0;
3931	case 10:
3932		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3933		wa_bb_fn[1] = NULL;
3934		break;
3935	case 9:
3936		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3937		wa_bb_fn[1] = NULL;
3938		break;
3939	case 8:
3940		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3941		wa_bb_fn[1] = NULL;
3942		break;
3943	default:
3944		MISSING_CASE(INTEL_GEN(engine->i915));
3945		return 0;
3946	}
3947
3948	ret = lrc_setup_wa_ctx(engine);
3949	if (ret) {
3950		drm_dbg(&engine->i915->drm,
3951			"Failed to setup context WA page: %d\n", ret);
3952		return ret;
3953	}
3954
3955	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
3956
3957	/*
3958	 * Emit the two workaround batch buffers, recording the offset from the
3959	 * start of the workaround batch buffer object for each and their
3960	 * respective sizes.
3961	 */
3962	batch_ptr = batch;
3963	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3964		wa_bb[i]->offset = batch_ptr - batch;
3965		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3966						  CACHELINE_BYTES))) {
3967			ret = -EINVAL;
3968			break;
3969		}
3970		if (wa_bb_fn[i])
3971			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3972		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3973	}
3974	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3975
3976	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
3977	__i915_gem_object_release_map(wa_ctx->vma->obj);
3978	if (ret)
3979		lrc_destroy_wa_ctx(engine);
3980
3981	return ret;
3982}
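
/*
 * Resulting layout sketch: for gen8-10 only wa_bb_fn[0] is populated, so
 * wa_ctx->indirect_ctx starts at offset 0 of the page while wa_ctx->per_ctx
 * immediately follows it with size 0 (no per-ctx workarounds are emitted
 * here). The recorded offsets are relative to wa_ctx->vma, and only the
 * non-empty batches are later programmed into the context image by
 * init_wa_bb_reg_state().
 */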
3983
3984static void reset_csb_pointers(struct intel_engine_cs *engine)
3985{
3986	struct intel_engine_execlists * const execlists = &engine->execlists;
3987	const unsigned int reset_value = execlists->csb_size - 1;
3988
3989	ring_set_paused(engine, 0);
3990
3991	/*
3992	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3993	 * Bludgeon them with a mmio update to be sure.
3994	 */
3995	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3996		     0xffff << 16 | reset_value << 8 | reset_value);
3997	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3998
3999	/*
4000	 * After a reset, the HW starts writing into CSB entry [0]. We
4001	 * therefore have to set our HEAD pointer back one entry so that
4002	 * the *first* entry we check is entry 0. To complicate this further,
4003	 * as we don't wait for the first interrupt after reset, we have to
4004	 * fake the HW write to point back to the last entry so that our
4005	 * inline comparison of our cached head position against the last HW
4006	 * write works even before the first interrupt.
4007	 */
4008	execlists->csb_head = reset_value;
4009	WRITE_ONCE(*execlists->csb_write, reset_value);
4010	wmb(); /* Make sure this is visible to HW (paranoia?) */
4011
4012	invalidate_csb_entries(&execlists->csb_status[0],
4013			       &execlists->csb_status[reset_value]);
4014
4015	/* Once more for luck and our trusty paranoia */
4016	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4017		     0xffff << 16 | reset_value << 8 | reset_value);
4018	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4019
4020	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4021}
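
/*
 * Worked example: with GEN8_CSB_ENTRIES (6) reset_value is 5, and with
 * GEN11_CSB_ENTRIES (12) it is 11; see the csb_size selection in
 * intel_execlists_submission_setup() below. Parking csb_head on the last
 * entry means the first entry we consume once the write pointer advances
 * is entry 0, matching where the HW begins writing after a reset.
 */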
4022
4023static void execlists_sanitize(struct intel_engine_cs *engine)
4024{
4025	/*
4026	 * Poison residual state on resume, in case the suspend didn't!
4027	 *
4028	 * We have to assume that across suspend/resume (or other loss
4029	 * of control) the contents of our pinned buffers have been
4030	 * lost, replaced by garbage. Since this doesn't always happen,
4031	 * let's poison such state so that we more quickly spot when
4032	 * we falsely assume it has been preserved.
4033	 */
4034	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4035		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4036
4037	reset_csb_pointers(engine);
4038
4039	/*
4040	 * The kernel_context HWSP is stored in the status_page. As above,
4041	 * that may be lost on resume/initialisation, and so we need to
4042	 * reset the value in the HWSP.
4043	 */
4044	intel_timeline_reset_seqno(engine->kernel_context->timeline);
4045
4046	/* And scrub the dirty cachelines for the HWSP */
4047	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4048}
4049
4050static void enable_error_interrupt(struct intel_engine_cs *engine)
4051{
4052	u32 status;
4053
4054	engine->execlists.error_interrupt = 0;
4055	ENGINE_WRITE(engine, RING_EMR, ~0u);
4056	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4057
4058	status = ENGINE_READ(engine, RING_ESR);
4059	if (unlikely(status)) {
4060		drm_err(&engine->i915->drm,
4061			"engine '%s' resumed still in error: %08x\n",
4062			engine->name, status);
4063		__intel_gt_reset(engine->gt, engine->mask);
4064	}
4065
4066	/*
4067	 * On current gen8+, we have 2 signals to play with
4068	 *
4069	 * - I915_ERROR_INSTRUCTION (bit 0)
4070	 *
4071	 *    Generate an error if the command parser encounters an invalid
4072	 *    instruction
4073	 *
4074	 *    This is a fatal error.
4075	 *
4076	 * - CP_PRIV (bit 2)
4077	 *
4078	 *    Generate an error on privilege violation (where the CP replaces
4079	 *    the instruction with a no-op). This also fires for writes into
4080	 *    read-only scratch pages.
4081	 *
4082	 *    This is a non-fatal error, parsing continues.
4083	 *
4084	 * * there are a few others defined for odd HW that we do not use
4085	 *
4086	 * Since CP_PRIV fires for cases where we have chosen to ignore the
4087	 * error (as the HW is validating and suppressing the mistakes), we
4088	 * only unmask the instruction error bit.
4089	 */
4090	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4091}
4092
4093static void enable_execlists(struct intel_engine_cs *engine)
4094{
4095	u32 mode;
4096
4097	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4098
4099	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4100
4101	if (INTEL_GEN(engine->i915) >= 11)
4102		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4103	else
4104		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4105	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4106
4107	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4108
4109	ENGINE_WRITE_FW(engine,
4110			RING_HWS_PGA,
4111			i915_ggtt_offset(engine->status_page.vma));
4112	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4113
4114	enable_error_interrupt(engine);
4115
4116	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4117}
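
/*
 * A note on the arithmetic above: GENMASK(BITS_PER_LONG - 2, 0) sets bits
 * 0..BITS_PER_LONG-2, so on a 64-bit build engine->context_tag starts out
 * with 63 available tags and the top bit of the bitmap left clear.
 */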
4118
4119static bool unexpected_starting_state(struct intel_engine_cs *engine)
4120{
4121	bool unexpected = false;
4122
4123	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4124		drm_dbg(&engine->i915->drm,
4125			"STOP_RING still set in RING_MI_MODE\n");
4126		unexpected = true;
4127	}
4128
4129	return unexpected;
4130}
4131
4132static int execlists_resume(struct intel_engine_cs *engine)
4133{
4134	intel_mocs_init_engine(engine);
4135
4136	intel_engine_reset_breadcrumbs(engine);
4137
4138	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4139		struct drm_printer p = drm_debug_printer(__func__);
4140
4141		intel_engine_dump(engine, &p, NULL);
4142	}
4143
4144	enable_execlists(engine);
4145
4146	return 0;
4147}
4148
4149static void execlists_reset_prepare(struct intel_engine_cs *engine)
4150{
4151	struct intel_engine_execlists * const execlists = &engine->execlists;
4152	unsigned long flags;
4153
4154	ENGINE_TRACE(engine, "depth<-%d\n",
4155		     atomic_read(&execlists->tasklet.count));
4156
4157	/*
4158	 * Prevent request submission to the hardware until we have
4159	 * completed the reset in i915_gem_reset_finish(). If a request
4160	 * is completed by one engine, it may then queue a request
4161	 * to a second via its execlists->tasklet *just* as we are
4162	 * calling engine->resume() and also writing the ELSP.
4163	 * Turning off the execlists->tasklet until the reset is over
4164	 * prevents the race.
4165	 */
4166	__tasklet_disable_sync_once(&execlists->tasklet);
4167	GEM_BUG_ON(!reset_in_progress(execlists));
4168
4169	/* And flush any current direct submission. */
4170	spin_lock_irqsave(&engine->active.lock, flags);
4171	spin_unlock_irqrestore(&engine->active.lock, flags);
4172
4173	/*
4174	 * We stop the engines, otherwise we might get a failed reset and a
4175	 * dead gpu (on elk). Also a gpu as modern as kbl can suffer
4176	 * from a system hang if a batchbuffer is progressing when
4177	 * the reset is issued, regardless of the READY_TO_RESET ack.
4178	 * Thus we assume it is best to stop the engines on all gens
4179	 * where we have a gpu reset.
4180	 *
4181	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4182	 *
4183	 * FIXME: Wa for more modern gens needs to be validated
4184	 */
4185	ring_set_paused(engine, 1);
4186	intel_engine_stop_cs(engine);
4187
4188	engine->execlists.reset_ccid = active_ccid(engine);
4189}
4190
4191static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4192{
4193	int x;
4194
4195	x = lrc_ring_mi_mode(engine);
4196	if (x != -1) {
4197		regs[x + 1] &= ~STOP_RING;
4198		regs[x + 1] |= STOP_RING << 16;
4199	}
4200}
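
/*
 * A short note on the masked write above: RING_MI_MODE is a masked
 * register, so the high 16 bits of the value dword select which bits the
 * HW updates. Clearing STOP_RING in the low half while setting
 * STOP_RING << 16 in the mask half asks the context restore to explicitly
 * clear STOP_RING rather than leaving it untouched.
 */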
4201
4202static void __execlists_reset_reg_state(const struct intel_context *ce,
4203					const struct intel_engine_cs *engine)
4204{
4205	u32 *regs = ce->lrc_reg_state;
4206
4207	__reset_stop_ring(regs, engine);
4208}
4209
4210static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4211{
4212	struct intel_engine_execlists * const execlists = &engine->execlists;
4213	struct intel_context *ce;
4214	struct i915_request *rq;
4215	u32 head;
4216
4217	mb(); /* paranoia: read the CSB pointers from after the reset */
4218	clflush(execlists->csb_write);
4219	mb();
4220
4221	process_csb(engine); /* drain preemption events */
4222
4223	/* Following the reset, we need to reload the CSB read/write pointers */
4224	reset_csb_pointers(engine);
4225
4226	/*
4227	 * Save the currently executing context, even if we completed
4228	 * its request, it was still running at the time of the
4229	 * reset and will have been clobbered.
4230	 */
4231	rq = active_context(engine, engine->execlists.reset_ccid);
4232	if (!rq)
4233		goto unwind;
4234
4235	ce = rq->context;
4236	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4237
4238	if (i915_request_completed(rq)) {
4239		/* Idle context; tidy up the ring so we can restart afresh */
4240		head = intel_ring_wrap(ce->ring, rq->tail);
4241		goto out_replay;
4242	}
4243
4244	/* We still have requests in-flight; the engine should be active */
4245	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4246
4247	/* Context has requests still in-flight; it should not be idle! */
4248	GEM_BUG_ON(i915_active_is_idle(&ce->active));
4249
4250	rq = active_request(ce->timeline, rq);
4251	head = intel_ring_wrap(ce->ring, rq->head);
4252	GEM_BUG_ON(head == ce->ring->tail);
4253
4254	/*
4255	 * If this request hasn't started yet, e.g. it is waiting on a
4256	 * semaphore, we need to avoid skipping the request or else we
4257	 * break the signaling chain. However, if the context is corrupt
4258	 * the request will not restart and we will be stuck with a wedged
4259	 * device. It is quite often the case that if we issue a reset
4260	 * while the GPU is loading the context image, that the context
4261	 * image becomes corrupt.
4262	 *
4263	 * Otherwise, if we have not started yet, the request should replay
4264	 * perfectly and we do not need to flag the result as being erroneous.
4265	 */
4266	if (!i915_request_started(rq))
4267		goto out_replay;
4268
4269	/*
4270	 * If the request was innocent, we leave the request in the ELSP
4271	 * and will try to replay it on restarting. The context image may
4272	 * have been corrupted by the reset, in which case we may have
4273	 * to service a new GPU hang, but more likely we can continue on
4274	 * without impact.
4275	 *
4276	 * If the request was guilty, we presume the context is corrupt
4277	 * and have to at least restore the RING register in the context
4278	 * image back to the expected values to skip over the guilty request.
4279	 */
4280	__i915_request_reset(rq, stalled);
4281
4282	/*
4283	 * We want a simple context + ring to execute the breadcrumb update.
4284	 * We cannot rely on the context being intact across the GPU hang,
4285	 * so clear it and rebuild just what we need for the breadcrumb.
4286	 * All pending requests for this context will be zapped, and any
4287	 * future request will be after userspace has had the opportunity
4288	 * to recreate its own state.
4289	 */
4290out_replay:
4291	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4292		     head, ce->ring->tail);
4293	__execlists_reset_reg_state(ce, engine);
4294	__execlists_update_reg_state(ce, engine, head);
4295	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4296
4297unwind:
4298	/* Push back any incomplete requests for replay after the reset. */
4299	cancel_port_requests(execlists);
4300	__unwind_incomplete_requests(engine);
4301}
4302
4303static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4304{
4305	unsigned long flags;
4306
4307	ENGINE_TRACE(engine, "\n");
4308
4309	spin_lock_irqsave(&engine->active.lock, flags);
4310
4311	__execlists_reset(engine, stalled);
4312
4313	spin_unlock_irqrestore(&engine->active.lock, flags);
4314}
4315
4316static void nop_submission_tasklet(unsigned long data)
4317{
4318	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4319
4320	/* The driver is wedged; don't process any more events. */
4321	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4322}
4323
4324static void execlists_reset_cancel(struct intel_engine_cs *engine)
4325{
4326	struct intel_engine_execlists * const execlists = &engine->execlists;
4327	struct i915_request *rq, *rn;
4328	struct rb_node *rb;
4329	unsigned long flags;
4330
4331	ENGINE_TRACE(engine, "\n");
4332
4333	/*
4334	 * Before we call engine->cancel_requests(), we should have exclusive
4335	 * access to the submission state. This is arranged for us by the
4336	 * caller disabling the interrupt generation, the tasklet and other
4337	 * threads that may then access the same state, giving us a free hand
4338	 * to reset state. However, we still need to let lockdep be aware that
4339	 * we know this state may be accessed in hardirq context, so we
4340	 * disable the irq around this manipulation and we want to keep
4341	 * the spinlock focused on its duties and not accidentally conflate
4342	 * coverage to the submission's irq state. (Similarly, although we
4343	 * shouldn't need to disable irq around the manipulation of the
4344	 * submission's irq state, we also wish to remind ourselves that
4345	 * it is irq state.)
4346	 */
4347	spin_lock_irqsave(&engine->active.lock, flags);
4348
4349	__execlists_reset(engine, true);
4350
4351	/* Mark all executing requests as skipped. */
4352	list_for_each_entry(rq, &engine->active.requests, sched.link)
4353		mark_eio(rq);
4354
4355	/* Flush the queued requests to the timeline list (for retiring). */
4356	while ((rb = rb_first_cached(&execlists->queue))) {
4357		struct i915_priolist *p = to_priolist(rb);
4358		int i;
4359
4360		priolist_for_each_request_consume(rq, rn, p, i) {
4361			mark_eio(rq);
4362			__i915_request_submit(rq);
4363		}
4364
4365		rb_erase_cached(&p->node, &execlists->queue);
4366		i915_priolist_free(p);
4367	}
4368
4369	/* On-hold requests will be flushed to timeline upon their release */
4370	list_for_each_entry(rq, &engine->active.hold, sched.link)
4371		mark_eio(rq);
4372
4373	/* Cancel all attached virtual engines */
4374	while ((rb = rb_first_cached(&execlists->virtual))) {
4375		struct virtual_engine *ve =
4376			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4377
4378		rb_erase_cached(rb, &execlists->virtual);
4379		RB_CLEAR_NODE(rb);
4380
4381		spin_lock(&ve->base.active.lock);
4382		rq = fetch_and_zero(&ve->request);
4383		if (rq) {
4384			mark_eio(rq);
4385
4386			rq->engine = engine;
4387			__i915_request_submit(rq);
4388			i915_request_put(rq);
4389
4390			ve->base.execlists.queue_priority_hint = INT_MIN;
4391		}
4392		spin_unlock(&ve->base.active.lock);
4393	}
4394
4395	/* Remaining _unready_ requests will be nop'ed when submitted */
4396
4397	execlists->queue_priority_hint = INT_MIN;
4398	execlists->queue = RB_ROOT_CACHED;
4399
4400	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4401	execlists->tasklet.func = nop_submission_tasklet;
4402
4403	spin_unlock_irqrestore(&engine->active.lock, flags);
4404}
4405
4406static void execlists_reset_finish(struct intel_engine_cs *engine)
4407{
4408	struct intel_engine_execlists * const execlists = &engine->execlists;
4409
4410	/*
4411	 * After a GPU reset, we may have requests to replay. Do so now while
4412	 * we still have the forcewake to be sure that the GPU is not allowed
4413	 * to sleep before we restart and reload a context.
4414	 */
4415	GEM_BUG_ON(!reset_in_progress(execlists));
4416	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4417		execlists->tasklet.func(execlists->tasklet.data);
4418
4419	if (__tasklet_enable(&execlists->tasklet))
4420		/* And kick in case we missed a new request submission. */
4421		tasklet_hi_schedule(&execlists->tasklet);
4422	ENGINE_TRACE(engine, "depth->%d\n",
4423		     atomic_read(&execlists->tasklet.count));
4424}
4425
4426static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4427				    u64 offset, u32 len,
4428				    const unsigned int flags)
4429{
4430	u32 *cs;
4431
4432	cs = intel_ring_begin(rq, 4);
4433	if (IS_ERR(cs))
4434		return PTR_ERR(cs);
4435
4436	/*
4437	 * WaDisableCtxRestoreArbitration:bdw,chv
4438	 *
4439	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4440	 * particular on all the gens that do not need the w/a at all!); if we
4441	 * took care to make sure that on every switch into this context
4442	 * (both ordinary and for preemption) arbitration was enabled,
4443	 * we would be fine.  However, for gen8 there is another w/a that
4444	 * requires us to not preempt inside GPGPU execution, so we keep
4445	 * arbitration disabled for gen8 batches. Arbitration will be
4446	 * re-enabled before we close the request
4447	 * (engine->emit_fini_breadcrumb).
4448	 */
4449	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4450
4451	/* FIXME(BDW+): Address space and security selectors. */
4452	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4453		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4454	*cs++ = lower_32_bits(offset);
4455	*cs++ = upper_32_bits(offset);
4456
4457	intel_ring_advance(rq, cs);
4458
4459	return 0;
4460}
4461
4462static int gen8_emit_bb_start(struct i915_request *rq,
4463			      u64 offset, u32 len,
4464			      const unsigned int flags)
4465{
4466	u32 *cs;
4467
4468	cs = intel_ring_begin(rq, 6);
4469	if (IS_ERR(cs))
4470		return PTR_ERR(cs);
4471
4472	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4473
4474	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
4475		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4476	*cs++ = lower_32_bits(offset);
4477	*cs++ = upper_32_bits(offset);
4478
4479	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4480	*cs++ = MI_NOOP;
4481
4482	intel_ring_advance(rq, cs);
4483
4484	return 0;
4485}
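
/*
 * In both variants above, BIT(8) of MI_BATCH_BUFFER_START_GEN8 is believed
 * to act as the address-space selector: it is set for ordinary batches so
 * that they execute from the context's ppgtt, and left clear for
 * I915_DISPATCH_SECURE batches, which are dispatched from the GGTT; hence
 * the FIXME above about address space and security selectors.
 */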
4486
4487static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4488{
4489	ENGINE_WRITE(engine, RING_IMR,
4490		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4491	ENGINE_POSTING_READ(engine, RING_IMR);
4492}
4493
4494static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4495{
4496	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4497}
4498
4499static int gen8_emit_flush(struct i915_request *request, u32 mode)
4500{
4501	u32 cmd, *cs;
4502
4503	cs = intel_ring_begin(request, 4);
4504	if (IS_ERR(cs))
4505		return PTR_ERR(cs);
4506
4507	cmd = MI_FLUSH_DW + 1;
4508
4509	/* We always require a command barrier so that subsequent
4510	 * commands, such as breadcrumb interrupts, are strictly ordered
4511	 * wrt the contents of the write cache being flushed to memory
4512	 * (and thus being coherent from the CPU).
4513	 */
4514	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4515
4516	if (mode & EMIT_INVALIDATE) {
4517		cmd |= MI_INVALIDATE_TLB;
4518		if (request->engine->class == VIDEO_DECODE_CLASS)
4519			cmd |= MI_INVALIDATE_BSD;
4520	}
4521
4522	*cs++ = cmd;
4523	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4524	*cs++ = 0; /* upper addr */
4525	*cs++ = 0; /* value */
4526	intel_ring_advance(request, cs);
4527
4528	return 0;
4529}
4530
4531static int gen8_emit_flush_render(struct i915_request *request,
4532				  u32 mode)
4533{
4534	bool vf_flush_wa = false, dc_flush_wa = false;
4535	u32 *cs, flags = 0;
4536	int len;
4537
4538	flags |= PIPE_CONTROL_CS_STALL;
4539
4540	if (mode & EMIT_FLUSH) {
4541		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4542		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4543		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4544		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4545	}
4546
4547	if (mode & EMIT_INVALIDATE) {
4548		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4549		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4550		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4551		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4552		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4553		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4554		flags |= PIPE_CONTROL_QW_WRITE;
4555		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4556
4557		/*
4558		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4559		 * pipe control.
4560		 */
4561		if (IS_GEN(request->engine->i915, 9))
4562			vf_flush_wa = true;
4563
4564		/* WaForGAMHang:kbl */
4565		if (IS_KBL_REVID(request->engine->i915, 0, KBL_REVID_B0))
4566			dc_flush_wa = true;
4567	}
4568
4569	len = 6;
4570
4571	if (vf_flush_wa)
4572		len += 6;
4573
4574	if (dc_flush_wa)
4575		len += 12;
4576
4577	cs = intel_ring_begin(request, len);
4578	if (IS_ERR(cs))
4579		return PTR_ERR(cs);
4580
4581	if (vf_flush_wa)
4582		cs = gen8_emit_pipe_control(cs, 0, 0);
4583
4584	if (dc_flush_wa)
4585		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4586					    0);
4587
4588	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4589
4590	if (dc_flush_wa)
4591		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4592
4593	intel_ring_advance(request, cs);
4594
4595	return 0;
4596}
4597
4598static int gen11_emit_flush_render(struct i915_request *request,
4599				   u32 mode)
4600{
4601	if (mode & EMIT_FLUSH) {
4602		u32 *cs;
4603		u32 flags = 0;
4604
4605		flags |= PIPE_CONTROL_CS_STALL;
4606
4607		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4608		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4609		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4610		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4611		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4612		flags |= PIPE_CONTROL_QW_WRITE;
4613		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4614
4615		cs = intel_ring_begin(request, 6);
4616		if (IS_ERR(cs))
4617			return PTR_ERR(cs);
4618
4619		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4620		intel_ring_advance(request, cs);
4621	}
4622
4623	if (mode & EMIT_INVALIDATE) {
4624		u32 *cs;
4625		u32 flags = 0;
4626
4627		flags |= PIPE_CONTROL_CS_STALL;
4628
4629		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4630		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4631		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4632		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4633		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4634		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4635		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4636		flags |= PIPE_CONTROL_QW_WRITE;
4637		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4638
4639		cs = intel_ring_begin(request, 6);
4640		if (IS_ERR(cs))
4641			return PTR_ERR(cs);
4642
4643		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4644		intel_ring_advance(request, cs);
4645	}
4646
4647	return 0;
4648}
4649
4650static u32 preparser_disable(bool state)
4651{
4652	return MI_ARB_CHECK | 1 << 8 | state;
4653}
4654
4655static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4656{
4657	static const i915_reg_t vd[] = {
4658		GEN12_VD0_AUX_NV,
4659		GEN12_VD1_AUX_NV,
4660		GEN12_VD2_AUX_NV,
4661		GEN12_VD3_AUX_NV,
4662	};
4663
4664	static const i915_reg_t ve[] = {
4665		GEN12_VE0_AUX_NV,
4666		GEN12_VE1_AUX_NV,
4667	};
4668
4669	if (engine->class == VIDEO_DECODE_CLASS)
4670		return vd[engine->instance];
4671
4672	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4673		return ve[engine->instance];
4674
4675	GEM_BUG_ON("unknown aux_inv_reg\n");
4676
4677	return INVALID_MMIO_REG;
4678}
4679
4680static u32 *
4681gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4682{
4683	*cs++ = MI_LOAD_REGISTER_IMM(1);
4684	*cs++ = i915_mmio_reg_offset(inv_reg);
4685	*cs++ = AUX_INV;
4686	*cs++ = MI_NOOP;
4687
4688	return cs;
4689}
4690
4691static int gen12_emit_flush_render(struct i915_request *request,
4692				   u32 mode)
4693{
4694	if (mode & EMIT_FLUSH) {
4695		u32 flags = 0;
4696		u32 *cs;
4697
4698		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4699		flags |= PIPE_CONTROL_FLUSH_L3;
4700		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4701		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4702		/* Wa_1409600907:tgl */
4703		flags |= PIPE_CONTROL_DEPTH_STALL;
4704		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4705		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4706
4707		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4708		flags |= PIPE_CONTROL_QW_WRITE;
4709
4710		flags |= PIPE_CONTROL_CS_STALL;
4711
4712		cs = intel_ring_begin(request, 6);
4713		if (IS_ERR(cs))
4714			return PTR_ERR(cs);
4715
4716		cs = gen12_emit_pipe_control(cs,
4717					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4718					     flags, LRC_PPHWSP_SCRATCH_ADDR);
4719		intel_ring_advance(request, cs);
4720	}
4721
4722	if (mode & EMIT_INVALIDATE) {
4723		u32 flags = 0;
4724		u32 *cs;
4725
4726		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4727		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4728		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4729		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4730		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4731		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4732		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4733
4734		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4735		flags |= PIPE_CONTROL_QW_WRITE;
4736
4737		flags |= PIPE_CONTROL_CS_STALL;
4738
4739		cs = intel_ring_begin(request, 8 + 4);
4740		if (IS_ERR(cs))
4741			return PTR_ERR(cs);
4742
4743		/*
4744		 * Prevent the pre-parser from skipping past the TLB
4745		 * invalidate and loading a stale page for the batch
4746		 * buffer / request payload.
4747		 */
4748		*cs++ = preparser_disable(true);
4749
4750		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4751
4752		/* hsdes: 1809175790 */
4753		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4754
4755		*cs++ = preparser_disable(false);
4756		intel_ring_advance(request, cs);
4757	}
4758
4759	return 0;
4760}
4761
4762static int gen12_emit_flush(struct i915_request *request, u32 mode)
4763{
4764	intel_engine_mask_t aux_inv = 0;
4765	u32 cmd, *cs;
4766
4767	if (mode & EMIT_INVALIDATE)
4768		aux_inv = request->engine->mask & ~BIT(BCS0);
4769
4770	cs = intel_ring_begin(request,
4771			      4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0));
4772	if (IS_ERR(cs))
4773		return PTR_ERR(cs);
4774
4775	cmd = MI_FLUSH_DW + 1;
4776
4777	/* We always require a command barrier so that subsequent
4778	 * commands, such as breadcrumb interrupts, are strictly ordered
4779	 * wrt the contents of the write cache being flushed to memory
4780	 * (and thus being coherent from the CPU).
4781	 */
4782	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4783
4784	if (mode & EMIT_INVALIDATE) {
4785		cmd |= MI_INVALIDATE_TLB;
4786		if (request->engine->class == VIDEO_DECODE_CLASS)
4787			cmd |= MI_INVALIDATE_BSD;
4788	}
4789
4790	*cs++ = cmd;
4791	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4792	*cs++ = 0; /* upper addr */
4793	*cs++ = 0; /* value */
4794
4795	if (aux_inv) { /* hsdes: 1809175790 */
4796		struct intel_engine_cs *engine;
4797		unsigned int tmp;
4798
4799		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4800		for_each_engine_masked(engine, request->engine->gt,
4801				       aux_inv, tmp) {
4802			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4803			*cs++ = AUX_INV;
4804		}
4805		*cs++ = MI_NOOP;
4806	}
4807	intel_ring_advance(request, cs);
4808
4809	return 0;
4810}
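
/*
 * Ring-space accounting for the function above: 4 dwords cover the
 * MI_FLUSH_DW packet itself; when aux_inv is non-zero we additionally need
 * one MI_LOAD_REGISTER_IMM header, one offset/value pair per engine in the
 * mask and a trailing MI_NOOP, i.e. 2 * hweight8(aux_inv) + 2 dwords,
 * matching the intel_ring_begin() request at the top.
 */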
4811
4812static void assert_request_valid(struct i915_request *rq)
4813{
4814	struct intel_ring *ring __maybe_unused = rq->ring;
4815
4816	/* Can we unwind this request without appearing to go forwards? */
4817	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4818}
4819
4820/*
4821 * Reserve space for 2 NOOPs at the end of each request to be
4822 * used as a workaround for not being allowed to do lite
4823 * restore with HEAD==TAIL (WaIdleLiteRestore).
4824 */
4825static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4826{
4827	/* Ensure there's always at least one preemption point per-request. */
4828	*cs++ = MI_ARB_CHECK;
4829	*cs++ = MI_NOOP;
4830	request->wa_tail = intel_ring_offset(request, cs);
4831
4832	/* Check that entire request is less than half the ring */
4833	assert_request_valid(request);
4834
4835	return cs;
4836}
4837
4838static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4839{
4840	*cs++ = MI_SEMAPHORE_WAIT |
4841		MI_SEMAPHORE_GLOBAL_GTT |
4842		MI_SEMAPHORE_POLL |
4843		MI_SEMAPHORE_SAD_EQ_SDD;
4844	*cs++ = 0;
4845	*cs++ = intel_hws_preempt_address(request->engine);
4846	*cs++ = 0;
4847
4848	return cs;
4849}
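
/*
 * The busywait above polls the per-engine preemption slot in the HWSP
 * (intel_hws_preempt_address()); with MI_SEMAPHORE_SAD_EQ_SDD and a
 * semaphore data dword of 0, the CS keeps spinning for as long as that
 * slot holds a non-zero value. ring_set_paused(), used by the reset paths
 * above, writes that slot to stall or release requests parked here.
 */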
4850
4851static __always_inline u32*
4852gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4853{
4854	*cs++ = MI_USER_INTERRUPT;
4855
4856	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4857	if (intel_engine_has_semaphores(request->engine))
4858		cs = emit_preempt_busywait(request, cs);
4859
4860	request->tail = intel_ring_offset(request, cs);
4861	assert_ring_tail_valid(request->ring, request->tail);
4862
4863	return gen8_emit_wa_tail(request, cs);
4864}
4865
4866static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs)
4867{
4868	u32 addr = i915_request_active_timeline(request)->hwsp_offset;
4869
4870	return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0);
4871}
4872
4873static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4874{
4875	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4876}
4877
4878static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4879{
4880	cs = gen8_emit_pipe_control(cs,
4881				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4882				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4883				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4884				    0);
4885
4886	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4887	cs = gen8_emit_ggtt_write_rcs(cs,
4888				      request->fence.seqno,
4889				      i915_request_active_timeline(request)->hwsp_offset,
4890				      PIPE_CONTROL_FLUSH_ENABLE |
4891				      PIPE_CONTROL_CS_STALL);
4892
4893	return gen8_emit_fini_breadcrumb_tail(request, cs);
4894}
4895
4896static u32 *
4897gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4898{
4899	cs = gen8_emit_ggtt_write_rcs(cs,
4900				      request->fence.seqno,
4901				      i915_request_active_timeline(request)->hwsp_offset,
4902				      PIPE_CONTROL_CS_STALL |
4903				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4904				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4905				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4906				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4907				      PIPE_CONTROL_FLUSH_ENABLE);
4908
4909	return gen8_emit_fini_breadcrumb_tail(request, cs);
4910}
4911
4912/*
4913 * Note that the CS instruction pre-parser will not stall on the breadcrumb
4914 * flush and will continue pre-fetching the instructions after it before the
4915 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4916 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4917 * of the next request before the memory has been flushed, we're guaranteed that
4918 * we won't access the batch itself too early.
4919 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4920 * so, if the current request is modifying an instruction in the next request on
4921 * the same intel_context, we might pre-fetch and then execute the pre-update
4922 * instruction. To avoid this, the users of self-modifying code should either
4923 * disable the parser around the code emitting the memory writes, via a new flag
4924 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4925 * the in-kernel use-cases we've opted to use a separate context, see
4926 * reloc_gpu() as an example.
4927 * All the above applies only to the instructions themselves. Non-inline data
4928 * used by the instructions is not pre-fetched.
4929 */
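
/*
 * A minimal sketch of the MI_ARB_CHECK option mentioned above, using the
 * preparser_disable() helper defined earlier in this file (gen12 only):
 *
 *	*cs++ = preparser_disable(true);
 *	... emit the memory writes that patch the upcoming instructions ...
 *	*cs++ = preparser_disable(false);
 *
 * This mirrors what gen12_emit_flush_render() does around its TLB
 * invalidation; the in-kernel users have instead opted for a separate
 * context, as noted above.
 */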
4930
4931static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4932{
4933	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4934		MI_SEMAPHORE_GLOBAL_GTT |
4935		MI_SEMAPHORE_POLL |
4936		MI_SEMAPHORE_SAD_EQ_SDD;
4937	*cs++ = 0;
4938	*cs++ = intel_hws_preempt_address(request->engine);
4939	*cs++ = 0;
4940	*cs++ = 0;
4941	*cs++ = MI_NOOP;
4942
4943	return cs;
4944}
4945
4946static __always_inline u32*
4947gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4948{
4949	*cs++ = MI_USER_INTERRUPT;
4950
4951	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4952	if (intel_engine_has_semaphores(request->engine))
4953		cs = gen12_emit_preempt_busywait(request, cs);
4954
4955	request->tail = intel_ring_offset(request, cs);
4956	assert_ring_tail_valid(request->ring, request->tail);
4957
4958	return gen8_emit_wa_tail(request, cs);
4959}
4960
4961static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4962{
4963	return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4964}
4965
4966static u32 *
4967gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4968{
4969	cs = gen12_emit_ggtt_write_rcs(cs,
4970				       request->fence.seqno,
4971				       i915_request_active_timeline(request)->hwsp_offset,
4972				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4973				       PIPE_CONTROL_CS_STALL |
4974				       PIPE_CONTROL_TILE_CACHE_FLUSH |
4975				       PIPE_CONTROL_FLUSH_L3 |
4976				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4977				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4978				       /* Wa_1409600907:tgl */
4979				       PIPE_CONTROL_DEPTH_STALL |
4980				       PIPE_CONTROL_DC_FLUSH_ENABLE |
4981				       PIPE_CONTROL_FLUSH_ENABLE);
4982
4983	return gen12_emit_fini_breadcrumb_tail(request, cs);
4984}
4985
4986static void execlists_park(struct intel_engine_cs *engine)
4987{
4988	cancel_timer(&engine->execlists.timer);
4989	cancel_timer(&engine->execlists.preempt);
4990}
4991
4992void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4993{
4994	engine->submit_request = execlists_submit_request;
4995	engine->schedule = i915_schedule;
4996	engine->execlists.tasklet.func = execlists_submission_tasklet;
4997
4998	engine->reset.prepare = execlists_reset_prepare;
4999	engine->reset.rewind = execlists_reset_rewind;
5000	engine->reset.cancel = execlists_reset_cancel;
5001	engine->reset.finish = execlists_reset_finish;
5002
5003	engine->park = execlists_park;
5004	engine->unpark = NULL;
5005
5006	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5007	if (!intel_vgpu_active(engine->i915)) {
5008		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5009		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5010			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5011			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5012				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5013		}
5014	}
5015
5016	if (INTEL_GEN(engine->i915) >= 12)
5017		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5018
5019	if (intel_engine_has_preemption(engine))
5020		engine->emit_bb_start = gen8_emit_bb_start;
5021	else
5022		engine->emit_bb_start = gen8_emit_bb_start_noarb;
5023}
5024
5025static void execlists_shutdown(struct intel_engine_cs *engine)
5026{
5027	/* Synchronise with residual timers and any softirq they raise */
5028	del_timer_sync(&engine->execlists.timer);
5029	del_timer_sync(&engine->execlists.preempt);
5030	tasklet_kill(&engine->execlists.tasklet);
5031}
5032
5033static void execlists_release(struct intel_engine_cs *engine)
5034{
5035	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5036
5037	execlists_shutdown(engine);
5038
5039	intel_engine_cleanup_common(engine);
5040	lrc_destroy_wa_ctx(engine);
5041}
5042
5043static void
5044logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5045{
5046	/* Default vfuncs which can be overridden by each engine. */
5047
5048	engine->resume = execlists_resume;
5049
5050	engine->cops = &execlists_context_ops;
5051	engine->request_alloc = execlists_request_alloc;
5052
5053	engine->emit_flush = gen8_emit_flush;
5054	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5055	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5056	if (INTEL_GEN(engine->i915) >= 12) {
5057		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5058		engine->emit_flush = gen12_emit_flush;
5059	}
5060	engine->set_default_submission = intel_execlists_set_default_submission;
5061
5062	if (INTEL_GEN(engine->i915) < 11) {
5063		engine->irq_enable = gen8_logical_ring_enable_irq;
5064		engine->irq_disable = gen8_logical_ring_disable_irq;
5065	} else {
5066		/*
5067		 * TODO: On Gen11 interrupt masks need to be clear
5068		 * to allow C6 entry. Keep interrupts enabled at all
5069		 * times and take the hit of generating extra interrupts
5070		 * until a more refined solution exists.
5071		 */
5072	}
5073}
5074
5075static inline void
5076logical_ring_default_irqs(struct intel_engine_cs *engine)
5077{
5078	unsigned int shift = 0;
5079
5080	if (INTEL_GEN(engine->i915) < 11) {
5081		const u8 irq_shifts[] = {
5082			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
5083			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
5084			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5085			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5086			[VECS0] = GEN8_VECS_IRQ_SHIFT,
5087		};
5088
5089		shift = irq_shifts[engine->id];
5090	}
5091
5092	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5093	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5094	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5095	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5096}
5097
5098static void rcs_submission_override(struct intel_engine_cs *engine)
5099{
5100	switch (INTEL_GEN(engine->i915)) {
5101	case 12:
5102		engine->emit_flush = gen12_emit_flush_render;
5103		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5104		break;
5105	case 11:
5106		engine->emit_flush = gen11_emit_flush_render;
5107		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5108		break;
5109	default:
5110		engine->emit_flush = gen8_emit_flush_render;
5111		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5112		break;
5113	}
5114}
5115
5116int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5117{
5118	struct intel_engine_execlists * const execlists = &engine->execlists;
5119	struct drm_i915_private *i915 = engine->i915;
5120	struct intel_uncore *uncore = engine->uncore;
5121	u32 base = engine->mmio_base;
5122
5123	tasklet_init(&engine->execlists.tasklet,
5124		     execlists_submission_tasklet, (unsigned long)engine);
5125	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5126	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5127
5128	logical_ring_default_vfuncs(engine);
5129	logical_ring_default_irqs(engine);
5130
5131	if (engine->class == RENDER_CLASS)
5132		rcs_submission_override(engine);
5133
5134	if (intel_init_workaround_bb(engine))
5135		/*
5136		 * We continue even if we fail to initialize the WA batch
5137		 * because we only expect rare glitches, nothing
5138		 * critical that would prevent us from using the GPU.
5139		 */
5140		drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5141
5142	if (HAS_LOGICAL_RING_ELSQ(i915)) {
5143		execlists->submit_reg = uncore->regs +
5144			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5145		execlists->ctrl_reg = uncore->regs +
5146			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5147	} else {
5148		execlists->submit_reg = uncore->regs +
5149			i915_mmio_reg_offset(RING_ELSP(base));
5150	}
5151
5152	execlists->csb_status =
5153		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5154
5155	execlists->csb_write =
5156		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
5157
5158	if (INTEL_GEN(i915) < 11)
5159		execlists->csb_size = GEN8_CSB_ENTRIES;
5160	else
5161		execlists->csb_size = GEN11_CSB_ENTRIES;
5162
5163	if (INTEL_GEN(engine->i915) >= 11) {
5164		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5165		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5166	}
5167
5168	/* Finally, take ownership and responsibility for cleanup! */
5169	engine->sanitize = execlists_sanitize;
5170	engine->release = execlists_release;
5171
5172	return 0;
5173}
5174
5175static void init_common_reg_state(u32 * const regs,
5176				  const struct intel_engine_cs *engine,
5177				  const struct intel_ring *ring,
5178				  bool inhibit)
5179{
5180	u32 ctl;
5181
5182	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5183	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5184	if (inhibit)
5185		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5186	if (INTEL_GEN(engine->i915) < 11)
5187		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5188					   CTX_CTRL_RS_CTX_ENABLE);
5189	regs[CTX_CONTEXT_CONTROL] = ctl;
5190
5191	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5192	regs[CTX_TIMESTAMP] = 0;
5193}
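
/*
 * The CTX_CONTEXT_CONTROL value above relies on the masked-bit helpers:
 * _MASKED_BIT_ENABLE(x) expands to (x << 16) | x and _MASKED_BIT_DISABLE(x)
 * to (x << 16). For example, on pre-gen11 the combination above both sets
 * CTX_CTRL_INHIBIT_SYN_CTX_SWITCH and explicitly clears
 * CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT and CTX_CTRL_RS_CTX_ENABLE, with the
 * corresponding mask bits set so the HW applies each update.
 */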
5194
5195static void init_wa_bb_reg_state(u32 * const regs,
5196				 const struct intel_engine_cs *engine)
5197{
5198	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5199
5200	if (wa_ctx->per_ctx.size) {
5201		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5202
5203		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5204		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5205			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5206	}
5207
5208	if (wa_ctx->indirect_ctx.size) {
5209		lrc_ring_setup_indirect_ctx(regs, engine,
5210					    i915_ggtt_offset(wa_ctx->vma) +
5211					    wa_ctx->indirect_ctx.offset,
5212					    wa_ctx->indirect_ctx.size);
5213	}
5214}
5215
5216static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5217{
5218	if (i915_vm_is_4lvl(&ppgtt->vm)) {
5219		/* 64b PPGTT (48bit canonical):
5220		 * PDP0_DESCRIPTOR contains the base address of the PML4 and the
5221		 * other PDP descriptors are ignored.
5222		 */
5223		ASSIGN_CTX_PML4(ppgtt, regs);
5224	} else {
5225		ASSIGN_CTX_PDP(ppgtt, regs, 3);
5226		ASSIGN_CTX_PDP(ppgtt, regs, 2);
5227		ASSIGN_CTX_PDP(ppgtt, regs, 1);
5228		ASSIGN_CTX_PDP(ppgtt, regs, 0);
5229	}
5230}
5231
5232static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5233{
5234	if (i915_is_ggtt(vm))
5235		return i915_vm_to_ggtt(vm)->alias;
5236	else
5237		return i915_vm_to_ppgtt(vm);
5238}
5239
5240static void execlists_init_reg_state(u32 *regs,
5241				     const struct intel_context *ce,
5242				     const struct intel_engine_cs *engine,
5243				     const struct intel_ring *ring,
5244				     bool inhibit)
5245{
5246	/*
5247	 * A context is actually a big batch buffer with several
5248	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5249	 * values we are setting here are only for the first context restore:
5250	 * on a subsequent save, the GPU will recreate this batchbuffer with new
5251	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5252	 * we are not initializing here).
5253	 *
5254	 * Must keep consistent with virtual_update_register_offsets().
5255	 */
5256	set_offsets(regs, reg_offsets(engine), engine, inhibit);
5257
5258	init_common_reg_state(regs, engine, ring, inhibit);
5259	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5260
5261	init_wa_bb_reg_state(regs, engine);
5262
5263	__reset_stop_ring(regs, engine);
5264}
5265
5266static int
5267populate_lr_context(struct intel_context *ce,
5268		    struct drm_i915_gem_object *ctx_obj,
5269		    struct intel_engine_cs *engine,
5270		    struct intel_ring *ring)
5271{
5272	bool inhibit = true;
5273	void *vaddr;
5274
5275	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5276	if (IS_ERR(vaddr)) {
5277		drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5278		return PTR_ERR(vaddr);
5279	}
5280
5281	set_redzone(vaddr, engine);
5282
5283	if (engine->default_state) {
5284		shmem_read(engine->default_state, 0,
5285			   vaddr, engine->context_size);
5286		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
5287		inhibit = false;
5288	}
5289
5290	/* Clear the ppHWSP (inc. per-context counters) */
5291	memset(vaddr, 0, PAGE_SIZE);
5292
5293	/*
5294	 * The second page of the context object contains some registers which
5295	 * must be set up prior to the first execution.
5296	 */
5297	execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5298				 ce, engine, ring, inhibit);
5299
5300	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5301	i915_gem_object_unpin_map(ctx_obj);
5302	return 0;
5303}
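
/*
 * Rough layout of the context object populated above: page 0 is the
 * per-process HWSP (cleared just above, including the per-context
 * counters), while the register state written by execlists_init_reg_state()
 * lives at LRC_STATE_OFFSET, i.e. from the second page onwards, which is
 * also where __execlists_context_pin() derives ce->lrc_reg_state from.
 */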
5304
5305static int __execlists_context_alloc(struct intel_context *ce,
5306				     struct intel_engine_cs *engine)
5307{
5308	struct drm_i915_gem_object *ctx_obj;
5309	struct intel_ring *ring;
5310	struct i915_vma *vma;
5311	u32 context_size;
5312	int ret;
5313
5314	GEM_BUG_ON(ce->state);
5315	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5316
5317	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5318		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5319
5320	if (INTEL_GEN(engine->i915) == 12) {
5321		ce->wa_bb_page = context_size / PAGE_SIZE;
5322		context_size += PAGE_SIZE;
5323	}
5324
5325	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5326	if (IS_ERR(ctx_obj))
5327		return PTR_ERR(ctx_obj);
5328
5329	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5330	if (IS_ERR(vma)) {
5331		ret = PTR_ERR(vma);
5332		goto error_deref_obj;
5333	}
5334
5335	if (!ce->timeline) {
5336		struct intel_timeline *tl;
5337		struct i915_vma *hwsp;
5338
5339		/*
5340		 * Use the static global HWSP for the kernel context, and
5341		 * a dynamically allocated cacheline for everyone else.
5342		 */
5343		hwsp = NULL;
5344		if (unlikely(intel_context_is_barrier(ce)))
5345			hwsp = engine->status_page.vma;
5346
5347		tl = intel_timeline_create(engine->gt, hwsp);
5348		if (IS_ERR(tl)) {
5349			ret = PTR_ERR(tl);
5350			goto error_deref_obj;
5351		}
5352
5353		ce->timeline = tl;
5354	}
5355
5356	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5357	if (IS_ERR(ring)) {
5358		ret = PTR_ERR(ring);
5359		goto error_deref_obj;
5360	}
5361
5362	ret = populate_lr_context(ce, ctx_obj, engine, ring);
5363	if (ret) {
5364		drm_dbg(&engine->i915->drm,
5365			"Failed to populate LRC: %d\n", ret);
5366		goto error_ring_free;
5367	}
5368
5369	ce->ring = ring;
5370	ce->state = vma;
5371
5372	return 0;
5373
5374error_ring_free:
5375	intel_ring_put(ring);
5376error_deref_obj:
5377	i915_gem_object_put(ctx_obj);
5378	return ret;
5379}
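
/*
 * Sizing sketch with hypothetical numbers, following the arithmetic above:
 * if engine->context_size were 17 pages (0x11000), round_up() to
 * I915_GTT_PAGE_SIZE leaves it at 0x11000; CONFIG_DRM_I915_DEBUG_GEM adds a
 * redzone page (0x12000); on Gen12, ce->wa_bb_page then becomes
 * 0x12000 / PAGE_SIZE = 18 and the object grows to 0x13000 (19 pages).
 * The real context_size is reported by the engine setup code and varies
 * per platform.
 */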
5380
5381static struct list_head *virtual_queue(struct virtual_engine *ve)
5382{
5383	return &ve->base.execlists.default_priolist.requests[0];
5384}
5385
5386static void virtual_context_destroy(struct kref *kref)
5387{
5388	struct virtual_engine *ve =
5389		container_of(kref, typeof(*ve), context.ref);
5390	unsigned int n;
5391
5392	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5393	GEM_BUG_ON(ve->request);
5394	GEM_BUG_ON(ve->context.inflight);
5395
5396	for (n = 0; n < ve->num_siblings; n++) {
5397		struct intel_engine_cs *sibling = ve->siblings[n];
5398		struct rb_node *node = &ve->nodes[sibling->id].rb;
5399		unsigned long flags;
5400
5401		if (RB_EMPTY_NODE(node))
5402			continue;
5403
5404		spin_lock_irqsave(&sibling->active.lock, flags);
5405
5406		/* Detachment is lazily performed in the execlists tasklet */
5407		if (!RB_EMPTY_NODE(node))
5408			rb_erase_cached(node, &sibling->execlists.virtual);
5409
5410		spin_unlock_irqrestore(&sibling->active.lock, flags);
5411	}
5412	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5413
5414	if (ve->context.state)
5415		__execlists_context_fini(&ve->context);
5416	intel_context_fini(&ve->context);
5417
5418	intel_engine_free_request_pool(&ve->base);
5419
5420	kfree(ve->bonds);
5421	kfree(ve);
5422}
5423
5424static void virtual_engine_initial_hint(struct virtual_engine *ve)
5425{
5426	int swp;
5427
5428	/*
5429	 * Pick a random sibling when starting, to help spread the load around.
5430	 *
5431	 * New contexts are typically created with exactly the same order
5432	 * of siblings, and often started in batches. Due to the way we iterate
5433	 * the array of siblings when submitting requests, sibling[0] is
5434	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5435	 * randomised across the system, we also help spread the load, since
5436	 * the first engine we inspect differs each time.
5437	 *
5438	 * NB This does not force us to execute on this engine; it will just
5439	 * typically be the first we inspect for submission.
5440	 */
5441	swp = prandom_u32_max(ve->num_siblings);
5442	if (swp)
5443		swap(ve->siblings[swp], ve->siblings[0]);
5444}
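
/*
 * For example (hypothetical sibling set): with siblings {vcs0, vcs1, vcs2,
 * vcs3}, prandom_u32_max(4) returning 2 swaps vcs2 into slot 0, so this
 * virtual engine starts its submission scan at vcs2 while another instance
 * is likely to start elsewhere.
 */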
5445
5446static int virtual_context_alloc(struct intel_context *ce)
5447{
5448	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5449
5450	return __execlists_context_alloc(ce, ve->siblings[0]);
5451}
5452
5453static int virtual_context_pin(struct intel_context *ce)
5454{
5455	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5456
5457	/* Note: we must use a real engine class for setting up reg state */
5458	return __execlists_context_pin(ce, ve->siblings[0]);
5459}
5460
5461static void virtual_context_enter(struct intel_context *ce)
5462{
5463	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5464	unsigned int n;
5465
5466	for (n = 0; n < ve->num_siblings; n++)
5467		intel_engine_pm_get(ve->siblings[n]);
5468
5469	intel_timeline_enter(ce->timeline);
5470}
5471
5472static void virtual_context_exit(struct intel_context *ce)
5473{
5474	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5475	unsigned int n;
5476
5477	intel_timeline_exit(ce->timeline);
5478
5479	for (n = 0; n < ve->num_siblings; n++)
5480		intel_engine_pm_put(ve->siblings[n]);
5481}
5482
5483static const struct intel_context_ops virtual_context_ops = {
5484	.alloc = virtual_context_alloc,
5485
5486	.pin = virtual_context_pin,
5487	.unpin = execlists_context_unpin,
5488
5489	.enter = virtual_context_enter,
5490	.exit = virtual_context_exit,
5491
5492	.destroy = virtual_context_destroy,
5493};
5494
5495static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5496{
5497	struct i915_request *rq;
5498	intel_engine_mask_t mask;
5499
5500	rq = READ_ONCE(ve->request);
5501	if (!rq)
5502		return 0;
5503
5504	/* The rq is ready for submission; rq->execution_mask is now stable. */
5505	mask = rq->execution_mask;
5506	if (unlikely(!mask)) {
5507		/* Invalid selection: flag the error and fall back to the first sibling */
5508		i915_request_set_error_once(rq, -ENODEV);
5509		mask = ve->siblings[0]->mask;
5510	}
5511
5512	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5513		     rq->fence.context, rq->fence.seqno,
5514		     mask, ve->base.execlists.queue_priority_hint);
5515
5516	return mask;
5517}
5518
5519static void virtual_submission_tasklet(unsigned long data)
5520{
5521	struct virtual_engine * const ve = (struct virtual_engine *)data;
5522	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5523	intel_engine_mask_t mask;
5524	unsigned int n;
5525
5526	rcu_read_lock();
5527	mask = virtual_submission_mask(ve);
5528	rcu_read_unlock();
5529	if (unlikely(!mask))
5530		return;
5531
5532	local_irq_disable();
5533	for (n = 0; n < ve->num_siblings; n++) {
5534		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5535		struct ve_node * const node = &ve->nodes[sibling->id];
5536		struct rb_node **parent, *rb;
5537		bool first;
5538
5539		if (!READ_ONCE(ve->request))
5540			break; /* already handled by a sibling's tasklet */
5541
5542		if (unlikely(!(mask & sibling->mask))) {
5543			if (!RB_EMPTY_NODE(&node->rb)) {
5544				spin_lock(&sibling->active.lock);
5545				rb_erase_cached(&node->rb,
5546						&sibling->execlists.virtual);
5547				RB_CLEAR_NODE(&node->rb);
5548				spin_unlock(&sibling->active.lock);
5549			}
5550			continue;
5551		}
5552
5553		spin_lock(&sibling->active.lock);
5554
5555		if (!RB_EMPTY_NODE(&node->rb)) {
5556			/*
5557			 * Cheat and avoid rebalancing the tree if we can
5558			 * reuse this node in situ.
5559			 */
5560			first = rb_first_cached(&sibling->execlists.virtual) ==
5561				&node->rb;
5562			if (prio == node->prio || (prio > node->prio && first))
5563				goto submit_engine;
5564
5565			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5566		}
5567
5568		rb = NULL;
5569		first = true;
5570		parent = &sibling->execlists.virtual.rb_root.rb_node;
5571		while (*parent) {
5572			struct ve_node *other;
5573
5574			rb = *parent;
5575			other = rb_entry(rb, typeof(*other), rb);
5576			if (prio > other->prio) {
5577				parent = &rb->rb_left;
5578			} else {
5579				parent = &rb->rb_right;
5580				first = false;
5581			}
5582		}
5583
5584		rb_link_node(&node->rb, rb, parent);
5585		rb_insert_color_cached(&node->rb,
5586				       &sibling->execlists.virtual,
5587				       first);
5588
5589submit_engine:
5590		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5591		node->prio = prio;
5592		if (first && prio > sibling->execlists.queue_priority_hint)
5593			tasklet_hi_schedule(&sibling->execlists.tasklet);
5594
5595		spin_unlock(&sibling->active.lock);
5596	}
5597	local_irq_enable();
5598}
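
/*
 * Placement sketch for the tasklet above: each sibling keeps one ve_node per
 * virtual engine in an rb-tree ordered by priority (higher priorities sort
 * towards the left). The node is reused in place when the priority is
 * unchanged, or when it is already the leftmost entry and the new priority
 * is higher; otherwise it is erased and re-inserted at the position matching
 * the new priority. The sibling's tasklet is only kicked when the node ends
 * up at the front with a priority above that sibling's queue_priority_hint.
 */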
5599
5600static void virtual_submit_request(struct i915_request *rq)
5601{
5602	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5603	struct i915_request *old;
5604	unsigned long flags;
5605
5606	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5607		     rq->fence.context,
5608		     rq->fence.seqno);
5609
5610	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5611
5612	spin_lock_irqsave(&ve->base.active.lock, flags);
5613
5614	old = ve->request;
5615	if (old) { /* background completion event from preempt-to-busy */
5616		GEM_BUG_ON(!i915_request_completed(old));
5617		__i915_request_submit(old);
5618		i915_request_put(old);
5619	}
5620
5621	if (i915_request_completed(rq)) {
5622		__i915_request_submit(rq);
5623
5624		ve->base.execlists.queue_priority_hint = INT_MIN;
5625		ve->request = NULL;
5626	} else {
5627		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5628		ve->request = i915_request_get(rq);
5629
5630		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5631		list_move_tail(&rq->sched.link, virtual_queue(ve));
5632
5633		tasklet_hi_schedule(&ve->base.execlists.tasklet);
5634	}
5635
5636	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5637}
5638
5639static struct ve_bond *
5640virtual_find_bond(struct virtual_engine *ve,
5641		  const struct intel_engine_cs *master)
5642{
5643	int i;
5644
5645	for (i = 0; i < ve->num_bonds; i++) {
5646		if (ve->bonds[i].master == master)
5647			return &ve->bonds[i];
5648	}
5649
5650	return NULL;
5651}
5652
5653static void
5654virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5655{
5656	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5657	intel_engine_mask_t allowed, exec;
5658	struct ve_bond *bond;
5659
5660	allowed = ~to_request(signal)->engine->mask;
5661
5662	bond = virtual_find_bond(ve, to_request(signal)->engine);
5663	if (bond)
5664		allowed &= bond->sibling_mask;
5665
5666	/* Restrict the bonded request to run only on the available engines */
5667	exec = READ_ONCE(rq->execution_mask);
5668	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5669		;
5670
5671	/* Prevent the master from being re-run on the bonded engines */
5672	to_request(signal)->execution_mask &= ~allowed;
5673}
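
/*
 * Mask arithmetic sketch with hypothetical bit assignments: if the signaling
 * master ran on an engine whose mask is 0x04 and the bond recorded a
 * sibling_mask of 0x18, then allowed = ~0x04 & 0x18 = 0x18. The bonded
 * request's execution_mask is narrowed to 0x18 and the master's own
 * execution_mask drops those bits, so it will not be re-run on the bonded
 * engines.
 */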
5674
5675struct intel_context *
5676intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5677			       unsigned int count)
5678{
5679	struct virtual_engine *ve;
5680	unsigned int n;
5681	int err;
5682
5683	if (count == 0)
5684		return ERR_PTR(-EINVAL);
5685
5686	if (count == 1)
5687		return intel_context_create(siblings[0]);
5688
5689	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5690	if (!ve)
5691		return ERR_PTR(-ENOMEM);
5692
5693	ve->base.i915 = siblings[0]->i915;
5694	ve->base.gt = siblings[0]->gt;
5695	ve->base.uncore = siblings[0]->uncore;
5696	ve->base.id = -1;
5697
5698	ve->base.class = OTHER_CLASS;
5699	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5700	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5701	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5702
5703	/*
5704	 * The decision on whether to submit a request using semaphores
5705	 * depends on the saturated state of the engine. We only compute
5706	 * this during HW submission of the request, and we need this
5707	 * state to be globally applied to all requests being submitted
5708	 * to this engine. Virtual engines encompass more than one physical
5709	 * engine and so we cannot accurately tell in advance if one of those
5710	 * engines is already saturated and so cannot afford to use a semaphore
5711	 * and be pessimized in priority for doing so -- if we are the only
5712	 * context using semaphores after all other clients have stopped, we
5713	 * will be starved on the saturated system. Such a global switch for
5714	 * semaphores is less than ideal, but alas is the current compromise.
5715	 */
5716	ve->base.saturated = ALL_ENGINES;
5717
5718	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5719
5720	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5721	intel_engine_init_breadcrumbs(&ve->base);
5722	intel_engine_init_execlists(&ve->base);
5723	ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */
5724
5725	ve->base.cops = &virtual_context_ops;
5726	ve->base.request_alloc = execlists_request_alloc;
5727
5728	ve->base.schedule = i915_schedule;
5729	ve->base.submit_request = virtual_submit_request;
5730	ve->base.bond_execute = virtual_bond_execute;
5731
5732	INIT_LIST_HEAD(virtual_queue(ve));
5733	ve->base.execlists.queue_priority_hint = INT_MIN;
5734	tasklet_init(&ve->base.execlists.tasklet,
5735		     virtual_submission_tasklet,
5736		     (unsigned long)ve);
5737
5738	intel_context_init(&ve->context, &ve->base);
5739
5740	for (n = 0; n < count; n++) {
5741		struct intel_engine_cs *sibling = siblings[n];
5742
5743		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5744		if (sibling->mask & ve->base.mask) {
5745			DRM_DEBUG("duplicate %s entry in load balancer\n",
5746				  sibling->name);
5747			err = -EINVAL;
5748			goto err_put;
5749		}
5750
5751		/*
5752		 * The virtual engine implementation is tightly coupled to
5753		 * the execlists backend -- we push requests directly
5754		 * into a tree inside each physical engine. We could support
5755		 * layering if we handled cloning of the requests and
5756		 * submitted a copy into each backend.
5757		 */
5758		if (sibling->execlists.tasklet.func !=
5759		    execlists_submission_tasklet) {
5760			err = -ENODEV;
5761			goto err_put;
5762		}
5763
5764		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5765		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5766
5767		ve->siblings[ve->num_siblings++] = sibling;
5768		ve->base.mask |= sibling->mask;
5769
5770		/*
5771		 * All physical engines must be compatible for their emission
5772		 * functions (as we build the instructions during request
5773		 * construction and do not alter them before submission
5774		 * on the physical engine). We use the engine class as a guide
5775		 * here, although that could be refined.
5776		 */
5777		if (ve->base.class != OTHER_CLASS) {
5778			if (ve->base.class != sibling->class) {
5779				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5780					  sibling->class, ve->base.class);
5781				err = -EINVAL;
5782				goto err_put;
5783			}
5784			continue;
5785		}
5786
5787		ve->base.class = sibling->class;
5788		ve->base.uabi_class = sibling->uabi_class;
5789		snprintf(ve->base.name, sizeof(ve->base.name),
5790			 "v%dx%d", ve->base.class, count);
5791		ve->base.context_size = sibling->context_size;
5792
5793		ve->base.emit_bb_start = sibling->emit_bb_start;
5794		ve->base.emit_flush = sibling->emit_flush;
5795		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5796		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5797		ve->base.emit_fini_breadcrumb_dw =
5798			sibling->emit_fini_breadcrumb_dw;
5799
5800		ve->base.flags = sibling->flags;
5801	}
5802
5803	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5804
5805	virtual_engine_initial_hint(ve);
5806	return &ve->context;
5807
5808err_put:
5809	intel_context_put(&ve->context);
5810	return ERR_PTR(err);
5811}
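
/*
 * Usage sketch (illustrative only, error handling abbreviated): a caller
 * building a load-balancing context over two physical engines is expected
 * to pair the create with intel_context_put():
 *
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, 2);
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *	...
 *	intel_context_put(ce);
 */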
5812
5813struct intel_context *
5814intel_execlists_clone_virtual(struct intel_engine_cs *src)
5815{
5816	struct virtual_engine *se = to_virtual_engine(src);
5817	struct intel_context *dst;
5818
5819	dst = intel_execlists_create_virtual(se->siblings,
5820					     se->num_siblings);
5821	if (IS_ERR(dst))
5822		return dst;
5823
5824	if (se->num_bonds) {
5825		struct virtual_engine *de = to_virtual_engine(dst->engine);
5826
5827		de->bonds = kmemdup(se->bonds,
5828				    sizeof(*se->bonds) * se->num_bonds,
5829				    GFP_KERNEL);
5830		if (!de->bonds) {
5831			intel_context_put(dst);
5832			return ERR_PTR(-ENOMEM);
5833		}
5834
5835		de->num_bonds = se->num_bonds;
5836	}
5837
5838	return dst;
5839}
5840
5841int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5842				     const struct intel_engine_cs *master,
5843				     const struct intel_engine_cs *sibling)
5844{
5845	struct virtual_engine *ve = to_virtual_engine(engine);
5846	struct ve_bond *bond;
5847	int n;
5848
5849	/* Sanity check that the sibling is part of the virtual engine */
5850	for (n = 0; n < ve->num_siblings; n++)
5851		if (sibling == ve->siblings[n])
5852			break;
5853	if (n == ve->num_siblings)
5854		return -EINVAL;
5855
5856	bond = virtual_find_bond(ve, master);
5857	if (bond) {
5858		bond->sibling_mask |= sibling->mask;
5859		return 0;
5860	}
5861
5862	bond = krealloc(ve->bonds,
5863			sizeof(*bond) * (ve->num_bonds + 1),
5864			GFP_KERNEL);
5865	if (!bond)
5866		return -ENOMEM;
5867
5868	bond[ve->num_bonds].master = master;
5869	bond[ve->num_bonds].sibling_mask = sibling->mask;
5870
5871	ve->bonds = bond;
5872	ve->num_bonds++;
5873
5874	return 0;
5875}
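
/*
 * Bond setup sketch: a bond is typically attached once per allowed sibling
 * for a given master, and the masks accumulate into a single ve_bond entry.
 * The vcs1/vcs2 names below are placeholders for real siblings of the
 * virtual engine:
 *
 *	err = intel_virtual_engine_attach_bond(engine, master, vcs1);
 *	if (!err)
 *		err = intel_virtual_engine_attach_bond(engine, master, vcs2);
 *
 * after which virtual_find_bond(ve, master)->sibling_mask covers both vcs1
 * and vcs2.
 */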
5876
5877struct intel_engine_cs *
5878intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5879				 unsigned int sibling)
5880{
5881	struct virtual_engine *ve = to_virtual_engine(engine);
5882
5883	if (sibling >= ve->num_siblings)
5884		return NULL;
5885
5886	return ve->siblings[sibling];
5887}
5888
5889void intel_execlists_show_requests(struct intel_engine_cs *engine,
5890				   struct drm_printer *m,
5891				   void (*show_request)(struct drm_printer *m,
5892							struct i915_request *rq,
5893							const char *prefix),
5894				   unsigned int max)
5895{
5896	const struct intel_engine_execlists *execlists = &engine->execlists;
5897	struct i915_request *rq, *last;
5898	unsigned long flags;
5899	unsigned int count;
5900	struct rb_node *rb;
5901
5902	spin_lock_irqsave(&engine->active.lock, flags);
5903
5904	last = NULL;
5905	count = 0;
5906	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5907		if (count++ < max - 1)
5908			show_request(m, rq, "\t\tE ");
5909		else
5910			last = rq;
5911	}
5912	if (last) {
5913		if (count > max) {
5914			drm_printf(m,
5915				   "\t\t...skipping %d executing requests...\n",
5916				   count - max);
5917		}
5918		show_request(m, last, "\t\tE ");
5919	}
5920
5921	if (execlists->switch_priority_hint != INT_MIN)
5922		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5923			   READ_ONCE(execlists->switch_priority_hint));
5924	if (execlists->queue_priority_hint != INT_MIN)
5925		drm_printf(m, "\t\tQueue priority hint: %d\n",
5926			   READ_ONCE(execlists->queue_priority_hint));
5927
5928	last = NULL;
5929	count = 0;
5930	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5931		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5932		int i;
5933
5934		priolist_for_each_request(rq, p, i) {
5935			if (count++ < max - 1)
5936				show_request(m, rq, "\t\tQ ");
5937			else
5938				last = rq;
5939		}
5940	}
5941	if (last) {
5942		if (count > max) {
5943			drm_printf(m,
5944				   "\t\t...skipping %d queued requests...\n",
5945				   count - max);
5946		}
5947		show_request(m, last, "\t\tQ ");
5948	}
5949
5950	last = NULL;
5951	count = 0;
5952	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5953		struct virtual_engine *ve =
5954			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5955		struct i915_request *rq = READ_ONCE(ve->request);
5956
5957		if (rq) {
5958			if (count++ < max - 1)
5959				show_request(m, rq, "\t\tV ");
5960			else
5961				last = rq;
5962		}
5963	}
5964	if (last) {
5965		if (count > max) {
5966			drm_printf(m,
5967				   "\t\t...skipping %d virtual requests...\n",
5968				   count - max);
5969		}
5970		show_request(m, last, "\t\tV ");
5971	}
5972
5973	spin_unlock_irqrestore(&engine->active.lock, flags);
5974}
5975
5976void intel_lr_context_reset(struct intel_engine_cs *engine,
5977			    struct intel_context *ce,
5978			    u32 head,
5979			    bool scrub)
5980{
5981	GEM_BUG_ON(!intel_context_is_pinned(ce));
5982
5983	/*
5984	 * We want a simple context + ring to execute the breadcrumb update.
5985	 * We cannot rely on the context being intact across the GPU hang,
5986	 * so clear it and rebuild just what we need for the breadcrumb.
5987	 * All pending requests for this context will be zapped, and any
5988	 * future request will be submitted after userspace has had the opportunity
5989	 * to recreate its own state.
5990	 */
5991	if (scrub)
5992		restore_default_state(ce, engine);
5993
5994	/* Rerun the request; its payload has been neutered (if guilty). */
5995	__execlists_update_reg_state(ce, engine, head);
5996}
5997
5998bool
5999intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6000{
6001	return engine->set_default_submission ==
6002	       intel_execlists_set_default_submission;
6003}
6004
6005#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6006#include "selftest_lrc.c"
6007#endif