1/*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Ben Widawsky <ben@bwidawsk.net>
25 * Michel Thierry <michel.thierry@intel.com>
26 * Thomas Daniel <thomas.daniel@intel.com>
27 * Oscar Mateo <oscar.mateo@intel.com>
28 *
29 */
30
31/**
32 * DOC: Logical Rings, Logical Ring Contexts and Execlists
33 *
34 * Motivation:
35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36 * These expanded contexts enable a number of new abilities, especially
37 * "Execlists" (also implemented in this file).
38 *
39 * One of the main differences with the legacy HW contexts is that logical
40 * ring contexts incorporate many more things into the context's state, like
41 * PDPs or ringbuffer control registers:
42 *
43 * The reason why PDPs are included in the context is straightforward: as
44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45 * contained there means you don't need to do a ppgtt->switch_mm yourself;
46 * instead, the GPU will do it for you on the context switch.
47 *
48 * But what about the ringbuffer control registers (head, tail, etc.)?
49 * Shouldn't we just need a set of those per engine command streamer? This is
50 * where the name "Logical Rings" starts to make sense: by virtualizing the
51 * rings, the engine cs shifts to a new "ring buffer" with every context
52 * switch. When you want to submit a workload to the GPU you: A) choose your
53 * context, B) find its appropriate virtualized ring, C) write commands to it
54 * and then, finally, D) tell the GPU to switch to that context.
55 *
56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57 * to a context is via a context execution list, ergo "Execlists".
58 *
59 * LRC implementation:
60 * Regarding the creation of contexts, we have:
61 *
62 * - One global default context.
63 * - One local default context for each opened fd.
64 * - One local extra context for each context create ioctl call.
65 *
66 * Now that ringbuffers belong per-context (and not per-engine, like before)
67 * and that contexts are uniquely tied to a given engine (and not reusable,
68 * like before) we need:
69 *
70 * - One ringbuffer per-engine inside each context.
71 * - One backing object per-engine inside each context.
72 *
73 * The global default context starts its life with these new objects fully
74 * allocated and populated. The local default context for each opened fd is
75 * more complex, because we don't know at creation time which engine is going
76 * to use them. To handle this, we have implemented a deferred creation of LR
77 * contexts:
78 *
79 * The local context starts its life as a hollow or blank holder, that only
80 * gets populated for a given engine once we receive an execbuffer. If later
81 * on we receive another execbuffer ioctl for the same context but a different
82 * engine, we allocate/populate a new ringbuffer and context backing object and
83 * so on.
84 *
85 * Finally, regarding local contexts created using the ioctl call: as they are
86 * only allowed with the render ring, we can allocate & populate them right
87 * away (no need to defer anything, at least for now).
88 *
89 * Execlists implementation:
90 * Execlists are the new method by which, on gen8+ hardware, workloads are
91 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 * This method works as follows:
93 *
94 * When a request is committed, its commands (the BB start and any leading or
95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96 * for the appropriate context. The tail pointer in the hardware context is not
97 * updated at this time but is instead kept by the driver in the ringbuffer
98 * structure. A structure representing this request is added to a request queue
99 * for the appropriate engine: this structure contains a copy of the context's
100 * tail after the request was written to the ring buffer and a pointer to the
101 * context itself.
102 *
103 * If the engine's request queue was empty before the request was added, the
104 * queue is processed immediately. Otherwise the queue will be processed during
105 * a context switch interrupt. In any case, elements on the queue will get sent
106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107 * globally unique 20-bit submission ID.
108 *
109 * When execution of a request completes, the GPU updates the context status
110 * buffer with a context complete event and generates a context switch interrupt.
111 * During the interrupt handling, the driver examines the events in the buffer:
112 * for each context complete event, if the announced ID matches that on the head
113 * of the request queue, then that request is retired and removed from the queue.
114 *
115 * After processing, if any requests were retired and the queue is not empty
116 * then a new execution list can be submitted. The two requests at the front of
117 * the queue are next to be submitted but since a context may not occur twice in
118 * an execution list, if subsequent requests have the same ID as the first then
119 * the two requests must be combined. This is done simply by discarding requests
120 * at the head of the queue until either only one request is left (in which case
121 * we use a NULL second context) or the first two requests have unique IDs.
122 *
123 * By always executing the first two requests in the queue the driver ensures
124 * that the GPU is kept as busy as possible. In the case where a single context
125 * completes but a second context is still executing, the request for this second
126 * context will be at the head of the queue when we remove the first one. This
127 * request will then be resubmitted along with a new request for a different context,
128 * which will cause the hardware to continue executing the second request and queue
129 * the new request (the GPU detects the condition of a context getting preempted
130 * with the same context and optimizes the context switch flow by not doing
131 * preemption, but just sampling the new tail pointer).
132 *
133 */
134#include <linux/interrupt.h>
135
136#include "gem/i915_gem_context.h"
137
138#include "i915_drv.h"
139#include "i915_perf.h"
140#include "i915_trace.h"
141#include "i915_vgpu.h"
142#include "intel_engine_pm.h"
143#include "intel_gt.h"
144#include "intel_gt_pm.h"
145#include "intel_lrc_reg.h"
146#include "intel_mocs.h"
147#include "intel_reset.h"
148#include "intel_workarounds.h"
149
150#define RING_EXECLIST_QFULL (1 << 0x2)
151#define RING_EXECLIST1_VALID (1 << 0x3)
152#define RING_EXECLIST0_VALID (1 << 0x4)
153#define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
154#define RING_EXECLIST1_ACTIVE (1 << 0x11)
155#define RING_EXECLIST0_ACTIVE (1 << 0x12)
156
157#define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
158#define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
159#define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
160#define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
161#define GEN8_CTX_STATUS_COMPLETE (1 << 4)
162#define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
163
164#define GEN8_CTX_STATUS_COMPLETED_MASK \
165 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
166
167#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
168
169#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
170#define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
171#define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
172#define GEN12_IDLE_CTX_ID 0x7FF
173#define GEN12_CSB_CTX_VALID(csb_dw) \
174 (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
175
176/* Typical size of the average request (2 pipecontrols and a MI_BB) */
177#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
178#define WA_TAIL_DWORDS 2
179#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
180
181struct virtual_engine {
182 struct intel_engine_cs base;
183 struct intel_context context;
184
185 /*
186 * We allow only a single request through the virtual engine at a time
187 * (each request in the timeline waits for the completion fence of
188 * the previous before being submitted). By restricting ourselves to
189 * only submitting a single request, each request is placed on to a
190 * physical engine to maximise load spreading (by virtue of the late greedy
191 * scheduling -- each real engine takes the next available request
192 * upon idling).
193 */
194 struct i915_request *request;
195
196 /*
197 * We keep a rbtree of available virtual engines inside each physical
198 * engine, sorted by priority. Here we preallocate the nodes we need
199 * for the virtual engine, indexed by physical_engine->id.
200 */
201 struct ve_node {
202 struct rb_node rb;
203 int prio;
204 } nodes[I915_NUM_ENGINES];
205
206 /*
207 * Keep track of bonded pairs -- restrictions upon our selection
208 * of physical engines any particular request may be submitted to.
209 * If we receive a submit-fence from a master engine, we will only
210 * use one of sibling_mask physical engines.
211 */
212 struct ve_bond {
213 const struct intel_engine_cs *master;
214 intel_engine_mask_t sibling_mask;
215 } *bonds;
216 unsigned int num_bonds;
217
218 /* And finally, which physical engines this virtual engine maps onto. */
219 unsigned int num_siblings;
220 struct intel_engine_cs *siblings[0];
221};
222
223static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
224{
225 GEM_BUG_ON(!intel_engine_is_virtual(engine));
226 return container_of(engine, struct virtual_engine, base);
227}
228
229static int __execlists_context_alloc(struct intel_context *ce,
230 struct intel_engine_cs *engine);
231
232static void execlists_init_reg_state(u32 *reg_state,
233 struct intel_context *ce,
234 struct intel_engine_cs *engine,
235 struct intel_ring *ring);
236
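/*
 * Cancel an incomplete request: set -EIO on its fence (unless it has
 * already been signaled) and then mark the request as complete.
 */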
237static void mark_eio(struct i915_request *rq)
238{
239 if (!i915_request_signaled(rq))
240 dma_fence_set_error(&rq->fence, -EIO);
241 i915_request_mark_complete(rq);
242}
243
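/*
 * GGTT address of the HWS_PREEMPT dword in this engine's status page,
 * i.e. the semaphore busywaited on by the fini breadcrumb while the
 * ring is paused (see ring_set_paused() below).
 */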
244static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
245{
246 return (i915_ggtt_offset(engine->status_page.vma) +
247 I915_GEM_HWS_PREEMPT_ADDR);
248}
249
250static inline void
251ring_set_paused(const struct intel_engine_cs *engine, int state)
252{
253 /*
254 * We inspect HWS_PREEMPT with a semaphore inside
255 * engine->emit_fini_breadcrumb. If the dword is true,
256 * the ring is paused as the semaphore will busywait
257 * until the dword is false.
258 */
259 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
260 if (state)
261 wmb();
262}
263
264static inline struct i915_priolist *to_priolist(struct rb_node *rb)
265{
266 return rb_entry(rb, struct i915_priolist, node);
267}
268
269static inline int rq_prio(const struct i915_request *rq)
270{
271 return rq->sched.attr.priority;
272}
273
274static int effective_prio(const struct i915_request *rq)
275{
276 int prio = rq_prio(rq);
277
278 /*
279 * If this request is special and must not be interrupted at any
280 * cost, so be it. Note we are only checking the most recent request
281 * in the context and so may be masking an earlier vip request. It
282 * is hoped that under the conditions where nopreempt is used, this
283 * will not matter (i.e. all requests to that context will be
284 * nopreempt for as long as desired).
285 */
286 if (i915_request_has_nopreempt(rq))
287 prio = I915_PRIORITY_UNPREEMPTABLE;
288
289 /*
290 * On unwinding the active request, we give it a priority bump
291 * if it has completed waiting on any semaphore. If we know that
292 * the request has already started, we can prevent an unwanted
293 * preempt-to-idle cycle by taking that into account now.
294 */
295 if (__i915_request_has_started(rq))
296 prio |= I915_PRIORITY_NOSEMAPHORE;
297
298 /* Restrict mere WAIT boosts from triggering preemption */
299 BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
300 return prio | __NO_PREEMPTION;
301}
302
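/*
 * Report the priority of the highest priority request still waiting in
 * the execlists queue (not yet submitted to the HW), or INT_MIN if the
 * queue is empty.
 */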
303static int queue_prio(const struct intel_engine_execlists *execlists)
304{
305 struct i915_priolist *p;
306 struct rb_node *rb;
307
308 rb = rb_first_cached(&execlists->queue);
309 if (!rb)
310 return INT_MIN;
311
312 /*
313 * As the priolist[] are inverted, with the highest priority in [0],
314 * we have to flip the index value to become priority.
315 */
316 p = to_priolist(rb);
317 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
318}
319
320static inline bool need_preempt(const struct intel_engine_cs *engine,
321 const struct i915_request *rq,
322 struct rb_node *rb)
323{
324 int last_prio;
325
326 if (!intel_engine_has_semaphores(engine))
327 return false;
328
329 /*
330 * Check if the current priority hint merits a preemption attempt.
331 *
332 * We record the highest value priority we saw during rescheduling
333 * prior to this dequeue, therefore we know that if it is strictly
334 * less than the current tail of ELSP[0], we do not need to force
335 * a preempt-to-idle cycle.
336 *
337 * However, the priority hint is a mere hint that we may need to
338 * preempt. If that hint is stale or we may be trying to preempt
339 * ourselves, ignore the request.
340 */
341 last_prio = effective_prio(rq);
342 if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
343 last_prio))
344 return false;
345
346 /*
347 * Check against the first request in ELSP[1], it will, thanks to the
348 * power of PI, be the highest priority of that context.
349 */
350 if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
351 rq_prio(list_next_entry(rq, sched.link)) > last_prio)
352 return true;
353
354 if (rb) {
355 struct virtual_engine *ve =
356 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
357 bool preempt = false;
358
359 if (engine == ve->siblings[0]) { /* only preempt one sibling */
360 struct i915_request *next;
361
362 rcu_read_lock();
363 next = READ_ONCE(ve->request);
364 if (next)
365 preempt = rq_prio(next) > last_prio;
366 rcu_read_unlock();
367 }
368
369 if (preempt)
370 return preempt;
371 }
372
373 /*
374 * If the inflight context did not trigger the preemption, then maybe
375 * it was the set of queued requests? Pick the highest priority in
376 * the queue (the first active priolist) and see if it deserves to be
377 * running instead of ELSP[0].
378 *
379 * The highest priority request in the queue cannot be either
380 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
381 * context, its priority would not exceed ELSP[0] aka last_prio.
382 */
383 return queue_prio(&engine->execlists) > last_prio;
384}
385
386__maybe_unused static inline bool
387assert_priority_queue(const struct i915_request *prev,
388 const struct i915_request *next)
389{
390 /*
391 * Without preemption, the prev may refer to the still active element
392 * which we refuse to let go.
393 *
394 * Even with preemption, there are times when we think it is better not
395 * to preempt and leave an ostensibly lower priority request in flight.
396 */
397 if (i915_request_is_active(prev))
398 return true;
399
400 return rq_prio(prev) >= rq_prio(next);
401}
402
403/*
404 * The context descriptor encodes various attributes of a context,
405 * including its GTT address and some flags. Because it's fairly
406 * expensive to calculate, we'll just do it once and cache the result,
407 * which remains valid until the context is unpinned.
408 *
409 * This is what a descriptor looks like, from LSB to MSB::
410 *
411 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
412 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
413 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
414 * bits 53-54: mbz, reserved for use by hardware
415 * bits 55-63: group ID, currently unused and set to 0
416 *
417 * Starting from Gen11, the upper dword of the descriptor has a new format:
418 *
419 * bits 32-36: reserved
420 * bits 37-47: SW context ID
421 * bits 48:53: engine instance
422 * bit 54: mbz, reserved for use by hardware
423 * bits 55-60: SW counter
424 * bits 61-63: engine class
425 *
426 * engine info, SW context ID and SW counter need to form a unique number
427 * (Context ID) per lrc.
428 */
429static u64
430lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
431{
432 struct i915_gem_context *ctx = ce->gem_context;
433 u64 desc;
434
435 BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
436 BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));
437
438 desc = INTEL_LEGACY_32B_CONTEXT;
439 if (i915_vm_is_4lvl(ce->vm))
440 desc = INTEL_LEGACY_64B_CONTEXT;
441 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
442
443 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
444 if (IS_GEN(engine->i915, 8))
445 desc |= GEN8_CTX_L3LLC_COHERENT;
446
447 desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
448 /* bits 12-31 */
449 /*
450 * The following 32bits are copied into the OA reports (dword 2).
451 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
452 * anything below.
453 */
454 if (INTEL_GEN(engine->i915) >= 11) {
455 GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
456 desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
457 /* bits 37-47 */
458
459 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
460 /* bits 48-53 */
461
462 /* TODO: decide what to do with SW counter (bits 55-60) */
463
464 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
465 /* bits 61-63 */
466 } else {
467 GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
468 desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT; /* bits 32-52 */
469 }
470
471 return desc;
472}
473
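/*
 * When unsubmitting a request, rewind its tail so that it no longer
 * includes the WaIdleLiteRestore padding placed after the breadcrumb
 * by gen8_emit_fini_breadcrumb().
 */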
474static void unwind_wa_tail(struct i915_request *rq)
475{
476 rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
477 assert_ring_tail_valid(rq->ring, rq->tail);
478}
479
480static struct i915_request *
481__unwind_incomplete_requests(struct intel_engine_cs *engine)
482{
483 struct i915_request *rq, *rn, *active = NULL;
484 struct list_head *uninitialized_var(pl);
485 int prio = I915_PRIORITY_INVALID;
486
487 lockdep_assert_held(&engine->active.lock);
488
489 list_for_each_entry_safe_reverse(rq, rn,
490 &engine->active.requests,
491 sched.link) {
492 struct intel_engine_cs *owner;
493
494 if (i915_request_completed(rq))
495 continue; /* XXX */
496
497 __i915_request_unsubmit(rq);
498 unwind_wa_tail(rq);
499
500 /*
501 * Push the request back into the queue for later resubmission.
502 * If this request is not native to this physical engine (i.e.
503 * it came from a virtual source), push it back onto the virtual
504 * engine so that it can be moved across onto another physical
505 * engine as load dictates.
506 */
507 owner = rq->hw_context->engine;
508 if (likely(owner == engine)) {
509 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
510 if (rq_prio(rq) != prio) {
511 prio = rq_prio(rq);
512 pl = i915_sched_lookup_priolist(engine, prio);
513 }
514 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
515
516 list_move(&rq->sched.link, pl);
517 active = rq;
518 } else {
519 /*
520 * Decouple the virtual breadcrumb before moving it
521 * back to the virtual engine -- we don't want the
522 * request to complete in the background and try
523 * and cancel the breadcrumb on the virtual engine
524 * (instead of the old engine where it is linked)!
525 */
526 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
527 &rq->fence.flags)) {
528 spin_lock(&rq->lock);
529 i915_request_cancel_breadcrumb(rq);
530 spin_unlock(&rq->lock);
531 }
532 rq->engine = owner;
533 owner->submit_request(rq);
534 active = NULL;
535 }
536 }
537
538 return active;
539}
540
541struct i915_request *
542execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
543{
544 struct intel_engine_cs *engine =
545 container_of(execlists, typeof(*engine), execlists);
546
547 return __unwind_incomplete_requests(engine);
548}
549
550static inline void
551execlists_context_status_change(struct i915_request *rq, unsigned long status)
552{
553 /*
554 * Only used when GVT-g is enabled now. When GVT-g is disabled,
555 * the compiler should eliminate this function as dead code.
556 */
557 if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
558 return;
559
560 atomic_notifier_call_chain(&rq->engine->context_status_notifier,
561 status, rq);
562}
563
564static inline struct intel_engine_cs *
565__execlists_schedule_in(struct i915_request *rq)
566{
567 struct intel_engine_cs * const engine = rq->engine;
568 struct intel_context * const ce = rq->hw_context;
569
570 intel_context_get(ce);
571
572 intel_gt_pm_get(engine->gt);
573 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
574 intel_engine_context_in(engine);
575
576 return engine;
577}
578
579static inline struct i915_request *
580execlists_schedule_in(struct i915_request *rq, int idx)
581{
582 struct intel_context * const ce = rq->hw_context;
583 struct intel_engine_cs *old;
584
585 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
586 trace_i915_request_in(rq, idx);
587
588 old = READ_ONCE(ce->inflight);
589 do {
590 if (!old) {
591 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
592 break;
593 }
594 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
595
596 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
597 return i915_request_get(rq);
598}
599
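/*
 * If the virtual engine has another request waiting that may run on a
 * different physical engine, kick the virtual tasklet so that request
 * can be resubmitted promptly to a suitable sibling.
 */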
600static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
601{
602 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
603 struct i915_request *next = READ_ONCE(ve->request);
604
605 if (next && next->execution_mask & ~rq->execution_mask)
606 tasklet_schedule(&ve->base.execlists.tasklet);
607}
608
609static inline void
610__execlists_schedule_out(struct i915_request *rq,
611 struct intel_engine_cs * const engine)
612{
613 struct intel_context * const ce = rq->hw_context;
614
615 intel_engine_context_out(engine);
616 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
617 intel_gt_pm_put(engine->gt);
618
619 /*
620 * If this is part of a virtual engine, its next request may
621 * have been blocked waiting for access to the active context.
622 * We have to kick all the siblings again in case we need to
623 * switch (e.g. the next request is not runnable on this
624 * engine). Hopefully, we will already have submitted the next
625 * request before the tasklet runs and do not need to rebuild
626 * each virtual tree and kick everyone again.
627 */
628 if (ce->engine != engine)
629 kick_siblings(rq, ce);
630
631 intel_context_put(ce);
632}
633
634static inline void
635execlists_schedule_out(struct i915_request *rq)
636{
637 struct intel_context * const ce = rq->hw_context;
638 struct intel_engine_cs *cur, *old;
639
640 trace_i915_request_out(rq);
641
642 old = READ_ONCE(ce->inflight);
643 do
644 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
645 while (!try_cmpxchg(&ce->inflight, &old, cur));
646 if (!cur)
647 __execlists_schedule_out(rq, old);
648
649 i915_request_put(rq);
650}
651
652static u64 execlists_update_context(const struct i915_request *rq)
653{
654 struct intel_context *ce = rq->hw_context;
655 u64 desc;
656
657 ce->lrc_reg_state[CTX_RING_TAIL + 1] =
658 intel_ring_set_tail(rq->ring, rq->tail);
659
660 /*
661 * Make sure the context image is complete before we submit it to HW.
662 *
663 * Ostensibly, writes (including the WCB) should be flushed prior to
664 * an uncached write such as our mmio register access, but the empirical
665 * evidence (esp. on Braswell) suggests that the WC write into memory
666 * may not be visible to the HW prior to the completion of the UC
667 * register write and that we may begin execution from the context
668 * before its image is complete leading to invalid PD chasing.
669 *
670 * Furthermore, Braswell, at least, wants a full mb to be sure that
671 * the writes are coherent in memory (visible to the GPU) prior to
672 * execution, and not just visible to other CPUs (as is the result of
673 * wmb).
674 */
675 mb();
676
677 desc = ce->lrc_desc;
678 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
679
680 return desc;
681}
682
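/*
 * Write one context descriptor to a submission port: via the ELSQ
 * submit queue when a control register is present, otherwise directly
 * to the ELSP register, upper dword first.
 */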
683static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
684{
685 if (execlists->ctrl_reg) {
686 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
687 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
688 } else {
689 writel(upper_32_bits(desc), execlists->submit_reg);
690 writel(lower_32_bits(desc), execlists->submit_reg);
691 }
692}
693
694static __maybe_unused void
695trace_ports(const struct intel_engine_execlists *execlists,
696 const char *msg,
697 struct i915_request * const *ports)
698{
699 const struct intel_engine_cs *engine =
700 container_of(execlists, typeof(*engine), execlists);
701
702 GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n",
703 engine->name, msg,
704 ports[0]->fence.context,
705 ports[0]->fence.seqno,
706 i915_request_completed(ports[0]) ? "!" :
707 i915_request_started(ports[0]) ? "*" :
708 "",
709 ports[1] ? ports[1]->fence.context : 0,
710 ports[1] ? ports[1]->fence.seqno : 0);
711}
712
713static __maybe_unused bool
714assert_pending_valid(const struct intel_engine_execlists *execlists,
715 const char *msg)
716{
717 struct i915_request * const *port, *rq;
718 struct intel_context *ce = NULL;
719
720 trace_ports(execlists, msg, execlists->pending);
721
722 if (!execlists->pending[0])
723 return false;
724
725 if (execlists->pending[execlists_num_ports(execlists)])
726 return false;
727
728 for (port = execlists->pending; (rq = *port); port++) {
729 if (ce == rq->hw_context)
730 return false;
731
732 ce = rq->hw_context;
733 if (i915_request_completed(rq))
734 continue;
735
736 if (i915_active_is_idle(&ce->active))
737 return false;
738
739 if (!i915_vma_is_pinned(ce->state))
740 return false;
741 }
742
743 return ce;
744}
745
746static void execlists_submit_ports(struct intel_engine_cs *engine)
747{
748 struct intel_engine_execlists *execlists = &engine->execlists;
749 unsigned int n;
750
751 GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
752
753 /*
754 * We can skip acquiring intel_runtime_pm_get() here as it was taken
755 * on our behalf by the request (see i915_gem_mark_busy()) and it will
756 * not be relinquished until the device is idle (see
757 * i915_gem_idle_work_handler()). As a precaution, we make sure
758 * that all ELSP are drained i.e. we have processed the CSB,
759 * before allowing ourselves to idle and calling intel_runtime_pm_put().
760 */
761 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
762
763 /*
764 * ELSQ note: the submit queue is not cleared after being submitted
765 * to the HW so we need to make sure we always clean it up. This is
766 * currently ensured by the fact that we always write the same number
767 * of elsq entries, keep this in mind before changing the loop below.
768 */
769 for (n = execlists_num_ports(execlists); n--; ) {
770 struct i915_request *rq = execlists->pending[n];
771
772 write_desc(execlists,
773 rq ? execlists_update_context(rq) : 0,
774 n);
775 }
776
777 /* we need to manually load the submit queue */
778 if (execlists->ctrl_reg)
779 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
780}
781
782static bool ctx_single_port_submission(const struct intel_context *ce)
783{
784 return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
785 i915_gem_context_force_single_submission(ce->gem_context));
786}
787
788static bool can_merge_ctx(const struct intel_context *prev,
789 const struct intel_context *next)
790{
791 if (prev != next)
792 return false;
793
794 if (ctx_single_port_submission(prev))
795 return false;
796
797 return true;
798}
799
800static bool can_merge_rq(const struct i915_request *prev,
801 const struct i915_request *next)
802{
803 GEM_BUG_ON(prev == next);
804 GEM_BUG_ON(!assert_priority_queue(prev, next));
805
806 /*
807 * We do not submit known completed requests. Therefore if the next
808 * request is already completed, we can pretend to merge it in
809 * with the previous context (and we will skip updating the ELSP
810 * and tracking). Thus hopefully keeping the ELSP full with active
811 * contexts, despite the best efforts of preempt-to-busy to confuse
812 * us.
813 */
814 if (i915_request_completed(next))
815 return true;
816
817 if (!can_merge_ctx(prev->hw_context, next->hw_context))
818 return false;
819
820 return true;
821}
822
823static void virtual_update_register_offsets(u32 *regs,
824 struct intel_engine_cs *engine)
825{
826 u32 base = engine->mmio_base;
827
828 /* Must match execlists_init_reg_state()! */
829
830 regs[CTX_CONTEXT_CONTROL] =
831 i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base));
832 regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
833 regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
834 regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
835 regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
836
837 regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
838 regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
839 regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
840 regs[CTX_SECOND_BB_HEAD_U] =
841 i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
842 regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
843 regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
844
845 regs[CTX_CTX_TIMESTAMP] =
846 i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
847 regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3));
848 regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3));
849 regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2));
850 regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2));
851 regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1));
852 regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1));
853 regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
854 regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0));
855
856 if (engine->class == RENDER_CLASS) {
857 regs[CTX_RCS_INDIRECT_CTX] =
858 i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
859 regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
860 i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
861 regs[CTX_BB_PER_CTX_PTR] =
862 i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
863
864 regs[CTX_R_PWR_CLK_STATE] =
865 i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
866 }
867}
868
869static bool virtual_matches(const struct virtual_engine *ve,
870 const struct i915_request *rq,
871 const struct intel_engine_cs *engine)
872{
873 const struct intel_engine_cs *inflight;
874
875 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
876 return false;
877
878 /*
879 * We track when the HW has completed saving the context image
880 * (i.e. when we have seen the final CS event switching out of
881 * the context) and must not overwrite the context image before
882 * then. This restricts us to only using the active engine
883 * while the previous virtualized request is inflight (so
884 * we reuse the register offsets). This is a very small
885 * hysteresis on the greedy selection algorithm.
886 */
887 inflight = intel_context_inflight(&ve->context);
888 if (inflight && inflight != engine)
889 return false;
890
891 return true;
892}
893
894static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
895 struct intel_engine_cs *engine)
896{
897 struct intel_engine_cs *old = ve->siblings[0];
898
899 /* All unattached (rq->engine == old) must already be completed */
900
901 spin_lock(&old->breadcrumbs.irq_lock);
902 if (!list_empty(&ve->context.signal_link)) {
903 list_move_tail(&ve->context.signal_link,
904 &engine->breadcrumbs.signalers);
905 intel_engine_queue_breadcrumbs(engine);
906 }
907 spin_unlock(&old->breadcrumbs.irq_lock);
908}
909
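/*
 * Peek at the oldest request still executing on the HW, skipping over
 * any entries in the active[] ports that have already completed.
 */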
910static struct i915_request *
911last_active(const struct intel_engine_execlists *execlists)
912{
913 struct i915_request * const *last = READ_ONCE(execlists->active);
914
915 while (*last && i915_request_completed(*last))
916 last++;
917
918 return *last;
919}
920
921static void defer_request(struct i915_request *rq, struct list_head * const pl)
922{
923 LIST_HEAD(list);
924
925 /*
926 * We want to move the interrupted request to the back of
927 * the round-robin list (i.e. its priority level), but
928 * in doing so, we must then move all requests that were in
929 * flight and were waiting for the interrupted request to
930 * be run after it again.
931 */
932 do {
933 struct i915_dependency *p;
934
935 GEM_BUG_ON(i915_request_is_active(rq));
936 list_move_tail(&rq->sched.link, pl);
937
938 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
939 struct i915_request *w =
940 container_of(p->waiter, typeof(*w), sched);
941
942 /* Leave semaphores spinning on the other engines */
943 if (w->engine != rq->engine)
944 continue;
945
946 /* No waiter should start before its signaler */
947 GEM_BUG_ON(i915_request_started(w) &&
948 !i915_request_completed(rq));
949
950 GEM_BUG_ON(i915_request_is_active(w));
951 if (list_empty(&w->sched.link))
952 continue; /* Not yet submitted; unready */
953
954 if (rq_prio(w) < rq_prio(rq))
955 continue;
956
957 GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
958 list_move_tail(&w->sched.link, &list);
959 }
960
961 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
962 } while (rq);
963}
964
965static void defer_active(struct intel_engine_cs *engine)
966{
967 struct i915_request *rq;
968
969 rq = __unwind_incomplete_requests(engine);
970 if (!rq)
971 return;
972
973 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
974}
975
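/*
 * Decide whether to arm the timeslicing timer: only on engines with
 * semaphore support, and only when another request (already submitted
 * behind @rq, or waiting in the queue) has priority at least equal to
 * the effective priority of @rq.
 */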
976static bool
977need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
978{
979 int hint;
980
981 if (!intel_engine_has_semaphores(engine))
982 return false;
983
984 if (list_is_last(&rq->sched.link, &engine->active.requests))
985 return false;
986
987 hint = max(rq_prio(list_next_entry(rq, sched.link)),
988 engine->execlists.queue_priority_hint);
989
990 return hint >= effective_prio(rq);
991}
992
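/*
 * Priority of the request that would run after @rq on this engine, or
 * INT_MIN if @rq is the last request in the active list.
 */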
993static int
994switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
995{
996 if (list_is_last(&rq->sched.link, &engine->active.requests))
997 return INT_MIN;
998
999 return rq_prio(list_next_entry(rq, sched.link));
1000}
1001
1002static bool
1003enable_timeslice(const struct intel_engine_execlists *execlists)
1004{
1005 const struct i915_request *rq = *execlists->active;
1006
1007 if (i915_request_completed(rq))
1008 return false;
1009
1010 return execlists->switch_priority_hint >= effective_prio(rq);
1011}
1012
1013static void record_preemption(struct intel_engine_execlists *execlists)
1014{
1015 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1016}
1017
1018static void execlists_dequeue(struct intel_engine_cs *engine)
1019{
1020 struct intel_engine_execlists * const execlists = &engine->execlists;
1021 struct i915_request **port = execlists->pending;
1022 struct i915_request ** const last_port = port + execlists->port_mask;
1023 struct i915_request *last;
1024 struct rb_node *rb;
1025 bool submit = false;
1026
1027 /*
1028 * Hardware submission is through 2 ports. Conceptually each port
1029 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1030 * static for a context, and unique to each, so we only execute
1031 * requests belonging to a single context from each ring. RING_HEAD
1032 * is maintained by the CS in the context image, it marks the place
1033 * where it got up to last time, and through RING_TAIL we tell the CS
1034 * where we want to execute up to this time.
1035 *
1036 * In this list the requests are in order of execution. Consecutive
1037 * requests from the same context are adjacent in the ringbuffer. We
1038 * can combine these requests into a single RING_TAIL update:
1039 *
1040 * RING_HEAD...req1...req2
1041 * ^- RING_TAIL
1042 * since to execute req2 the CS must first execute req1.
1043 *
1044 * Our goal then is to point each port to the end of a consecutive
1045 * sequence of requests as being the most optimal (fewest wake ups
1046 * and context switches) submission.
1047 */
1048
1049 for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1050 struct virtual_engine *ve =
1051 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1052 struct i915_request *rq = READ_ONCE(ve->request);
1053
1054 if (!rq) { /* lazily cleanup after another engine handled rq */
1055 rb_erase_cached(rb, &execlists->virtual);
1056 RB_CLEAR_NODE(rb);
1057 rb = rb_first_cached(&execlists->virtual);
1058 continue;
1059 }
1060
1061 if (!virtual_matches(ve, rq, engine)) {
1062 rb = rb_next(rb);
1063 continue;
1064 }
1065
1066 break;
1067 }
1068
1069 /*
1070 * If the queue is higher priority than the last
1071 * request in the currently active context, submit afresh.
1072 * We will resubmit again afterwards in case we need to split
1073 * the active context to interject the preemption request,
1074 * i.e. we will retrigger preemption following the ack in case
1075 * of trouble.
1076 */
1077 last = last_active(execlists);
1078 if (last) {
1079 if (need_preempt(engine, last, rb)) {
1080 GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n",
1081 engine->name,
1082 last->fence.context,
1083 last->fence.seqno,
1084 last->sched.attr.priority,
1085 execlists->queue_priority_hint);
1086 record_preemption(execlists);
1087
1088 /*
1089 * Don't let the RING_HEAD advance past the breadcrumb
1090 * as we unwind (and until we resubmit) so that we do
1091 * not accidentally tell it to go backwards.
1092 */
1093 ring_set_paused(engine, 1);
1094
1095 /*
1096 * Note that we have not stopped the GPU at this point,
1097 * so we are unwinding the incomplete requests as they
1098 * remain inflight and so by the time we do complete
1099 * the preemption, some of the unwound requests may
1100 * complete!
1101 */
1102 __unwind_incomplete_requests(engine);
1103
1104 /*
1105 * If we need to return to the preempted context, we
1106 * need to skip the lite-restore and force it to
1107 * reload the RING_TAIL. Otherwise, the HW has a
1108 * tendency to ignore us rewinding the TAIL to the
1109 * end of an earlier request.
1110 */
1111 last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1112 last = NULL;
1113 } else if (need_timeslice(engine, last) &&
1114 !timer_pending(&engine->execlists.timer)) {
1115 GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n",
1116 engine->name,
1117 last->fence.context,
1118 last->fence.seqno,
1119 last->sched.attr.priority,
1120 execlists->queue_priority_hint);
1121
1122 ring_set_paused(engine, 1);
1123 defer_active(engine);
1124
1125 /*
1126 * Unlike for preemption, if we rewind and continue
1127 * executing the same context as previously active,
1128 * the order of execution will remain the same and
1129 * the tail will only advance. We do not need to
1130 * force a full context restore, as a lite-restore
1131 * is sufficient to resample the monotonic TAIL.
1132 *
1133 * If we switch to any other context, similarly we
1134 * will not rewind TAIL of current context, and
1135 * normal save/restore will preserve state and allow
1136 * us to later continue executing the same request.
1137 */
1138 last = NULL;
1139 } else {
1140 /*
1141 * Otherwise if we already have a request pending
1142 * for execution after the current one, we can
1143 * just wait until the next CS event before
1144 * queuing more. In either case we will force a
1145 * lite-restore preemption event, but if we wait
1146 * we hopefully coalesce several updates into a single
1147 * submission.
1148 */
1149 if (!list_is_last(&last->sched.link,
1150 &engine->active.requests))
1151 return;
1152
1153 /*
1154 * WaIdleLiteRestore:bdw,skl
1155 * Apply the wa NOOPs to prevent
1156 * ring:HEAD == rq:TAIL as we resubmit the
1157 * request. See gen8_emit_fini_breadcrumb() for
1158 * where we prepare the padding after the
1159 * end of the request.
1160 */
1161 last->tail = last->wa_tail;
1162 }
1163 }
1164
1165 while (rb) { /* XXX virtual is always taking precedence */
1166 struct virtual_engine *ve =
1167 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1168 struct i915_request *rq;
1169
1170 spin_lock(&ve->base.active.lock);
1171
1172 rq = ve->request;
1173 if (unlikely(!rq)) { /* lost the race to a sibling */
1174 spin_unlock(&ve->base.active.lock);
1175 rb_erase_cached(rb, &execlists->virtual);
1176 RB_CLEAR_NODE(rb);
1177 rb = rb_first_cached(&execlists->virtual);
1178 continue;
1179 }
1180
1181 GEM_BUG_ON(rq != ve->request);
1182 GEM_BUG_ON(rq->engine != &ve->base);
1183 GEM_BUG_ON(rq->hw_context != &ve->context);
1184
1185 if (rq_prio(rq) >= queue_prio(execlists)) {
1186 if (!virtual_matches(ve, rq, engine)) {
1187 spin_unlock(&ve->base.active.lock);
1188 rb = rb_next(rb);
1189 continue;
1190 }
1191
1192 if (last && !can_merge_rq(last, rq)) {
1193 spin_unlock(&ve->base.active.lock);
1194 return; /* leave this for another */
1195 }
1196
1197 GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n",
1198 engine->name,
1199 rq->fence.context,
1200 rq->fence.seqno,
1201 i915_request_completed(rq) ? "!" :
1202 i915_request_started(rq) ? "*" :
1203 "",
1204 yesno(engine != ve->siblings[0]));
1205
1206 ve->request = NULL;
1207 ve->base.execlists.queue_priority_hint = INT_MIN;
1208 rb_erase_cached(rb, &execlists->virtual);
1209 RB_CLEAR_NODE(rb);
1210
1211 GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1212 rq->engine = engine;
1213
1214 if (engine != ve->siblings[0]) {
1215 u32 *regs = ve->context.lrc_reg_state;
1216 unsigned int n;
1217
1218 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1219 virtual_update_register_offsets(regs, engine);
1220
1221 if (!list_empty(&ve->context.signals))
1222 virtual_xfer_breadcrumbs(ve, engine);
1223
1224 /*
1225 * Move the bound engine to the top of the list
1226 * for future execution. We then kick this
1227 * tasklet first before checking others, so that
1228 * we preferentially reuse this set of bound
1229 * registers.
1230 */
1231 for (n = 1; n < ve->num_siblings; n++) {
1232 if (ve->siblings[n] == engine) {
1233 swap(ve->siblings[n],
1234 ve->siblings[0]);
1235 break;
1236 }
1237 }
1238
1239 GEM_BUG_ON(ve->siblings[0] != engine);
1240 }
1241
1242 if (__i915_request_submit(rq)) {
1243 submit = true;
1244 last = rq;
1245 }
1246 i915_request_put(rq);
1247
1248 /*
1249 * Hmm, we have a bunch of virtual engine requests,
1250 * but the first one was already completed (thanks
1251 * preempt-to-busy!). Keep looking at the veng queue
1252 * until we have no more relevant requests (i.e.
1253 * the normal submit queue has higher priority).
1254 */
1255 if (!submit) {
1256 spin_unlock(&ve->base.active.lock);
1257 rb = rb_first_cached(&execlists->virtual);
1258 continue;
1259 }
1260 }
1261
1262 spin_unlock(&ve->base.active.lock);
1263 break;
1264 }
1265
1266 while ((rb = rb_first_cached(&execlists->queue))) {
1267 struct i915_priolist *p = to_priolist(rb);
1268 struct i915_request *rq, *rn;
1269 int i;
1270
1271 priolist_for_each_request_consume(rq, rn, p, i) {
1272 bool merge = true;
1273
1274 /*
1275 * Can we combine this request with the current port?
1276 * It has to be the same context/ringbuffer and not
1277 * have any exceptions (e.g. GVT saying never to
1278 * combine contexts).
1279 *
1280 * If we can combine the requests, we can execute both
1281 * by updating the RING_TAIL to point to the end of the
1282 * second request, and so we never need to tell the
1283 * hardware about the first.
1284 */
1285 if (last && !can_merge_rq(last, rq)) {
1286 /*
1287 * If we are on the second port and cannot
1288 * combine this request with the last, then we
1289 * are done.
1290 */
1291 if (port == last_port)
1292 goto done;
1293
1294 /*
1295 * We must not populate both ELSP[] with the
1296 * same LRCA, i.e. we must submit 2 different
1297 * contexts if we submit 2 ELSP.
1298 */
1299 if (last->hw_context == rq->hw_context)
1300 goto done;
1301
1302 /*
1303 * If GVT overrides us we only ever submit
1304 * port[0], leaving port[1] empty. Note that we
1305 * also have to be careful that we don't queue
1306 * the same context (even though a different
1307 * request) to the second port.
1308 */
1309 if (ctx_single_port_submission(last->hw_context) ||
1310 ctx_single_port_submission(rq->hw_context))
1311 goto done;
1312
1313 merge = false;
1314 }
1315
1316 if (__i915_request_submit(rq)) {
1317 if (!merge) {
1318 *port = execlists_schedule_in(last, port - execlists->pending);
1319 port++;
1320 last = NULL;
1321 }
1322
1323 GEM_BUG_ON(last &&
1324 !can_merge_ctx(last->hw_context,
1325 rq->hw_context));
1326
1327 submit = true;
1328 last = rq;
1329 }
1330 }
1331
1332 rb_erase_cached(&p->node, &execlists->queue);
1333 i915_priolist_free(p);
1334 }
1335
1336done:
1337 /*
1338 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1339 *
1340 * We choose the priority hint such that if we add a request of greater
1341 * priority than this, we kick the submission tasklet to decide on
1342 * the right order of submitting the requests to hardware. We must
1343 * also be prepared to reorder requests as they are in-flight on the
1344 * HW. We derive the priority hint then as the first "hole" in
1345 * the HW submission ports and if there are no available slots,
1346 * the priority of the lowest executing request, i.e. last.
1347 *
1348 * When we do receive a higher priority request ready to run from the
1349 * user, see queue_request(), the priority hint is bumped to that
1350 * request triggering preemption on the next dequeue (or subsequent
1351 * interrupt for secondary ports).
1352 */
1353 execlists->queue_priority_hint = queue_prio(execlists);
1354 GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n",
1355 engine->name, execlists->queue_priority_hint,
1356 yesno(submit));
1357
1358 if (submit) {
1359 *port = execlists_schedule_in(last, port - execlists->pending);
1360 memset(port + 1, 0, (last_port - port) * sizeof(*port));
1361 execlists->switch_priority_hint =
1362 switch_prio(engine, *execlists->pending);
1363 execlists_submit_ports(engine);
1364 } else {
1365 ring_set_paused(engine, 0);
1366 }
1367}
1368
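/*
 * Schedule out every request still held in the pending[] and
 * inflight[] ports and clear both arrays, leaving execlists->active
 * pointing at an empty inflight[].
 */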
1369static void
1370cancel_port_requests(struct intel_engine_execlists * const execlists)
1371{
1372 struct i915_request * const *port, *rq;
1373
1374 for (port = execlists->pending; (rq = *port); port++)
1375 execlists_schedule_out(rq);
1376 memset(execlists->pending, 0, sizeof(execlists->pending));
1377
1378 for (port = execlists->active; (rq = *port); port++)
1379 execlists_schedule_out(rq);
1380 execlists->active =
1381 memset(execlists->inflight, 0, sizeof(execlists->inflight));
1382}
1383
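/*
 * Flush the cachelines containing the first and last CSB entries so
 * that the next read pulls fresh values written by the HW (see the
 * Gen11 note at the end of process_csb()).
 */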
1384static inline void
1385invalidate_csb_entries(const u32 *first, const u32 *last)
1386{
1387 clflush((void *)first);
1388 clflush((void *)last);
1389}
1390
1391static inline bool
1392reset_in_progress(const struct intel_engine_execlists *execlists)
1393{
1394 return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1395}
1396
1397enum csb_step {
1398 CSB_NOP,
1399 CSB_PROMOTE,
1400 CSB_PREEMPT,
1401 CSB_COMPLETE,
1402};
1403
1404/*
1405 * Starting with Gen12, the status has a new format:
1406 *
1407 * bit 0: switched to new queue
1408 * bit 1: reserved
1409 * bit 2: semaphore wait mode (poll or signal), only valid when
1410 * switch detail is set to "wait on semaphore"
1411 * bits 3-5: engine class
1412 * bits 6-11: engine instance
1413 * bits 12-14: reserved
1414 * bits 15-25: sw context id of the lrc the GT switched to
1415 * bits 26-31: sw counter of the lrc the GT switched to
1416 * bits 32-35: context switch detail
1417 * - 0: ctx complete
1418 * - 1: wait on sync flip
1419 * - 2: wait on vblank
1420 * - 3: wait on scanline
1421 * - 4: wait on semaphore
1422 * - 5: context preempted (not on SEMAPHORE_WAIT or
1423 * WAIT_FOR_EVENT)
1424 * bit 36: reserved
1425 * bits 37-43: wait detail (for switch detail 1 to 4)
1426 * bits 44-46: reserved
1427 * bits 47-57: sw context id of the lrc the GT switched away from
1428 * bits 58-63: sw counter of the lrc the GT switched away from
1429 */
1430static inline enum csb_step
1431gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1432{
1433 u32 lower_dw = csb[0];
1434 u32 upper_dw = csb[1];
1435 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
1436 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
1437 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
1438
1439 if (!ctx_away_valid && ctx_to_valid)
1440 return CSB_PROMOTE;
1441
1442 /*
1443 * The context switch detail is not guaranteed to be 5 when a preemption
1444 * occurs, so we can't just check for that. The check below works for
1445 * all the cases we care about, including preemptions of WAIT
1446 * instructions and lite-restore. Preempt-to-idle via the CTRL register
1447 * would require some extra handling, but we don't support that.
1448 */
1449 if (new_queue && ctx_away_valid)
1450 return CSB_PREEMPT;
1451
1452 /*
1453 * switch detail = 5 is covered by the case above and we do not expect a
1454 * context switch on an unsuccessful wait instruction since we always
1455 * use polling mode.
1456 */
1457 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
1458
1459 if (*execlists->active) {
1460 GEM_BUG_ON(!ctx_away_valid);
1461 return CSB_COMPLETE;
1462 }
1463
1464 return CSB_NOP;
1465}
1466
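/*
 * Prior to Gen12, CSB events are decoded from the GEN8_CTX_STATUS_*
 * bits in the first dword: an idle->active event promotes the
 * pending[] contexts onto the HW, a preempted event switches out the
 * inflight contexts, and any other event received while a context is
 * still active is treated as completion of the oldest inflight context.
 */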
1467static inline enum csb_step
1468gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
1469{
1470 unsigned int status = *csb;
1471
1472 if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
1473 return CSB_PROMOTE;
1474
1475 if (status & GEN8_CTX_STATUS_PREEMPTED)
1476 return CSB_PREEMPT;
1477
1478 if (*execlists->active)
1479 return CSB_COMPLETE;
1480
1481 return CSB_NOP;
1482}
1483
1484static void process_csb(struct intel_engine_cs *engine)
1485{
1486 struct intel_engine_execlists * const execlists = &engine->execlists;
1487 const u32 * const buf = execlists->csb_status;
1488 const u8 num_entries = execlists->csb_size;
1489 u8 head, tail;
1490
1491 GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915));
1492
1493 /*
1494 * Note that csb_write, csb_status may be either in HWSP or mmio.
1495 * When reading from the csb_write mmio register, we have to be
1496 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
1497 * the low 4 bits. As it happens we know the next 4 bits are always
1498 * zero and so we can simply mask off the low u8 of the register
1499 * and treat it identically to reading from the HWSP (without having
1500 * to use explicit shifting and masking, and probably bifurcating
1501 * the code to handle the legacy mmio read).
1502 */
1503 head = execlists->csb_head;
1504 tail = READ_ONCE(*execlists->csb_write);
1505 GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
1506 if (unlikely(head == tail))
1507 return;
1508
1509 /*
1510 * Hopefully paired with a wmb() in HW!
1511 *
1512 * We must complete the read of the write pointer before any reads
1513 * from the CSB, so that we do not see stale values. Without an rmb
1514 * (lfence) the HW may speculatively perform the CSB[] reads *before*
1515 * we perform the READ_ONCE(*csb_write).
1516 */
1517 rmb();
1518
1519 do {
1520 enum csb_step csb_step;
1521
1522 if (++head == num_entries)
1523 head = 0;
1524
1525 /*
1526 * We are flying near dragons again.
1527 *
1528 * We hold a reference to the request in execlist_port[]
1529 * but no more than that. We are operating in softirq
1530 * context and so cannot hold any mutex or sleep. That
1531 * prevents us stopping the requests we are processing
1532 * in port[] from being retired simultaneously (the
1533 * breadcrumb will be complete before we see the
1534 * context-switch). As we only hold the reference to the
1535 * request, any pointer chasing underneath the request
1536 * is subject to a potential use-after-free. Thus we
1537 * store all of the bookkeeping within port[] as
1538 * required, and avoid using unguarded pointers beneath
1539 * request itself. The same applies to the atomic
1540 * status notifier.
1541 */
1542
1543 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
1544 engine->name, head,
1545 buf[2 * head + 0], buf[2 * head + 1]);
1546
1547 if (INTEL_GEN(engine->i915) >= 12)
1548 csb_step = gen12_csb_parse(execlists, buf + 2 * head);
1549 else
1550 csb_step = gen8_csb_parse(execlists, buf + 2 * head);
1551
1552 switch (csb_step) {
1553 case CSB_PREEMPT: /* cancel old inflight, prepare for switch */
1554 trace_ports(execlists, "preempted", execlists->active);
1555
1556 while (*execlists->active)
1557 execlists_schedule_out(*execlists->active++);
1558
1559 /* fallthrough */
1560 case CSB_PROMOTE: /* switch pending to inflight */
1561 GEM_BUG_ON(*execlists->active);
1562 GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
1563 execlists->active =
1564 memcpy(execlists->inflight,
1565 execlists->pending,
1566 execlists_num_ports(execlists) *
1567 sizeof(*execlists->pending));
1568
1569 if (enable_timeslice(execlists))
1570 mod_timer(&execlists->timer, jiffies + 1);
1571
1572 if (!inject_preempt_hang(execlists))
1573 ring_set_paused(engine, 0);
1574
1575 WRITE_ONCE(execlists->pending[0], NULL);
1576 break;
1577
1578 case CSB_COMPLETE: /* port0 completed, advanced to port1 */
1579 trace_ports(execlists, "completed", execlists->active);
1580
1581 /*
1582 * We rely on the hardware being strongly
1583 * ordered, that the breadcrumb write is
1584 * coherent (visible from the CPU) before the
1585 * user interrupt and CSB is processed.
1586 */
1587 GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
1588 !reset_in_progress(execlists));
1589 execlists_schedule_out(*execlists->active++);
1590
1591 GEM_BUG_ON(execlists->active - execlists->inflight >
1592 execlists_num_ports(execlists));
1593 break;
1594
1595 case CSB_NOP:
1596 break;
1597 }
1598 } while (head != tail);
1599
1600 execlists->csb_head = head;
1601
1602 /*
1603 * Gen11 has proven to fail wrt global observation point between
1604 * entry and tail update, failing on the ordering and thus
1605 * we see an old entry in the context status buffer.
1606 *
1607 * Forcibly evict the stale entries ahead of the next gpu csb update,
1608 * to increase the odds that we get fresh entries even with
1609 * non-working hardware. The cost of doing so comes out mostly in
1610 * the wash as hardware, working or not, will need to do the
1611 * invalidation before.
1612 */
1613 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
1614}
1615
1616static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
1617{
1618 lockdep_assert_held(&engine->active.lock);
1619 if (!engine->execlists.pending[0]) {
1620 rcu_read_lock(); /* protect peeking at execlists->active */
1621 execlists_dequeue(engine);
1622 rcu_read_unlock();
1623 }
1624}
1625
1626/*
1627 * Check the unread Context Status Buffers and manage the submission of new
1628 * contexts to the ELSP accordingly.
1629 */
1630static void execlists_submission_tasklet(unsigned long data)
1631{
1632 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
1633 unsigned long flags;
1634
1635 process_csb(engine);
1636 if (!READ_ONCE(engine->execlists.pending[0])) {
1637 spin_lock_irqsave(&engine->active.lock, flags);
1638 __execlists_submission_tasklet(engine);
1639 spin_unlock_irqrestore(&engine->active.lock, flags);
1640 }
1641}
1642
1643static void execlists_submission_timer(struct timer_list *timer)
1644{
1645 struct intel_engine_cs *engine =
1646 from_timer(engine, timer, execlists.timer);
1647
1648 /* Kick the tasklet for some interrupt coalescing and reset handling */
1649 tasklet_hi_schedule(&engine->execlists.tasklet);
1650}
1651
1652static void queue_request(struct intel_engine_cs *engine,
1653 struct i915_sched_node *node,
1654 int prio)
1655{
1656 GEM_BUG_ON(!list_empty(&node->link));
1657 list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio));
1658}
1659
1660static void __submit_queue_imm(struct intel_engine_cs *engine)
1661{
1662 struct intel_engine_execlists * const execlists = &engine->execlists;
1663
1664 if (reset_in_progress(execlists))
1665 return; /* defer until we restart the engine following reset */
1666
1667 if (execlists->tasklet.func == execlists_submission_tasklet)
1668 __execlists_submission_tasklet(engine);
1669 else
1670 tasklet_hi_schedule(&execlists->tasklet);
1671}
1672
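/*
 * Only kick the submission backend if this request raises the queue priority
 * hint; anything at or below the hint is already known to the tasklet.
 */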
1673static void submit_queue(struct intel_engine_cs *engine,
1674 const struct i915_request *rq)
1675{
1676 struct intel_engine_execlists *execlists = &engine->execlists;
1677
1678 if (rq_prio(rq) <= execlists->queue_priority_hint)
1679 return;
1680
1681 execlists->queue_priority_hint = rq_prio(rq);
1682 __submit_queue_imm(engine);
1683}
1684
1685static void execlists_submit_request(struct i915_request *request)
1686{
1687 struct intel_engine_cs *engine = request->engine;
1688 unsigned long flags;
1689
1690 /* Will be called from irq-context when using foreign fences. */
1691 spin_lock_irqsave(&engine->active.lock, flags);
1692
1693 queue_request(engine, &request->sched, rq_prio(request));
1694
1695 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1696 GEM_BUG_ON(list_empty(&request->sched.link));
1697
1698 submit_queue(engine, request);
1699
1700 spin_unlock_irqrestore(&engine->active.lock, flags);
1701}
1702
1703static void __execlists_context_fini(struct intel_context *ce)
1704{
1705 intel_ring_put(ce->ring);
1706 i915_vma_put(ce->state);
1707}
1708
1709static void execlists_context_destroy(struct kref *kref)
1710{
1711 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1712
1713 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1714 GEM_BUG_ON(intel_context_is_pinned(ce));
1715
1716 if (ce->state)
1717 __execlists_context_fini(ce);
1718
1719 intel_context_fini(ce);
1720 intel_context_free(ce);
1721}
1722
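/*
 * With CONFIG_DRM_I915_DEBUG_GEM we poison a guard page placed after the
 * context image so that any overrun by the HW can be detected on unpin.
 */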
1723static void
1724set_redzone(void *vaddr, const struct intel_engine_cs *engine)
1725{
1726 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1727 return;
1728
1729 vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
1730 vaddr += engine->context_size;
1731
1732 memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
1733}
1734
1735static void
1736check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
1737{
1738 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1739 return;
1740
1741 vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
1742 vaddr += engine->context_size;
1743
1744 if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
1745 dev_err_once(engine->i915->drm.dev,
1746 "%s context redzone overwritten!\n",
1747 engine->name);
1748}
1749
1750static void execlists_context_unpin(struct intel_context *ce)
1751{
1752 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
1753 ce->engine);
1754
1755 i915_gem_context_unpin_hw_id(ce->gem_context);
1756 i915_gem_object_unpin_map(ce->state->obj);
1757 intel_ring_reset(ce->ring, ce->ring->tail);
1758}
1759
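/*
 * Refresh the RING_START/HEAD/TAIL (and RPCS for the render class) values
 * stored in the context image so that the next context restore picks up our
 * current bookkeeping.
 */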
1760static void
1761__execlists_update_reg_state(struct intel_context *ce,
1762 struct intel_engine_cs *engine)
1763{
1764 struct intel_ring *ring = ce->ring;
1765 u32 *regs = ce->lrc_reg_state;
1766
1767 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
1768 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1769
1770 regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma);
1771 regs[CTX_RING_HEAD + 1] = ring->head;
1772 regs[CTX_RING_TAIL + 1] = ring->tail;
1773
1774 /* RPCS */
1775 if (engine->class == RENDER_CLASS) {
1776 regs[CTX_R_PWR_CLK_STATE + 1] =
1777 intel_sseu_make_rpcs(engine->i915, &ce->sseu);
1778
1779 i915_oa_init_reg_state(engine, ce, regs);
1780 }
1781}
1782
1783static int
1784__execlists_context_pin(struct intel_context *ce,
1785 struct intel_engine_cs *engine)
1786{
1787 void *vaddr;
1788 int ret;
1789
1790 GEM_BUG_ON(!ce->state);
1791
1792 ret = intel_context_active_acquire(ce);
1793 if (ret)
1794 goto err;
1795 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1796
1797 vaddr = i915_gem_object_pin_map(ce->state->obj,
1798 i915_coherent_map_type(engine->i915) |
1799 I915_MAP_OVERRIDE);
1800 if (IS_ERR(vaddr)) {
1801 ret = PTR_ERR(vaddr);
1802 goto unpin_active;
1803 }
1804
1805 ret = i915_gem_context_pin_hw_id(ce->gem_context);
1806 if (ret)
1807 goto unpin_map;
1808
1809 ce->lrc_desc = lrc_descriptor(ce, engine);
1810 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
1811 __execlists_update_reg_state(ce, engine);
1812
1813 return 0;
1814
1815unpin_map:
1816 i915_gem_object_unpin_map(ce->state->obj);
1817unpin_active:
1818 intel_context_active_release(ce);
1819err:
1820 return ret;
1821}
1822
1823static int execlists_context_pin(struct intel_context *ce)
1824{
1825 return __execlists_context_pin(ce, ce->engine);
1826}
1827
1828static int execlists_context_alloc(struct intel_context *ce)
1829{
1830 return __execlists_context_alloc(ce, ce->engine);
1831}
1832
1833static void execlists_context_reset(struct intel_context *ce)
1834{
1835 /*
1836 * Because we emit WA_TAIL_DWORDS there may be a disparity
1837 * between our bookkeeping in ce->ring->head and ce->ring->tail and
1838 * that stored in context. As we only write new commands from
1839 * ce->ring->tail onwards, everything before that is junk. If the GPU
	 * starts reading its RING_HEAD from the context, it may try to
	 * execute that junk and die.
	 *
	 * The contexts that are still pinned on resume belong to the
	 * kernel, and are local to each engine. All other contexts will
	 * have their head/tail sanitized upon pinning before use, so they
	 * will never see garbage.
1847 *
1848 * So to avoid that we reset the context images upon resume. For
1849 * simplicity, we just zero everything out.
1850 */
1851 intel_ring_reset(ce->ring, 0);
1852 __execlists_update_reg_state(ce, ce->engine);
1853}
1854
1855static const struct intel_context_ops execlists_context_ops = {
1856 .alloc = execlists_context_alloc,
1857
1858 .pin = execlists_context_pin,
1859 .unpin = execlists_context_unpin,
1860
1861 .enter = intel_context_enter_engine,
1862 .exit = intel_context_exit_engine,
1863
1864 .reset = execlists_context_reset,
1865 .destroy = execlists_context_destroy,
1866};
1867
1868static int gen8_emit_init_breadcrumb(struct i915_request *rq)
1869{
1870 u32 *cs;
1871
1872 GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb);
1873
1874 cs = intel_ring_begin(rq, 6);
1875 if (IS_ERR(cs))
1876 return PTR_ERR(cs);
1877
1878 /*
1879 * Check if we have been preempted before we even get started.
1880 *
1881 * After this point i915_request_started() reports true, even if
1882 * we get preempted and so are no longer running.
1883 */
1884 *cs++ = MI_ARB_CHECK;
1885 *cs++ = MI_NOOP;
1886
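	/* Mark the request as started by writing seqno-1 into its HWSP slot */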
1887 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1888 *cs++ = rq->timeline->hwsp_offset;
1889 *cs++ = 0;
1890 *cs++ = rq->fence.seqno - 1;
1891
1892 intel_ring_advance(rq, cs);
1893
1894 /* Record the updated position of the request's payload */
1895 rq->infix = intel_ring_offset(rq, cs);
1896
1897 return 0;
1898}
1899
1900static int emit_pdps(struct i915_request *rq)
1901{
1902 const struct intel_engine_cs * const engine = rq->engine;
1903 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->hw_context->vm);
1904 int err, i;
1905 u32 *cs;
1906
1907 GEM_BUG_ON(intel_vgpu_active(rq->i915));
1908
1909 /*
1910 * Beware ye of the dragons, this sequence is magic!
1911 *
1912 * Small changes to this sequence can cause anything from
1913 * GPU hangs to forcewake errors and machine lockups!
1914 */
1915
1916 /* Flush any residual operations from the context load */
1917 err = engine->emit_flush(rq, EMIT_FLUSH);
1918 if (err)
1919 return err;
1920
1921 /* Magic required to prevent forcewake errors! */
1922 err = engine->emit_flush(rq, EMIT_INVALIDATE);
1923 if (err)
1924 return err;
1925
1926 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
1927 if (IS_ERR(cs))
1928 return PTR_ERR(cs);
1929
1930 /* Ensure the LRI have landed before we invalidate & continue */
1931 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
1932 for (i = GEN8_3LVL_PDPES; i--; ) {
1933 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
1934 u32 base = engine->mmio_base;
1935
1936 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
1937 *cs++ = upper_32_bits(pd_daddr);
1938 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
1939 *cs++ = lower_32_bits(pd_daddr);
1940 }
1941 *cs++ = MI_NOOP;
1942
1943 intel_ring_advance(rq, cs);
1944
1945 /* Be doubly sure the LRI have landed before proceeding */
1946 err = engine->emit_flush(rq, EMIT_FLUSH);
1947 if (err)
1948 return err;
1949
1950 /* Re-invalidate the TLB for luck */
1951 return engine->emit_flush(rq, EMIT_INVALIDATE);
1952}
1953
1954static int execlists_request_alloc(struct i915_request *request)
1955{
1956 int ret;
1957
1958 GEM_BUG_ON(!intel_context_is_pinned(request->hw_context));
1959
1960 /*
1961 * Flush enough space to reduce the likelihood of waiting after
1962 * we start building the request - in which case we will just
1963 * have to repeat work.
1964 */
1965 request->reserved_space += EXECLISTS_REQUEST_SIZE;
1966
1967 /*
1968 * Note that after this point, we have committed to using
1969 * this request as it is being used to both track the
1970 * state of engine initialisation and liveness of the
1971 * golden renderstate above. Think twice before you try
1972 * to cancel/unwind this request now.
1973 */
1974
1975 /* Unconditionally invalidate GPU caches and TLBs. */
1976 if (i915_vm_is_4lvl(request->hw_context->vm))
1977 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
1978 else
1979 ret = emit_pdps(request);
1980 if (ret)
1981 return ret;
1982
1983 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
1984 return 0;
1985}
1986
1987/*
1988 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1989 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1990 * but there is a slight complication as this is applied in WA batch where the
1991 * values are only initialized once so we cannot take register value at the
1992 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then restore it with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We could of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest, but that makes the WA complicated.
1999 *
2000 * This WA is also required for Gen9 so extracting as a function avoids
2001 * code duplication.
2002 */
2003static u32 *
2004gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
2005{
2006 /* NB no one else is allowed to scribble over scratch + 256! */
2007 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2008 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2009 *batch++ = intel_gt_scratch_offset(engine->gt,
2010 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2011 *batch++ = 0;
2012
2013 *batch++ = MI_LOAD_REGISTER_IMM(1);
2014 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2015 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
2016
2017 batch = gen8_emit_pipe_control(batch,
2018 PIPE_CONTROL_CS_STALL |
2019 PIPE_CONTROL_DC_FLUSH_ENABLE,
2020 0);
2021
2022 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
2023 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
2024 *batch++ = intel_gt_scratch_offset(engine->gt,
2025 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
2026 *batch++ = 0;
2027
2028 return batch;
2029}
2030
2031static u32 slm_offset(struct intel_engine_cs *engine)
2032{
2033 return intel_gt_scratch_offset(engine->gt,
2034 INTEL_GT_SCRATCH_FIELD_CLEAR_SLM_WA);
2035}
2036
2037/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on some criteria. At the moment this batch always starts at the beginning of
 * the page and we don't have multiple wa_ctx batch buffers.
 *
 * The number of WA applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the per_ctx batch and both of them
 * together make a complete batch buffer.
2051 */
2052static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2053{
2054 /* WaDisableCtxRestoreArbitration:bdw,chv */
2055 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2056
2057 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
2058 if (IS_BROADWELL(engine->i915))
2059 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2060
2061 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
2062 /* Actual scratch location is at 128 bytes offset */
2063 batch = gen8_emit_pipe_control(batch,
2064 PIPE_CONTROL_FLUSH_L3 |
2065 PIPE_CONTROL_GLOBAL_GTT_IVB |
2066 PIPE_CONTROL_CS_STALL |
2067 PIPE_CONTROL_QW_WRITE,
2068 slm_offset(engine));
2069
2070 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2071
2072 /* Pad to end of cacheline */
2073 while ((unsigned long)batch % CACHELINE_BYTES)
2074 *batch++ = MI_NOOP;
2075
2076 /*
2077 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
2078 * execution depends on the length specified in terms of cache lines
2079 * in the register CTX_RCS_INDIRECT_CTX
2080 */
2081
2082 return batch;
2083}
2084
2085struct lri {
2086 i915_reg_t reg;
2087 u32 value;
2088};
2089
2090static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
2091{
2092 GEM_BUG_ON(!count || count > 63);
2093
2094 *batch++ = MI_LOAD_REGISTER_IMM(count);
2095 do {
2096 *batch++ = i915_mmio_reg_offset(lri->reg);
2097 *batch++ = lri->value;
2098 } while (lri++, --count);
2099 *batch++ = MI_NOOP;
2100
2101 return batch;
2102}
2103
2104static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2105{
2106 static const struct lri lri[] = {
2107 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
2108 {
2109 COMMON_SLICE_CHICKEN2,
2110 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
2111 0),
2112 },
2113
2114 /* BSpec: 11391 */
2115 {
2116 FF_SLICE_CHICKEN,
2117 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
2118 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
2119 },
2120
2121 /* BSpec: 11299 */
2122 {
2123 _3D_CHICKEN3,
2124 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
2125 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
2126 }
2127 };
2128
2129 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2130
2131 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
2132 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
2133
2134 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
2135
2136 /* WaMediaPoolStateCmdInWABB:bxt,glk */
2137 if (HAS_POOLED_EU(engine->i915)) {
2138 /*
2139 * EU pool configuration is setup along with golden context
2140 * during context initialization. This value depends on
2141 * device type (2x6 or 3x6) and needs to be updated based
2142 * on which subslice is disabled especially for 2x6
2143 * devices, however it is safe to load default
2144 * configuration of 3x6 device instead of masking off
2145 * corresponding bits because HW ignores bits of a disabled
2146 * subslice and drops down to appropriate config. Please
2147 * see render_state_setup() in i915_gem_render_state.c for
2148 * possible configurations, to avoid duplication they are
2149 * not shown here again.
2150 */
2151 *batch++ = GEN9_MEDIA_POOL_STATE;
2152 *batch++ = GEN9_MEDIA_POOL_ENABLE;
2153 *batch++ = 0x00777000;
2154 *batch++ = 0;
2155 *batch++ = 0;
2156 *batch++ = 0;
2157 }
2158
2159 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2160
2161 /* Pad to end of cacheline */
2162 while ((unsigned long)batch % CACHELINE_BYTES)
2163 *batch++ = MI_NOOP;
2164
2165 return batch;
2166}
2167
2168static u32 *
2169gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
2170{
2171 int i;
2172
2173 /*
2174 * WaPipeControlBefore3DStateSamplePattern: cnl
2175 *
2176 * Ensure the engine is idle prior to programming a
2177 * 3DSTATE_SAMPLE_PATTERN during a context restore.
2178 */
2179 batch = gen8_emit_pipe_control(batch,
2180 PIPE_CONTROL_CS_STALL,
2181 0);
2182 /*
2183 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
2184 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
2185 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
2186 * confusing. Since gen8_emit_pipe_control() already advances the
2187 * batch by 6 dwords, we advance the other 10 here, completing a
2188 * cacheline. It's not clear if the workaround requires this padding
2189 * before other commands, or if it's just the regular padding we would
2190 * already have for the workaround bb, so leave it here for now.
2191 */
2192 for (i = 0; i < 10; i++)
2193 *batch++ = MI_NOOP;
2194
2195 /* Pad to end of cacheline */
2196 while ((unsigned long)batch % CACHELINE_BYTES)
2197 *batch++ = MI_NOOP;
2198
2199 return batch;
2200}
2201
2202#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
2203
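/*
 * Allocate a single shmem page and pin it high in the GGTT to hold the
 * per-engine indirect and per-context workaround batch buffers.
 */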
2204static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
2205{
2206 struct drm_i915_gem_object *obj;
2207 struct i915_vma *vma;
2208 int err;
2209
2210 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
2211 if (IS_ERR(obj))
2212 return PTR_ERR(obj);
2213
2214 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
2215 if (IS_ERR(vma)) {
2216 err = PTR_ERR(vma);
2217 goto err;
2218 }
2219
2220 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
2221 if (err)
2222 goto err;
2223
2224 engine->wa_ctx.vma = vma;
2225 return 0;
2226
2227err:
2228 i915_gem_object_put(obj);
2229 return err;
2230}
2231
2232static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
2233{
2234 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
2235}
2236
2237typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
2238
2239static int intel_init_workaround_bb(struct intel_engine_cs *engine)
2240{
2241 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2242 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
2243 &wa_ctx->per_ctx };
2244 wa_bb_func_t wa_bb_fn[2];
2245 struct page *page;
2246 void *batch, *batch_ptr;
2247 unsigned int i;
2248 int ret;
2249
2250 if (engine->class != RENDER_CLASS)
2251 return 0;
2252
2253 switch (INTEL_GEN(engine->i915)) {
2254 case 12:
2255 case 11:
2256 return 0;
2257 case 10:
2258 wa_bb_fn[0] = gen10_init_indirectctx_bb;
2259 wa_bb_fn[1] = NULL;
2260 break;
2261 case 9:
2262 wa_bb_fn[0] = gen9_init_indirectctx_bb;
2263 wa_bb_fn[1] = NULL;
2264 break;
2265 case 8:
2266 wa_bb_fn[0] = gen8_init_indirectctx_bb;
2267 wa_bb_fn[1] = NULL;
2268 break;
2269 default:
2270 MISSING_CASE(INTEL_GEN(engine->i915));
2271 return 0;
2272 }
2273
2274 ret = lrc_setup_wa_ctx(engine);
2275 if (ret) {
2276 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
2277 return ret;
2278 }
2279
2280 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
2281 batch = batch_ptr = kmap_atomic(page);
2282
2283 /*
2284 * Emit the two workaround batch buffers, recording the offset from the
2285 * start of the workaround batch buffer object for each and their
2286 * respective sizes.
2287 */
2288 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
2289 wa_bb[i]->offset = batch_ptr - batch;
2290 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
2291 CACHELINE_BYTES))) {
2292 ret = -EINVAL;
2293 break;
2294 }
2295 if (wa_bb_fn[i])
2296 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
2297 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
2298 }
2299
2300 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
2301
2302 kunmap_atomic(batch);
2303 if (ret)
2304 lrc_destroy_wa_ctx(engine);
2305
2306 return ret;
2307}
2308
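/*
 * Program RING_MODE to select execlist submission and point RING_HWS_PGA at
 * our status page; the engine must already hold forcewake at this point.
 */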
2309static void enable_execlists(struct intel_engine_cs *engine)
2310{
2311 u32 mode;
2312
2313 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
2314
2315 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
2316
2317 if (INTEL_GEN(engine->i915) >= 11)
2318 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2319 else
2320 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2321 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2322
2323 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2324
2325 ENGINE_WRITE_FW(engine,
2326 RING_HWS_PGA,
2327 i915_ggtt_offset(engine->status_page.vma));
2328 ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2329}
2330
2331static bool unexpected_starting_state(struct intel_engine_cs *engine)
2332{
2333 bool unexpected = false;
2334
2335 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
2336 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
2337 unexpected = true;
2338 }
2339
2340 return unexpected;
2341}
2342
2343static int execlists_resume(struct intel_engine_cs *engine)
2344{
2345 intel_engine_apply_workarounds(engine);
2346 intel_engine_apply_whitelist(engine);
2347
2348 intel_mocs_init_engine(engine);
2349
2350 intel_engine_reset_breadcrumbs(engine);
2351
2352 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
2353 struct drm_printer p = drm_debug_printer(__func__);
2354
2355 intel_engine_dump(engine, &p, NULL);
2356 }
2357
2358 enable_execlists(engine);
2359
2360 return 0;
2361}
2362
2363static void execlists_reset_prepare(struct intel_engine_cs *engine)
2364{
2365 struct intel_engine_execlists * const execlists = &engine->execlists;
2366 unsigned long flags;
2367
2368 GEM_TRACE("%s: depth<-%d\n", engine->name,
2369 atomic_read(&execlists->tasklet.count));
2370
2371 /*
2372 * Prevent request submission to the hardware until we have
2373 * completed the reset in i915_gem_reset_finish(). If a request
2374 * is completed by one engine, it may then queue a request
2375 * to a second via its execlists->tasklet *just* as we are
2376 * calling engine->resume() and also writing the ELSP.
2377 * Turning off the execlists->tasklet until the reset is over
2378 * prevents the race.
2379 */
2380 __tasklet_disable_sync_once(&execlists->tasklet);
2381 GEM_BUG_ON(!reset_in_progress(execlists));
2382
2383 /* And flush any current direct submission. */
2384 spin_lock_irqsave(&engine->active.lock, flags);
2385 spin_unlock_irqrestore(&engine->active.lock, flags);
2386
2387 /*
	 * We stop engines, otherwise we might get failed reset and a
	 * dead gpu (on elk). Also a modern gpu as new as kbl can suffer
	 * from a system hang if a batchbuffer is progressing when
	 * the reset is issued, regardless of the READY_TO_RESET ack.
	 * Thus assume it is best to stop engines on all gens
	 * where we have a gpu reset.
2394 *
2395 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2396 *
2397 * FIXME: Wa for more modern gens needs to be validated
2398 */
2399 intel_engine_stop_cs(engine);
2400}
2401
2402static void reset_csb_pointers(struct intel_engine_cs *engine)
2403{
2404 struct intel_engine_execlists * const execlists = &engine->execlists;
2405 const unsigned int reset_value = execlists->csb_size - 1;
2406
2407 ring_set_paused(engine, 0);
2408
2409 /*
2410 * After a reset, the HW starts writing into CSB entry [0]. We
2411 * therefore have to set our HEAD pointer back one entry so that
2412 * the *first* entry we check is entry 0. To complicate this further,
2413 * as we don't wait for the first interrupt after reset, we have to
2414 * fake the HW write to point back to the last entry so that our
2415 * inline comparison of our cached head position against the last HW
2416 * write works even before the first interrupt.
2417 */
2418 execlists->csb_head = reset_value;
2419 WRITE_ONCE(*execlists->csb_write, reset_value);
2420 wmb(); /* Make sure this is visible to HW (paranoia?) */
2421
2422 invalidate_csb_entries(&execlists->csb_status[0],
2423 &execlists->csb_status[reset_value]);
2424}
2425
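/*
 * Walk back along the timeline from the given request to find the oldest
 * incomplete request belonging to the same context, i.e. the point at which
 * the hardware actually stopped executing.
 */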
2426static struct i915_request *active_request(struct i915_request *rq)
2427{
2428 const struct intel_context * const ce = rq->hw_context;
2429 struct i915_request *active = NULL;
2430 struct list_head *list;
2431
2432 if (!i915_request_is_active(rq)) /* unwound, but incomplete! */
2433 return rq;
2434
2435 list = &rq->timeline->requests;
2436 list_for_each_entry_from_reverse(rq, list, link) {
2437 if (i915_request_completed(rq))
2438 break;
2439
2440 if (rq->hw_context != ce)
2441 break;
2442
2443 active = rq;
2444 }
2445
2446 return active;
2447}
2448
2449static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
2450{
2451 struct intel_engine_execlists * const execlists = &engine->execlists;
2452 struct intel_context *ce;
2453 struct i915_request *rq;
2454 u32 *regs;
2455
2456 process_csb(engine); /* drain preemption events */
2457
2458 /* Following the reset, we need to reload the CSB read/write pointers */
2459 reset_csb_pointers(engine);
2460
2461 /*
2462 * Save the currently executing context, even if we completed
2463 * its request, it was still running at the time of the
2464 * reset and will have been clobbered.
2465 */
2466 rq = execlists_active(execlists);
2467 if (!rq)
2468 goto unwind;
2469
2470 ce = rq->hw_context;
2471 GEM_BUG_ON(i915_active_is_idle(&ce->active));
2472 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2473 rq = active_request(rq);
2474 if (!rq) {
2475 ce->ring->head = ce->ring->tail;
2476 goto out_replay;
2477 }
2478
2479 ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
2480
2481 /*
2482 * If this request hasn't started yet, e.g. it is waiting on a
2483 * semaphore, we need to avoid skipping the request or else we
2484 * break the signaling chain. However, if the context is corrupt
2485 * the request will not restart and we will be stuck with a wedged
2486 * device. It is quite often the case that if we issue a reset
2487 * while the GPU is loading the context image, that the context
2488 * image becomes corrupt.
2489 *
2490 * Otherwise, if we have not started yet, the request should replay
2491 * perfectly and we do not need to flag the result as being erroneous.
2492 */
2493 if (!i915_request_started(rq))
2494 goto out_replay;
2495
2496 /*
2497 * If the request was innocent, we leave the request in the ELSP
2498 * and will try to replay it on restarting. The context image may
2499 * have been corrupted by the reset, in which case we may have
2500 * to service a new GPU hang, but more likely we can continue on
2501 * without impact.
2502 *
2503 * If the request was guilty, we presume the context is corrupt
2504 * and have to at least restore the RING register in the context
2505 * image back to the expected values to skip over the guilty request.
2506 */
2507 __i915_request_reset(rq, stalled);
2508 if (!stalled)
2509 goto out_replay;
2510
2511 /*
2512 * We want a simple context + ring to execute the breadcrumb update.
2513 * We cannot rely on the context being intact across the GPU hang,
2514 * so clear it and rebuild just what we need for the breadcrumb.
2515 * All pending requests for this context will be zapped, and any
2516 * future request will be after userspace has had the opportunity
2517 * to recreate its own state.
2518 */
2519 regs = ce->lrc_reg_state;
2520 if (engine->pinned_default_state) {
2521 memcpy(regs, /* skip restoring the vanilla PPHWSP */
2522 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
2523 engine->context_size - PAGE_SIZE);
2524 }
2525 execlists_init_reg_state(regs, ce, engine, ce->ring);
2526
2527out_replay:
	GEM_TRACE("%s replay {head:%04x, tail:%04x}\n",
2529 engine->name, ce->ring->head, ce->ring->tail);
2530 intel_ring_update_space(ce->ring);
2531 __execlists_update_reg_state(ce, engine);
2532
2533unwind:
2534 /* Push back any incomplete requests for replay after the reset. */
2535 cancel_port_requests(execlists);
2536 __unwind_incomplete_requests(engine);
2537}
2538
2539static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
2540{
2541 unsigned long flags;
2542
2543 GEM_TRACE("%s\n", engine->name);
2544
2545 spin_lock_irqsave(&engine->active.lock, flags);
2546
2547 __execlists_reset(engine, stalled);
2548
2549 spin_unlock_irqrestore(&engine->active.lock, flags);
2550}
2551
2552static void nop_submission_tasklet(unsigned long data)
2553{
2554 /* The driver is wedged; don't process any more events. */
2555}
2556
2557static void execlists_cancel_requests(struct intel_engine_cs *engine)
2558{
2559 struct intel_engine_execlists * const execlists = &engine->execlists;
2560 struct i915_request *rq, *rn;
2561 struct rb_node *rb;
2562 unsigned long flags;
2563
2564 GEM_TRACE("%s\n", engine->name);
2565
2566 /*
2567 * Before we call engine->cancel_requests(), we should have exclusive
2568 * access to the submission state. This is arranged for us by the
2569 * caller disabling the interrupt generation, the tasklet and other
2570 * threads that may then access the same state, giving us a free hand
2571 * to reset state. However, we still need to let lockdep be aware that
2572 * we know this state may be accessed in hardirq context, so we
2573 * disable the irq around this manipulation and we want to keep
2574 * the spinlock focused on its duties and not accidentally conflate
2575 * coverage to the submission's irq state. (Similarly, although we
2576 * shouldn't need to disable irq around the manipulation of the
2577 * submission's irq state, we also wish to remind ourselves that
2578 * it is irq state.)
2579 */
2580 spin_lock_irqsave(&engine->active.lock, flags);
2581
2582 __execlists_reset(engine, true);
2583
2584 /* Mark all executing requests as skipped. */
2585 list_for_each_entry(rq, &engine->active.requests, sched.link)
2586 mark_eio(rq);
2587
2588 /* Flush the queued requests to the timeline list (for retiring). */
2589 while ((rb = rb_first_cached(&execlists->queue))) {
2590 struct i915_priolist *p = to_priolist(rb);
2591 int i;
2592
2593 priolist_for_each_request_consume(rq, rn, p, i) {
2594 mark_eio(rq);
2595 __i915_request_submit(rq);
2596 }
2597
2598 rb_erase_cached(&p->node, &execlists->queue);
2599 i915_priolist_free(p);
2600 }
2601
2602 /* Cancel all attached virtual engines */
2603 while ((rb = rb_first_cached(&execlists->virtual))) {
2604 struct virtual_engine *ve =
2605 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2606
2607 rb_erase_cached(rb, &execlists->virtual);
2608 RB_CLEAR_NODE(rb);
2609
2610 spin_lock(&ve->base.active.lock);
2611 rq = fetch_and_zero(&ve->request);
2612 if (rq) {
2613 mark_eio(rq);
2614
2615 rq->engine = engine;
2616 __i915_request_submit(rq);
2617 i915_request_put(rq);
2618
2619 ve->base.execlists.queue_priority_hint = INT_MIN;
2620 }
2621 spin_unlock(&ve->base.active.lock);
2622 }
2623
2624 /* Remaining _unready_ requests will be nop'ed when submitted */
2625
2626 execlists->queue_priority_hint = INT_MIN;
2627 execlists->queue = RB_ROOT_CACHED;
2628
2629 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
2630 execlists->tasklet.func = nop_submission_tasklet;
2631
2632 spin_unlock_irqrestore(&engine->active.lock, flags);
2633}
2634
2635static void execlists_reset_finish(struct intel_engine_cs *engine)
2636{
2637 struct intel_engine_execlists * const execlists = &engine->execlists;
2638
2639 /*
2640 * After a GPU reset, we may have requests to replay. Do so now while
2641 * we still have the forcewake to be sure that the GPU is not allowed
2642 * to sleep before we restart and reload a context.
2643 */
2644 GEM_BUG_ON(!reset_in_progress(execlists));
2645 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
2646 execlists->tasklet.func(execlists->tasklet.data);
2647
2648 if (__tasklet_enable(&execlists->tasklet))
2649 /* And kick in case we missed a new request submission. */
2650 tasklet_hi_schedule(&execlists->tasklet);
2651 GEM_TRACE("%s: depth->%d\n", engine->name,
2652 atomic_read(&execlists->tasklet.count));
2653}
2654
2655static int gen8_emit_bb_start(struct i915_request *rq,
2656 u64 offset, u32 len,
2657 const unsigned int flags)
2658{
2659 u32 *cs;
2660
2661 cs = intel_ring_begin(rq, 4);
2662 if (IS_ERR(cs))
2663 return PTR_ERR(cs);
2664
2665 /*
2666 * WaDisableCtxRestoreArbitration:bdw,chv
2667 *
2668 * We don't need to perform MI_ARB_ENABLE as often as we do (in
2669 * particular all the gen that do not need the w/a at all!), if we
2670 * took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) arbitration was enabled,
2672 * we would be fine. However, for gen8 there is another w/a that
2673 * requires us to not preempt inside GPGPU execution, so we keep
2674 * arbitration disabled for gen8 batches. Arbitration will be
2675 * re-enabled before we close the request
2676 * (engine->emit_fini_breadcrumb).
2677 */
2678 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2679
2680 /* FIXME(BDW+): Address space and security selectors. */
2681 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2682 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2683 *cs++ = lower_32_bits(offset);
2684 *cs++ = upper_32_bits(offset);
2685
2686 intel_ring_advance(rq, cs);
2687
2688 return 0;
2689}
2690
2691static int gen9_emit_bb_start(struct i915_request *rq,
2692 u64 offset, u32 len,
2693 const unsigned int flags)
2694{
2695 u32 *cs;
2696
2697 cs = intel_ring_begin(rq, 6);
2698 if (IS_ERR(cs))
2699 return PTR_ERR(cs);
2700
2701 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2702
2703 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2704 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
2705 *cs++ = lower_32_bits(offset);
2706 *cs++ = upper_32_bits(offset);
2707
2708 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2709 *cs++ = MI_NOOP;
2710
2711 intel_ring_advance(rq, cs);
2712
2713 return 0;
2714}
2715
2716static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
2717{
2718 ENGINE_WRITE(engine, RING_IMR,
2719 ~(engine->irq_enable_mask | engine->irq_keep_mask));
2720 ENGINE_POSTING_READ(engine, RING_IMR);
2721}
2722
2723static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
2724{
2725 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
2726}
2727
2728static int gen8_emit_flush(struct i915_request *request, u32 mode)
2729{
2730 u32 cmd, *cs;
2731
2732 cs = intel_ring_begin(request, 4);
2733 if (IS_ERR(cs))
2734 return PTR_ERR(cs);
2735
2736 cmd = MI_FLUSH_DW + 1;
2737
2738 /* We always require a command barrier so that subsequent
2739 * commands, such as breadcrumb interrupts, are strictly ordered
2740 * wrt the contents of the write cache being flushed to memory
2741 * (and thus being coherent from the CPU).
2742 */
2743 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2744
2745 if (mode & EMIT_INVALIDATE) {
2746 cmd |= MI_INVALIDATE_TLB;
2747 if (request->engine->class == VIDEO_DECODE_CLASS)
2748 cmd |= MI_INVALIDATE_BSD;
2749 }
2750
2751 *cs++ = cmd;
2752 *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2753 *cs++ = 0; /* upper addr */
2754 *cs++ = 0; /* value */
2755 intel_ring_advance(request, cs);
2756
2757 return 0;
2758}
2759
2760static int gen8_emit_flush_render(struct i915_request *request,
2761 u32 mode)
2762{
2763 struct intel_engine_cs *engine = request->engine;
2764 u32 scratch_addr =
2765 intel_gt_scratch_offset(engine->gt,
2766 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
2767 bool vf_flush_wa = false, dc_flush_wa = false;
2768 u32 *cs, flags = 0;
2769 int len;
2770
2771 flags |= PIPE_CONTROL_CS_STALL;
2772
2773 if (mode & EMIT_FLUSH) {
2774 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2775 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2776 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2777 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2778 }
2779
2780 if (mode & EMIT_INVALIDATE) {
2781 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2782 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2783 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2784 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2785 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2786 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2787 flags |= PIPE_CONTROL_QW_WRITE;
2788 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2789
2790 /*
2791 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
2792 * pipe control.
2793 */
2794 if (IS_GEN(request->i915, 9))
2795 vf_flush_wa = true;
2796
2797 /* WaForGAMHang:kbl */
2798 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
2799 dc_flush_wa = true;
2800 }
2801
2802 len = 6;
2803
2804 if (vf_flush_wa)
2805 len += 6;
2806
2807 if (dc_flush_wa)
2808 len += 12;
2809
2810 cs = intel_ring_begin(request, len);
2811 if (IS_ERR(cs))
2812 return PTR_ERR(cs);
2813
2814 if (vf_flush_wa)
2815 cs = gen8_emit_pipe_control(cs, 0, 0);
2816
2817 if (dc_flush_wa)
2818 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
2819 0);
2820
2821 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2822
2823 if (dc_flush_wa)
2824 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
2825
2826 intel_ring_advance(request, cs);
2827
2828 return 0;
2829}
2830
2831static int gen11_emit_flush_render(struct i915_request *request,
2832 u32 mode)
2833{
2834 struct intel_engine_cs *engine = request->engine;
2835 const u32 scratch_addr =
2836 intel_gt_scratch_offset(engine->gt,
2837 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
2838
2839 if (mode & EMIT_FLUSH) {
2840 u32 *cs;
2841 u32 flags = 0;
2842
2843 flags |= PIPE_CONTROL_CS_STALL;
2844
2845 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
2846 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2847 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2848 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2849 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2850 flags |= PIPE_CONTROL_QW_WRITE;
2851 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2852
2853 cs = intel_ring_begin(request, 6);
2854 if (IS_ERR(cs))
2855 return PTR_ERR(cs);
2856
2857 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2858 intel_ring_advance(request, cs);
2859 }
2860
2861 if (mode & EMIT_INVALIDATE) {
2862 u32 *cs;
2863 u32 flags = 0;
2864
2865 flags |= PIPE_CONTROL_CS_STALL;
2866
2867 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
2868 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2869 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2870 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2871 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2872 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2873 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2874 flags |= PIPE_CONTROL_QW_WRITE;
2875 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2876
2877 cs = intel_ring_begin(request, 6);
2878 if (IS_ERR(cs))
2879 return PTR_ERR(cs);
2880
2881 cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2882 intel_ring_advance(request, cs);
2883 }
2884
2885 return 0;
2886}
2887
2888/*
2889 * Reserve space for 2 NOOPs at the end of each request to be
2890 * used as a workaround for not being allowed to do lite
2891 * restore with HEAD==TAIL (WaIdleLiteRestore).
2892 */
2893static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
2894{
2895 /* Ensure there's always at least one preemption point per-request. */
2896 *cs++ = MI_ARB_CHECK;
2897 *cs++ = MI_NOOP;
2898 request->wa_tail = intel_ring_offset(request, cs);
2899
2900 return cs;
2901}
2902
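/*
 * Poll the preemption semaphore in the HWSP: while ring_set_paused() holds a
 * non-zero value there, the CS spins at this point instead of proceeding,
 * giving preempt-to-busy a safe spot to swap in a new ELSP submission.
 */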
2903static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
2904{
2905 *cs++ = MI_SEMAPHORE_WAIT |
2906 MI_SEMAPHORE_GLOBAL_GTT |
2907 MI_SEMAPHORE_POLL |
2908 MI_SEMAPHORE_SAD_EQ_SDD;
2909 *cs++ = 0;
2910 *cs++ = intel_hws_preempt_address(request->engine);
2911 *cs++ = 0;
2912
2913 return cs;
2914}
2915
2916static __always_inline u32*
2917gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
2918 u32 *cs)
2919{
2920 *cs++ = MI_USER_INTERRUPT;
2921
2922 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2923 if (intel_engine_has_semaphores(request->engine))
2924 cs = emit_preempt_busywait(request, cs);
2925
2926 request->tail = intel_ring_offset(request, cs);
2927 assert_ring_tail_valid(request->ring, request->tail);
2928
2929 return gen8_emit_wa_tail(request, cs);
2930}
2931
2932static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
2933{
2934 cs = gen8_emit_ggtt_write(cs,
2935 request->fence.seqno,
2936 request->timeline->hwsp_offset,
2937 0);
2938
2939 return gen8_emit_fini_breadcrumb_footer(request, cs);
2940}
2941
2942static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
2943{
2944 cs = gen8_emit_ggtt_write_rcs(cs,
2945 request->fence.seqno,
2946 request->timeline->hwsp_offset,
2947 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2948 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2949 PIPE_CONTROL_DC_FLUSH_ENABLE);
2950
2951 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
2952 cs = gen8_emit_pipe_control(cs,
2953 PIPE_CONTROL_FLUSH_ENABLE |
2954 PIPE_CONTROL_CS_STALL,
2955 0);
2956
2957 return gen8_emit_fini_breadcrumb_footer(request, cs);
2958}
2959
2960static u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *request,
2961 u32 *cs)
2962{
2963 cs = gen8_emit_ggtt_write_rcs(cs,
2964 request->fence.seqno,
2965 request->timeline->hwsp_offset,
2966 PIPE_CONTROL_CS_STALL |
2967 PIPE_CONTROL_TILE_CACHE_FLUSH |
2968 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
2969 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2970 PIPE_CONTROL_DC_FLUSH_ENABLE |
2971 PIPE_CONTROL_FLUSH_ENABLE);
2972
2973 return gen8_emit_fini_breadcrumb_footer(request, cs);
2974}
2975
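/* The engine is idling; stop the timeslicing timer until it is unparked. */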
2976static void execlists_park(struct intel_engine_cs *engine)
2977{
2978 del_timer(&engine->execlists.timer);
2979}
2980
2981void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
2982{
2983 engine->submit_request = execlists_submit_request;
2984 engine->cancel_requests = execlists_cancel_requests;
2985 engine->schedule = i915_schedule;
2986 engine->execlists.tasklet.func = execlists_submission_tasklet;
2987
2988 engine->reset.prepare = execlists_reset_prepare;
2989 engine->reset.reset = execlists_reset;
2990 engine->reset.finish = execlists_reset_finish;
2991
2992 engine->park = execlists_park;
2993 engine->unpark = NULL;
2994
2995 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
2996 if (!intel_vgpu_active(engine->i915)) {
2997 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
2998 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
2999 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
3000 }
3001}
3002
3003static void execlists_destroy(struct intel_engine_cs *engine)
3004{
3005 intel_engine_cleanup_common(engine);
3006 lrc_destroy_wa_ctx(engine);
3007 kfree(engine);
3008}
3009
3010static void
3011logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3012{
3013 /* Default vfuncs which can be overriden by each engine. */
3014
3015 engine->destroy = execlists_destroy;
3016 engine->resume = execlists_resume;
3017
3018 engine->reset.prepare = execlists_reset_prepare;
3019 engine->reset.reset = execlists_reset;
3020 engine->reset.finish = execlists_reset_finish;
3021
3022 engine->cops = &execlists_context_ops;
3023 engine->request_alloc = execlists_request_alloc;
3024
3025 engine->emit_flush = gen8_emit_flush;
3026 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3027 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
3028
3029 engine->set_default_submission = intel_execlists_set_default_submission;
3030
3031 if (INTEL_GEN(engine->i915) < 11) {
3032 engine->irq_enable = gen8_logical_ring_enable_irq;
3033 engine->irq_disable = gen8_logical_ring_disable_irq;
3034 } else {
3035 /*
		 * TODO: On Gen11 interrupt masks need to be clear
		 * to allow C6 entry. Keep interrupts enabled and
		 * take the hit of generating extra interrupts
		 * until a more refined solution exists.
3040 */
3041 }
3042 if (IS_GEN(engine->i915, 8))
3043 engine->emit_bb_start = gen8_emit_bb_start;
3044 else
3045 engine->emit_bb_start = gen9_emit_bb_start;
3046}
3047
3048static inline void
3049logical_ring_default_irqs(struct intel_engine_cs *engine)
3050{
3051 unsigned int shift = 0;
3052
3053 if (INTEL_GEN(engine->i915) < 11) {
3054 const u8 irq_shifts[] = {
3055 [RCS0] = GEN8_RCS_IRQ_SHIFT,
3056 [BCS0] = GEN8_BCS_IRQ_SHIFT,
3057 [VCS0] = GEN8_VCS0_IRQ_SHIFT,
3058 [VCS1] = GEN8_VCS1_IRQ_SHIFT,
3059 [VECS0] = GEN8_VECS_IRQ_SHIFT,
3060 };
3061
3062 shift = irq_shifts[engine->id];
3063 }
3064
3065 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3066 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3067}
3068
3069static void rcs_submission_override(struct intel_engine_cs *engine)
3070{
3071 switch (INTEL_GEN(engine->i915)) {
3072 case 12:
3073 case 11:
3074 engine->emit_flush = gen11_emit_flush_render;
3075 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3076 break;
3077 default:
3078 engine->emit_flush = gen8_emit_flush_render;
3079 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3080 break;
3081 }
3082}
3083
3084int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3085{
3086 tasklet_init(&engine->execlists.tasklet,
3087 execlists_submission_tasklet, (unsigned long)engine);
3088 timer_setup(&engine->execlists.timer, execlists_submission_timer, 0);
3089
3090 logical_ring_default_vfuncs(engine);
3091 logical_ring_default_irqs(engine);
3092
3093 if (engine->class == RENDER_CLASS)
3094 rcs_submission_override(engine);
3095
3096 return 0;
3097}
3098
3099int intel_execlists_submission_init(struct intel_engine_cs *engine)
3100{
3101 struct intel_engine_execlists * const execlists = &engine->execlists;
3102 struct drm_i915_private *i915 = engine->i915;
3103 struct intel_uncore *uncore = engine->uncore;
3104 u32 base = engine->mmio_base;
3105 int ret;
3106
3107 ret = intel_engine_init_common(engine);
3108 if (ret)
3109 return ret;
3110
3111 if (intel_init_workaround_bb(engine))
3112 /*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches and nothing
		 * critical enough to prevent us from using the GPU.
3116 */
3117 DRM_ERROR("WA batch buffer initialization failed\n");
3118
3119 if (HAS_LOGICAL_RING_ELSQ(i915)) {
3120 execlists->submit_reg = uncore->regs +
3121 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3122 execlists->ctrl_reg = uncore->regs +
3123 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3124 } else {
3125 execlists->submit_reg = uncore->regs +
3126 i915_mmio_reg_offset(RING_ELSP(base));
3127 }
3128
3129 execlists->csb_status =
3130 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3131
3132 execlists->csb_write =
3133 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
3134
3135 if (INTEL_GEN(i915) < 11)
3136 execlists->csb_size = GEN8_CSB_ENTRIES;
3137 else
3138 execlists->csb_size = GEN11_CSB_ENTRIES;
3139
3140 reset_csb_pointers(engine);
3141
3142 return 0;
3143}
3144
3145static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
3146{
3147 u32 indirect_ctx_offset;
3148
3149 switch (INTEL_GEN(engine->i915)) {
3150 default:
3151 MISSING_CASE(INTEL_GEN(engine->i915));
3152 /* fall through */
3153 case 12:
3154 indirect_ctx_offset =
3155 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3156 break;
3157 case 11:
3158 indirect_ctx_offset =
3159 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3160 break;
3161 case 10:
3162 indirect_ctx_offset =
3163 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3164 break;
3165 case 9:
3166 indirect_ctx_offset =
3167 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3168 break;
3169 case 8:
3170 indirect_ctx_offset =
3171 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
3172 break;
3173 }
3174
3175 return indirect_ctx_offset;
3176}
3177
3178static void execlists_init_reg_state(u32 *regs,
3179 struct intel_context *ce,
3180 struct intel_engine_cs *engine,
3181 struct intel_ring *ring)
3182{
3183 struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(ce->vm);
3184 bool rcs = engine->class == RENDER_CLASS;
3185 u32 base = engine->mmio_base;
3186
3187 /*
3188 * A context is actually a big batch buffer with several
3189 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
3190 * values we are setting here are only for the first context restore:
3191 * on a subsequent save, the GPU will recreate this batchbuffer with new
3192 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
3193 * we are not initializing here).
3194 *
3195 * Must keep consistent with virtual_update_register_offsets().
3196 */
3197 regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
3198 MI_LRI_FORCE_POSTED;
3199
3200 CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
3201 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
3202 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
3203 if (INTEL_GEN(engine->i915) < 11) {
3204 regs[CTX_CONTEXT_CONTROL + 1] |=
3205 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
3206 CTX_CTRL_RS_CTX_ENABLE);
3207 }
3208 CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
3209 CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
3210 CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
3211 CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
3212 RING_CTL_SIZE(ring->size) | RING_VALID);
3213 CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
3214 CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
3215 CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
3216 CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
3217 CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
3218 CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
3219 if (rcs) {
3220 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3221
3222 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
3223 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
3224 RING_INDIRECT_CTX_OFFSET(base), 0);
3225 if (wa_ctx->indirect_ctx.size) {
3226 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3227
3228 regs[CTX_RCS_INDIRECT_CTX + 1] =
3229 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
3230 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
3231
3232 regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
3233 intel_lr_indirect_ctx_offset(engine) << 6;
3234 }
3235
3236 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
3237 if (wa_ctx->per_ctx.size) {
3238 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
3239
3240 regs[CTX_BB_PER_CTX_PTR + 1] =
3241 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
3242 }
3243 }
3244
3245 regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
3246
3247 CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
	/* PDP values will be assigned later if needed */
3249 CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(base, 3), 0);
3250 CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(base, 3), 0);
3251 CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(base, 2), 0);
3252 CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(base, 2), 0);
3253 CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(base, 1), 0);
3254 CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(base, 1), 0);
3255 CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(base, 0), 0);
3256 CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(base, 0), 0);
3257
3258 if (i915_vm_is_4lvl(&ppgtt->vm)) {
3259 /* 64b PPGTT (48bit canonical)
3260 * PDP0_DESCRIPTOR contains the base address to PML4 and
3261 * other PDP Descriptors are ignored.
3262 */
3263 ASSIGN_CTX_PML4(ppgtt, regs);
3264 } else {
3265 ASSIGN_CTX_PDP(ppgtt, regs, 3);
3266 ASSIGN_CTX_PDP(ppgtt, regs, 2);
3267 ASSIGN_CTX_PDP(ppgtt, regs, 1);
3268 ASSIGN_CTX_PDP(ppgtt, regs, 0);
3269 }
3270
3271 if (rcs) {
3272 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
3273 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
3274 }
3275
3276 regs[CTX_END] = MI_BATCH_BUFFER_END;
3277 if (INTEL_GEN(engine->i915) >= 10)
3278 regs[CTX_END] |= BIT(0);
3279}
3280
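/*
 * Map the freshly allocated context object, seed it from the engine's default
 * (golden) context image when one exists, and then write the per-context
 * register state required for the first restore.
 */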
3281static int
3282populate_lr_context(struct intel_context *ce,
3283 struct drm_i915_gem_object *ctx_obj,
3284 struct intel_engine_cs *engine,
3285 struct intel_ring *ring)
3286{
3287 void *vaddr;
3288 u32 *regs;
3289 int ret;
3290
3291 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
3292 if (IS_ERR(vaddr)) {
3293 ret = PTR_ERR(vaddr);
3294 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
3295 return ret;
3296 }
3297
3298 set_redzone(vaddr, engine);
3299
3300 if (engine->default_state) {
3301 /*
3302 * We only want to copy over the template context state;
3303 * skipping over the headers reserved for GuC communication,
3304 * leaving those as zero.
3305 */
3306 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
3307 void *defaults;
3308
3309 defaults = i915_gem_object_pin_map(engine->default_state,
3310 I915_MAP_WB);
3311 if (IS_ERR(defaults)) {
3312 ret = PTR_ERR(defaults);
3313 goto err_unpin_ctx;
3314 }
3315
3316 memcpy(vaddr + start, defaults + start, engine->context_size);
3317 i915_gem_object_unpin_map(engine->default_state);
3318 }
3319
	/*
	 * The second page of the context object contains some fields which
	 * must be set up prior to the first execution.
	 */
3322 regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
3323 execlists_init_reg_state(regs, ce, engine, ring);
3324 if (!engine->default_state)
3325 regs[CTX_CONTEXT_CONTROL + 1] |=
3326 _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
3327
3328 ret = 0;
3329err_unpin_ctx:
3330 __i915_gem_object_flush_map(ctx_obj,
3331 LRC_HEADER_PAGES * PAGE_SIZE,
3332 engine->context_size);
3333 i915_gem_object_unpin_map(ctx_obj);
3334 return ret;
3335}
3336
3337static int __execlists_context_alloc(struct intel_context *ce,
3338 struct intel_engine_cs *engine)
3339{
3340 struct drm_i915_gem_object *ctx_obj;
3341 struct intel_ring *ring;
3342 struct i915_vma *vma;
3343 u32 context_size;
3344 int ret;
3345
3346 GEM_BUG_ON(ce->state);
3347 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
3348
3349 /*
3350 * Before the actual start of the context image, we insert a few pages
3351 * for our own use and for sharing with the GuC.
3352 */
3353 context_size += LRC_HEADER_PAGES * PAGE_SIZE;
3354 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3355 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
3356
3357 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
3358 if (IS_ERR(ctx_obj))
3359 return PTR_ERR(ctx_obj);
3360
3361 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
3362 if (IS_ERR(vma)) {
3363 ret = PTR_ERR(vma);
3364 goto error_deref_obj;
3365 }
3366
3367 if (!ce->timeline) {
3368 struct intel_timeline *tl;
3369
3370 tl = intel_timeline_create(engine->gt, NULL);
3371 if (IS_ERR(tl)) {
3372 ret = PTR_ERR(tl);
3373 goto error_deref_obj;
3374 }
3375
3376 ce->timeline = tl;
3377 }
3378
3379 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
3380 if (IS_ERR(ring)) {
3381 ret = PTR_ERR(ring);
3382 goto error_deref_obj;
3383 }
3384
3385 ret = populate_lr_context(ce, ctx_obj, engine, ring);
3386 if (ret) {
3387 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
3388 goto error_ring_free;
3389 }
3390
3391 ce->ring = ring;
3392 ce->state = vma;
3393
3394 return 0;
3395
3396error_ring_free:
3397 intel_ring_put(ring);
3398error_deref_obj:
3399 i915_gem_object_put(ctx_obj);
3400 return ret;
3401}
3402
3403static struct list_head *virtual_queue(struct virtual_engine *ve)
3404{
3405 return &ve->base.execlists.default_priolist.requests[0];
3406}
3407
3408static void virtual_context_destroy(struct kref *kref)
3409{
3410 struct virtual_engine *ve =
3411 container_of(kref, typeof(*ve), context.ref);
3412 unsigned int n;
3413
3414 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3415 GEM_BUG_ON(ve->request);
3416 GEM_BUG_ON(ve->context.inflight);
3417
3418 for (n = 0; n < ve->num_siblings; n++) {
3419 struct intel_engine_cs *sibling = ve->siblings[n];
3420 struct rb_node *node = &ve->nodes[sibling->id].rb;
3421
3422 if (RB_EMPTY_NODE(node))
3423 continue;
3424
3425 spin_lock_irq(&sibling->active.lock);
3426
3427 /* Detachment is lazily performed in the execlists tasklet */
3428 if (!RB_EMPTY_NODE(node))
3429 rb_erase_cached(node, &sibling->execlists.virtual);
3430
3431 spin_unlock_irq(&sibling->active.lock);
3432 }
3433 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
3434
3435 if (ve->context.state)
3436 __execlists_context_fini(&ve->context);
3437 intel_context_fini(&ve->context);
3438
3439 kfree(ve->bonds);
3440 kfree(ve);
3441}
3442
3443static void virtual_engine_initial_hint(struct virtual_engine *ve)
3444{
3445 int swp;
3446
3447 /*
3448 * Pick a random sibling on starting to help spread the load around.
3449 *
3450 * New contexts are typically created with exactly the same order
3451 * of siblings, and often started in batches. Due to the way we iterate
	 * the array of siblings when submitting requests, sibling[0] is
3453 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
3454 * randomised across the system, we also help spread the load by the
3455 * first engine we inspect being different each time.
3456 *
3457 * NB This does not force us to execute on this engine, it will just
3458 * typically be the first we inspect for submission.
3459 */
3460 swp = prandom_u32_max(ve->num_siblings);
3461 if (!swp)
3462 return;
3463
3464 swap(ve->siblings[swp], ve->siblings[0]);
3465 virtual_update_register_offsets(ve->context.lrc_reg_state,
3466 ve->siblings[0]);
3467}
3468
3469static int virtual_context_pin(struct intel_context *ce)
3470{
3471 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3472 int err;
3473
3474 /* Note: we must use a real engine class for setting up reg state */
3475 err = __execlists_context_pin(ce, ve->siblings[0]);
3476 if (err)
3477 return err;
3478
3479 virtual_engine_initial_hint(ve);
3480 return 0;
3481}
3482
3483static void virtual_context_enter(struct intel_context *ce)
3484{
3485 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3486 unsigned int n;
3487
3488 for (n = 0; n < ve->num_siblings; n++)
3489 intel_engine_pm_get(ve->siblings[n]);
3490
3491 intel_timeline_enter(ce->timeline);
3492}
3493
3494static void virtual_context_exit(struct intel_context *ce)
3495{
3496 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
3497 unsigned int n;
3498
3499 intel_timeline_exit(ce->timeline);
3500
3501 for (n = 0; n < ve->num_siblings; n++)
3502 intel_engine_pm_put(ve->siblings[n]);
3503}
3504
3505static const struct intel_context_ops virtual_context_ops = {
3506 .pin = virtual_context_pin,
3507 .unpin = execlists_context_unpin,
3508
3509 .enter = virtual_context_enter,
3510 .exit = virtual_context_exit,
3511
3512 .destroy = virtual_context_destroy,
3513};
3514
3515static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
3516{
3517 struct i915_request *rq;
3518 intel_engine_mask_t mask;
3519
3520 rq = READ_ONCE(ve->request);
3521 if (!rq)
3522 return 0;
3523
3524 /* The rq is ready for submission; rq->execution_mask is now stable. */
3525 mask = rq->execution_mask;
3526 if (unlikely(!mask)) {
3527 /* Invalid selection, submit to a random engine in error */
3528 i915_request_skip(rq, -ENODEV);
3529 mask = ve->siblings[0]->mask;
3530 }
3531
3532 GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n",
3533 ve->base.name,
3534 rq->fence.context, rq->fence.seqno,
3535 mask, ve->base.execlists.queue_priority_hint);
3536
3537 return mask;
3538}
3539
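/*
 * Overview of the scheme implemented below: the virtual engine holds a single
 * pending request (ve->request) and, for each physical sibling allowed by the
 * request's execution_mask, links a ve_node into that sibling's
 * execlists.virtual rb-tree, ordered by priority. A sibling that becomes the
 * highest-priority candidate has its execlists tasklet kicked; whichever
 * sibling dequeues the request first executes it, and the stale nodes on the
 * other siblings are detached lazily.
 */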
3540static void virtual_submission_tasklet(unsigned long data)
3541{
3542 struct virtual_engine * const ve = (struct virtual_engine *)data;
3543 const int prio = ve->base.execlists.queue_priority_hint;
3544 intel_engine_mask_t mask;
3545 unsigned int n;
3546
3547 rcu_read_lock();
3548 mask = virtual_submission_mask(ve);
3549 rcu_read_unlock();
3550 if (unlikely(!mask))
3551 return;
3552
3553 local_irq_disable();
3554 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
3555 struct intel_engine_cs *sibling = ve->siblings[n];
3556 struct ve_node * const node = &ve->nodes[sibling->id];
3557 struct rb_node **parent, *rb;
3558 bool first;
3559
3560 if (unlikely(!(mask & sibling->mask))) {
3561 if (!RB_EMPTY_NODE(&node->rb)) {
3562 spin_lock(&sibling->active.lock);
3563 rb_erase_cached(&node->rb,
3564 &sibling->execlists.virtual);
3565 RB_CLEAR_NODE(&node->rb);
3566 spin_unlock(&sibling->active.lock);
3567 }
3568 continue;
3569 }
3570
3571 spin_lock(&sibling->active.lock);
3572
3573 if (!RB_EMPTY_NODE(&node->rb)) {
3574 /*
3575 * Cheat and avoid rebalancing the tree if we can
3576 * reuse this node in situ.
3577 */
3578 first = rb_first_cached(&sibling->execlists.virtual) ==
3579 &node->rb;
3580 if (prio == node->prio || (prio > node->prio && first))
3581 goto submit_engine;
3582
3583 rb_erase_cached(&node->rb, &sibling->execlists.virtual);
3584 }
3585
3586 rb = NULL;
3587 first = true;
3588 parent = &sibling->execlists.virtual.rb_root.rb_node;
3589 while (*parent) {
3590 struct ve_node *other;
3591
3592 rb = *parent;
3593 other = rb_entry(rb, typeof(*other), rb);
3594 if (prio > other->prio) {
3595 parent = &rb->rb_left;
3596 } else {
3597 parent = &rb->rb_right;
3598 first = false;
3599 }
3600 }
3601
3602 rb_link_node(&node->rb, rb, parent);
3603 rb_insert_color_cached(&node->rb,
3604 &sibling->execlists.virtual,
3605 first);
3606
3607submit_engine:
3608 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
3609 node->prio = prio;
3610 if (first && prio > sibling->execlists.queue_priority_hint) {
3611 sibling->execlists.queue_priority_hint = prio;
3612 tasklet_hi_schedule(&sibling->execlists.tasklet);
3613 }
3614
3615 spin_unlock(&sibling->active.lock);
3616 }
3617 local_irq_enable();
3618}
3619
3620static void virtual_submit_request(struct i915_request *rq)
3621{
3622 struct virtual_engine *ve = to_virtual_engine(rq->engine);
3623 struct i915_request *old;
3624 unsigned long flags;
3625
3626 GEM_TRACE("%s: rq=%llx:%lld\n",
3627 ve->base.name,
3628 rq->fence.context,
3629 rq->fence.seqno);
3630
3631 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
3632
3633 spin_lock_irqsave(&ve->base.active.lock, flags);
3634
3635 old = ve->request;
3636 if (old) { /* background completion event from preempt-to-busy */
3637 GEM_BUG_ON(!i915_request_completed(old));
3638 __i915_request_submit(old);
3639 i915_request_put(old);
3640 }
3641
3642 if (i915_request_completed(rq)) {
3643 __i915_request_submit(rq);
3644
3645 ve->base.execlists.queue_priority_hint = INT_MIN;
3646 ve->request = NULL;
3647 } else {
3648 ve->base.execlists.queue_priority_hint = rq_prio(rq);
3649 ve->request = i915_request_get(rq);
3650
3651 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3652 list_move_tail(&rq->sched.link, virtual_queue(ve));
3653
3654 tasklet_schedule(&ve->base.execlists.tasklet);
3655 }
3656
3657 spin_unlock_irqrestore(&ve->base.active.lock, flags);
3658}
3659
3660static struct ve_bond *
3661virtual_find_bond(struct virtual_engine *ve,
3662 const struct intel_engine_cs *master)
3663{
3664 int i;
3665
3666 for (i = 0; i < ve->num_bonds; i++) {
3667 if (ve->bonds[i].master == master)
3668 return &ve->bonds[i];
3669 }
3670
3671 return NULL;
3672}
3673
3674static void
3675virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
3676{
3677 struct virtual_engine *ve = to_virtual_engine(rq->engine);
3678 intel_engine_mask_t allowed, exec;
3679 struct ve_bond *bond;
3680
3681 allowed = ~to_request(signal)->engine->mask;
3682
3683 bond = virtual_find_bond(ve, to_request(signal)->engine);
3684 if (bond)
3685 allowed &= bond->sibling_mask;
3686
3687 /* Restrict the bonded request to run on only the available engines */
3688 exec = READ_ONCE(rq->execution_mask);
3689 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
3690 ;
3691
3692 /* Prevent the master from being re-run on the bonded engines */
3693 to_request(signal)->execution_mask &= ~allowed;
3694}
3695
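/*
 * A minimal usage sketch (hypothetical caller and engine pointers, shown for
 * illustration only), creating a virtual engine over two siblings and bonding
 * one of them to a master engine:
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(ctx, siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 *	err = intel_virtual_engine_attach_bond(ce->engine, rcs0, vcs0);
 *
 * Requests submitted on ce may then be executed on either sibling.
 */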
3696struct intel_context *
3697intel_execlists_create_virtual(struct i915_gem_context *ctx,
3698 struct intel_engine_cs **siblings,
3699 unsigned int count)
3700{
3701 struct virtual_engine *ve;
3702 unsigned int n;
3703 int err;
3704
3705 if (count == 0)
3706 return ERR_PTR(-EINVAL);
3707
3708 if (count == 1)
3709 return intel_context_create(ctx, siblings[0]);
3710
3711 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
3712 if (!ve)
3713 return ERR_PTR(-ENOMEM);
3714
3715 ve->base.i915 = ctx->i915;
3716 ve->base.gt = siblings[0]->gt;
3717 ve->base.id = -1;
3718 ve->base.class = OTHER_CLASS;
3719 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
3720 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
3721
3722 /*
3723 * The decision on whether to submit a request using semaphores
3724 * depends on the saturated state of the engine. We only compute
3725	 * this during HW submission of the request, and we need this
3726 * state to be globally applied to all requests being submitted
3727 * to this engine. Virtual engines encompass more than one physical
3728 * engine and so we cannot accurately tell in advance if one of those
3729 * engines is already saturated and so cannot afford to use a semaphore
3730 * and be pessimized in priority for doing so -- if we are the only
3731 * context using semaphores after all other clients have stopped, we
3732 * will be starved on the saturated system. Such a global switch for
3733 * semaphores is less than ideal, but alas is the current compromise.
3734 */
3735 ve->base.saturated = ALL_ENGINES;
3736
3737 snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
3738
3739 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
3740
3741 intel_engine_init_execlists(&ve->base);
3742
3743 ve->base.cops = &virtual_context_ops;
3744 ve->base.request_alloc = execlists_request_alloc;
3745
3746 ve->base.schedule = i915_schedule;
3747 ve->base.submit_request = virtual_submit_request;
3748 ve->base.bond_execute = virtual_bond_execute;
3749
3750 INIT_LIST_HEAD(virtual_queue(ve));
3751 ve->base.execlists.queue_priority_hint = INT_MIN;
3752 tasklet_init(&ve->base.execlists.tasklet,
3753 virtual_submission_tasklet,
3754 (unsigned long)ve);
3755
3756 intel_context_init(&ve->context, ctx, &ve->base);
3757
3758 for (n = 0; n < count; n++) {
3759 struct intel_engine_cs *sibling = siblings[n];
3760
3761 GEM_BUG_ON(!is_power_of_2(sibling->mask));
3762 if (sibling->mask & ve->base.mask) {
3763 DRM_DEBUG("duplicate %s entry in load balancer\n",
3764 sibling->name);
3765 err = -EINVAL;
3766 goto err_put;
3767 }
3768
3769 /*
3770 * The virtual engine implementation is tightly coupled to
3771		 * the execlists backend -- we push out requests directly
3772 * into a tree inside each physical engine. We could support
3773 * layering if we handle cloning of the requests and
3774 * submitting a copy into each backend.
3775 */
3776 if (sibling->execlists.tasklet.func !=
3777 execlists_submission_tasklet) {
3778 err = -ENODEV;
3779 goto err_put;
3780 }
3781
3782 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
3783 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
3784
3785 ve->siblings[ve->num_siblings++] = sibling;
3786 ve->base.mask |= sibling->mask;
3787
3788 /*
3789 * All physical engines must be compatible for their emission
3790 * functions (as we build the instructions during request
3791 * construction and do not alter them before submission
3792 * on the physical engine). We use the engine class as a guide
3793 * here, although that could be refined.
3794 */
3795 if (ve->base.class != OTHER_CLASS) {
3796 if (ve->base.class != sibling->class) {
3797 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
3798 sibling->class, ve->base.class);
3799 err = -EINVAL;
3800 goto err_put;
3801 }
3802 continue;
3803 }
3804
3805 ve->base.class = sibling->class;
3806 ve->base.uabi_class = sibling->uabi_class;
3807 snprintf(ve->base.name, sizeof(ve->base.name),
3808 "v%dx%d", ve->base.class, count);
3809 ve->base.context_size = sibling->context_size;
3810
3811 ve->base.emit_bb_start = sibling->emit_bb_start;
3812 ve->base.emit_flush = sibling->emit_flush;
3813 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
3814 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
3815 ve->base.emit_fini_breadcrumb_dw =
3816 sibling->emit_fini_breadcrumb_dw;
3817
3818 ve->base.flags = sibling->flags;
3819 }
3820
3821 ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
3822
3823 err = __execlists_context_alloc(&ve->context, siblings[0]);
3824 if (err)
3825 goto err_put;
3826
3827 __set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);
3828
3829 return &ve->context;
3830
3831err_put:
3832 intel_context_put(&ve->context);
3833 return ERR_PTR(err);
3834}
3835
3836struct intel_context *
3837intel_execlists_clone_virtual(struct i915_gem_context *ctx,
3838 struct intel_engine_cs *src)
3839{
3840 struct virtual_engine *se = to_virtual_engine(src);
3841 struct intel_context *dst;
3842
3843 dst = intel_execlists_create_virtual(ctx,
3844 se->siblings,
3845 se->num_siblings);
3846 if (IS_ERR(dst))
3847 return dst;
3848
3849 if (se->num_bonds) {
3850 struct virtual_engine *de = to_virtual_engine(dst->engine);
3851
3852 de->bonds = kmemdup(se->bonds,
3853 sizeof(*se->bonds) * se->num_bonds,
3854 GFP_KERNEL);
3855 if (!de->bonds) {
3856 intel_context_put(dst);
3857 return ERR_PTR(-ENOMEM);
3858 }
3859
3860 de->num_bonds = se->num_bonds;
3861 }
3862
3863 return dst;
3864}
3865
3866int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
3867 const struct intel_engine_cs *master,
3868 const struct intel_engine_cs *sibling)
3869{
3870 struct virtual_engine *ve = to_virtual_engine(engine);
3871 struct ve_bond *bond;
3872 int n;
3873
3874 /* Sanity check the sibling is part of the virtual engine */
3875 for (n = 0; n < ve->num_siblings; n++)
3876 if (sibling == ve->siblings[n])
3877 break;
3878 if (n == ve->num_siblings)
3879 return -EINVAL;
3880
3881 bond = virtual_find_bond(ve, master);
3882 if (bond) {
3883 bond->sibling_mask |= sibling->mask;
3884 return 0;
3885 }
3886
3887 bond = krealloc(ve->bonds,
3888 sizeof(*bond) * (ve->num_bonds + 1),
3889 GFP_KERNEL);
3890 if (!bond)
3891 return -ENOMEM;
3892
3893 bond[ve->num_bonds].master = master;
3894 bond[ve->num_bonds].sibling_mask = sibling->mask;
3895
3896 ve->bonds = bond;
3897 ve->num_bonds++;
3898
3899 return 0;
3900}
3901
3902void intel_execlists_show_requests(struct intel_engine_cs *engine,
3903 struct drm_printer *m,
3904 void (*show_request)(struct drm_printer *m,
3905 struct i915_request *rq,
3906 const char *prefix),
3907 unsigned int max)
3908{
3909 const struct intel_engine_execlists *execlists = &engine->execlists;
3910 struct i915_request *rq, *last;
3911 unsigned long flags;
3912 unsigned int count;
3913 struct rb_node *rb;
3914
3915 spin_lock_irqsave(&engine->active.lock, flags);
3916
3917 last = NULL;
3918 count = 0;
3919 list_for_each_entry(rq, &engine->active.requests, sched.link) {
3920 if (count++ < max - 1)
3921 show_request(m, rq, "\t\tE ");
3922 else
3923 last = rq;
3924 }
3925 if (last) {
3926 if (count > max) {
3927 drm_printf(m,
3928 "\t\t...skipping %d executing requests...\n",
3929 count - max);
3930 }
3931 show_request(m, last, "\t\tE ");
3932 }
3933
3934 last = NULL;
3935 count = 0;
3936 if (execlists->queue_priority_hint != INT_MIN)
3937 drm_printf(m, "\t\tQueue priority hint: %d\n",
3938 execlists->queue_priority_hint);
3939 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
3940 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
3941 int i;
3942
3943 priolist_for_each_request(rq, p, i) {
3944 if (count++ < max - 1)
3945 show_request(m, rq, "\t\tQ ");
3946 else
3947 last = rq;
3948 }
3949 }
3950 if (last) {
3951 if (count > max) {
3952 drm_printf(m,
3953 "\t\t...skipping %d queued requests...\n",
3954 count - max);
3955 }
3956 show_request(m, last, "\t\tQ ");
3957 }
3958
3959 last = NULL;
3960 count = 0;
3961 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
3962 struct virtual_engine *ve =
3963 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3964 struct i915_request *rq = READ_ONCE(ve->request);
3965
3966 if (rq) {
3967 if (count++ < max - 1)
3968 show_request(m, rq, "\t\tV ");
3969 else
3970 last = rq;
3971 }
3972 }
3973 if (last) {
3974 if (count > max) {
3975 drm_printf(m,
3976 "\t\t...skipping %d virtual requests...\n",
3977 count - max);
3978 }
3979 show_request(m, last, "\t\tV ");
3980 }
3981
3982 spin_unlock_irqrestore(&engine->active.lock, flags);
3983}
3984
3985void intel_lr_context_reset(struct intel_engine_cs *engine,
3986 struct intel_context *ce,
3987 u32 head,
3988 bool scrub)
3989{
3990 /*
3991 * We want a simple context + ring to execute the breadcrumb update.
3992 * We cannot rely on the context being intact across the GPU hang,
3993 * so clear it and rebuild just what we need for the breadcrumb.
3994 * All pending requests for this context will be zapped, and any
3995 * future request will be after userspace has had the opportunity
3996 * to recreate its own state.
3997 */
3998 if (scrub) {
3999 u32 *regs = ce->lrc_reg_state;
4000
4001 if (engine->pinned_default_state) {
4002 memcpy(regs, /* skip restoring the vanilla PPHWSP */
4003 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
4004 engine->context_size - PAGE_SIZE);
4005 }
4006 execlists_init_reg_state(regs, ce, engine, ce->ring);
4007 }
4008
4009 /* Rerun the request; its payload has been neutered (if guilty). */
4010 ce->ring->head = head;
4011 intel_ring_update_space(ce->ring);
4012
4013 __execlists_update_reg_state(ce, engine);
4014}
4015
4016#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4017#include "selftest_lrc.c"
4018#endif
1// SPDX-License-Identifier: MIT
2/*
3 * Copyright © 2014 Intel Corporation
4 */
5
6#include "gem/i915_gem_lmem.h"
7
8#include "gen8_engine_cs.h"
9#include "i915_drv.h"
10#include "i915_perf.h"
11#include "i915_reg.h"
12#include "intel_context.h"
13#include "intel_engine.h"
14#include "intel_engine_regs.h"
15#include "intel_gpu_commands.h"
16#include "intel_gt.h"
17#include "intel_gt_regs.h"
18#include "intel_lrc.h"
19#include "intel_lrc_reg.h"
20#include "intel_ring.h"
21#include "shmem_utils.h"
22
23/*
24 * The per-platform tables are u8-encoded in @data. Decode @data and set the
25 * addresses' offset and commands in @regs. The following encoding is used
26 * for each byte. There are 2 steps: decoding commands and decoding addresses.
27 *
28 * Commands:
29 * [7]: create NOPs - number of NOPs are set in lower bits
30 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
31 * MI_LRI_FORCE_POSTED
32 * [5:0]: Number of NOPs or registers to set values to in case of
33 * MI_LOAD_REGISTER_IMM
34 *
35 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
36 * number of registers. They are set by using the REG/REG16 macros: the former
37 * is used for offsets smaller than 0x200 while the latter is for values bigger
38 * than that. Those macros already set all the bits documented below correctly:
39 *
40 * [7]: When a register offset needs more than 6 bits, additional bytes
41 *	follow for the lower bits
42 * [6:0]: Register offset, without considering the engine base.
43 *
44 * This function only tweaks the commands and register offsets. Values are not
45 * filled out.
46 */
47static void set_offsets(u32 *regs,
48 const u8 *data,
49 const struct intel_engine_cs *engine,
50 bool close)
51#define NOP(x) (BIT(7) | (x))
52#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
53#define POSTED BIT(0)
54#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
55#define REG16(x) \
56 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
57 (((x) >> 2) & 0x7f)
58#define END 0
59{
60 const u32 base = engine->mmio_base;
61
62 while (*data) {
63 u8 count, flags;
64
65 if (*data & BIT(7)) { /* skip */
66 count = *data++ & ~BIT(7);
67 regs += count;
68 continue;
69 }
70
71 count = *data & 0x3f;
72 flags = *data >> 6;
73 data++;
74
75 *regs = MI_LOAD_REGISTER_IMM(count);
76 if (flags & POSTED)
77 *regs |= MI_LRI_FORCE_POSTED;
78 if (GRAPHICS_VER(engine->i915) >= 11)
79 *regs |= MI_LRI_LRM_CS_MMIO;
80 regs++;
81
82 GEM_BUG_ON(!count);
83 do {
84 u32 offset = 0;
85 u8 v;
86
87 do {
88 v = *data++;
89 offset <<= 7;
90 offset |= v & ~BIT(7);
91 } while (v & BIT(7));
92
93 regs[0] = base + (offset << 2);
94 regs += 2;
95 } while (--count);
96 }
97
98 if (close) {
99 /* Close the batch; used mainly by live_lrc_layout() */
100 *regs = MI_BATCH_BUFFER_END;
101 if (GRAPHICS_VER(engine->i915) >= 11)
102 *regs |= BIT(0);
103 }
104}
105
106static const u8 gen8_xcs_offsets[] = {
107 NOP(1),
108 LRI(11, 0),
109 REG16(0x244),
110 REG(0x034),
111 REG(0x030),
112 REG(0x038),
113 REG(0x03c),
114 REG(0x168),
115 REG(0x140),
116 REG(0x110),
117 REG(0x11c),
118 REG(0x114),
119 REG(0x118),
120
121 NOP(9),
122 LRI(9, 0),
123 REG16(0x3a8),
124 REG16(0x28c),
125 REG16(0x288),
126 REG16(0x284),
127 REG16(0x280),
128 REG16(0x27c),
129 REG16(0x278),
130 REG16(0x274),
131 REG16(0x270),
132
133 NOP(13),
134 LRI(2, 0),
135 REG16(0x200),
136 REG(0x028),
137
138 END
139};
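
/*
 * Worked example (illustrative only): feeding the start of gen8_xcs_offsets
 * above through set_offsets() on a pre-gen11 engine decodes as
 *
 *	0x81       NOP(1)        skip one dword
 *	0x0b       LRI(11, 0)    write MI_LOAD_REGISTER_IMM(11)
 *	0x81 0x11  REG16(0x244)  write mmio_base + 0x244 (RING_CONTEXT_CONTROL)
 *	0x0d       REG(0x034)    write mmio_base + 0x034 (RING_HEAD)
 *	...
 *
 * Each register then occupies two dwords in the image (offset + value); only
 * the offsets are written here, the values are filled in later, e.g. by
 * __lrc_init_regs() or by the HW context save.
 */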
140
141static const u8 gen9_xcs_offsets[] = {
142 NOP(1),
143 LRI(14, POSTED),
144 REG16(0x244),
145 REG(0x034),
146 REG(0x030),
147 REG(0x038),
148 REG(0x03c),
149 REG(0x168),
150 REG(0x140),
151 REG(0x110),
152 REG(0x11c),
153 REG(0x114),
154 REG(0x118),
155 REG(0x1c0),
156 REG(0x1c4),
157 REG(0x1c8),
158
159 NOP(3),
160 LRI(9, POSTED),
161 REG16(0x3a8),
162 REG16(0x28c),
163 REG16(0x288),
164 REG16(0x284),
165 REG16(0x280),
166 REG16(0x27c),
167 REG16(0x278),
168 REG16(0x274),
169 REG16(0x270),
170
171 NOP(13),
172 LRI(1, POSTED),
173 REG16(0x200),
174
175 NOP(13),
176 LRI(44, POSTED),
177 REG(0x028),
178 REG(0x09c),
179 REG(0x0c0),
180 REG(0x178),
181 REG(0x17c),
182 REG16(0x358),
183 REG(0x170),
184 REG(0x150),
185 REG(0x154),
186 REG(0x158),
187 REG16(0x41c),
188 REG16(0x600),
189 REG16(0x604),
190 REG16(0x608),
191 REG16(0x60c),
192 REG16(0x610),
193 REG16(0x614),
194 REG16(0x618),
195 REG16(0x61c),
196 REG16(0x620),
197 REG16(0x624),
198 REG16(0x628),
199 REG16(0x62c),
200 REG16(0x630),
201 REG16(0x634),
202 REG16(0x638),
203 REG16(0x63c),
204 REG16(0x640),
205 REG16(0x644),
206 REG16(0x648),
207 REG16(0x64c),
208 REG16(0x650),
209 REG16(0x654),
210 REG16(0x658),
211 REG16(0x65c),
212 REG16(0x660),
213 REG16(0x664),
214 REG16(0x668),
215 REG16(0x66c),
216 REG16(0x670),
217 REG16(0x674),
218 REG16(0x678),
219 REG16(0x67c),
220 REG(0x068),
221
222 END
223};
224
225static const u8 gen12_xcs_offsets[] = {
226 NOP(1),
227 LRI(13, POSTED),
228 REG16(0x244),
229 REG(0x034),
230 REG(0x030),
231 REG(0x038),
232 REG(0x03c),
233 REG(0x168),
234 REG(0x140),
235 REG(0x110),
236 REG(0x1c0),
237 REG(0x1c4),
238 REG(0x1c8),
239 REG(0x180),
240 REG16(0x2b4),
241
242 NOP(5),
243 LRI(9, POSTED),
244 REG16(0x3a8),
245 REG16(0x28c),
246 REG16(0x288),
247 REG16(0x284),
248 REG16(0x280),
249 REG16(0x27c),
250 REG16(0x278),
251 REG16(0x274),
252 REG16(0x270),
253
254 END
255};
256
257static const u8 dg2_xcs_offsets[] = {
258 NOP(1),
259 LRI(15, POSTED),
260 REG16(0x244),
261 REG(0x034),
262 REG(0x030),
263 REG(0x038),
264 REG(0x03c),
265 REG(0x168),
266 REG(0x140),
267 REG(0x110),
268 REG(0x1c0),
269 REG(0x1c4),
270 REG(0x1c8),
271 REG(0x180),
272 REG16(0x2b4),
273 REG(0x120),
274 REG(0x124),
275
276 NOP(1),
277 LRI(9, POSTED),
278 REG16(0x3a8),
279 REG16(0x28c),
280 REG16(0x288),
281 REG16(0x284),
282 REG16(0x280),
283 REG16(0x27c),
284 REG16(0x278),
285 REG16(0x274),
286 REG16(0x270),
287
288 END
289};
290
291static const u8 gen8_rcs_offsets[] = {
292 NOP(1),
293 LRI(14, POSTED),
294 REG16(0x244),
295 REG(0x034),
296 REG(0x030),
297 REG(0x038),
298 REG(0x03c),
299 REG(0x168),
300 REG(0x140),
301 REG(0x110),
302 REG(0x11c),
303 REG(0x114),
304 REG(0x118),
305 REG(0x1c0),
306 REG(0x1c4),
307 REG(0x1c8),
308
309 NOP(3),
310 LRI(9, POSTED),
311 REG16(0x3a8),
312 REG16(0x28c),
313 REG16(0x288),
314 REG16(0x284),
315 REG16(0x280),
316 REG16(0x27c),
317 REG16(0x278),
318 REG16(0x274),
319 REG16(0x270),
320
321 NOP(13),
322 LRI(1, 0),
323 REG(0x0c8),
324
325 END
326};
327
328static const u8 gen9_rcs_offsets[] = {
329 NOP(1),
330 LRI(14, POSTED),
331 REG16(0x244),
332 REG(0x34),
333 REG(0x30),
334 REG(0x38),
335 REG(0x3c),
336 REG(0x168),
337 REG(0x140),
338 REG(0x110),
339 REG(0x11c),
340 REG(0x114),
341 REG(0x118),
342 REG(0x1c0),
343 REG(0x1c4),
344 REG(0x1c8),
345
346 NOP(3),
347 LRI(9, POSTED),
348 REG16(0x3a8),
349 REG16(0x28c),
350 REG16(0x288),
351 REG16(0x284),
352 REG16(0x280),
353 REG16(0x27c),
354 REG16(0x278),
355 REG16(0x274),
356 REG16(0x270),
357
358 NOP(13),
359 LRI(1, 0),
360 REG(0xc8),
361
362 NOP(13),
363 LRI(44, POSTED),
364 REG(0x28),
365 REG(0x9c),
366 REG(0xc0),
367 REG(0x178),
368 REG(0x17c),
369 REG16(0x358),
370 REG(0x170),
371 REG(0x150),
372 REG(0x154),
373 REG(0x158),
374 REG16(0x41c),
375 REG16(0x600),
376 REG16(0x604),
377 REG16(0x608),
378 REG16(0x60c),
379 REG16(0x610),
380 REG16(0x614),
381 REG16(0x618),
382 REG16(0x61c),
383 REG16(0x620),
384 REG16(0x624),
385 REG16(0x628),
386 REG16(0x62c),
387 REG16(0x630),
388 REG16(0x634),
389 REG16(0x638),
390 REG16(0x63c),
391 REG16(0x640),
392 REG16(0x644),
393 REG16(0x648),
394 REG16(0x64c),
395 REG16(0x650),
396 REG16(0x654),
397 REG16(0x658),
398 REG16(0x65c),
399 REG16(0x660),
400 REG16(0x664),
401 REG16(0x668),
402 REG16(0x66c),
403 REG16(0x670),
404 REG16(0x674),
405 REG16(0x678),
406 REG16(0x67c),
407 REG(0x68),
408
409 END
410};
411
412static const u8 gen11_rcs_offsets[] = {
413 NOP(1),
414 LRI(15, POSTED),
415 REG16(0x244),
416 REG(0x034),
417 REG(0x030),
418 REG(0x038),
419 REG(0x03c),
420 REG(0x168),
421 REG(0x140),
422 REG(0x110),
423 REG(0x11c),
424 REG(0x114),
425 REG(0x118),
426 REG(0x1c0),
427 REG(0x1c4),
428 REG(0x1c8),
429 REG(0x180),
430
431 NOP(1),
432 LRI(9, POSTED),
433 REG16(0x3a8),
434 REG16(0x28c),
435 REG16(0x288),
436 REG16(0x284),
437 REG16(0x280),
438 REG16(0x27c),
439 REG16(0x278),
440 REG16(0x274),
441 REG16(0x270),
442
443 LRI(1, POSTED),
444 REG(0x1b0),
445
446 NOP(10),
447 LRI(1, 0),
448 REG(0x0c8),
449
450 END
451};
452
453static const u8 gen12_rcs_offsets[] = {
454 NOP(1),
455 LRI(13, POSTED),
456 REG16(0x244),
457 REG(0x034),
458 REG(0x030),
459 REG(0x038),
460 REG(0x03c),
461 REG(0x168),
462 REG(0x140),
463 REG(0x110),
464 REG(0x1c0),
465 REG(0x1c4),
466 REG(0x1c8),
467 REG(0x180),
468 REG16(0x2b4),
469
470 NOP(5),
471 LRI(9, POSTED),
472 REG16(0x3a8),
473 REG16(0x28c),
474 REG16(0x288),
475 REG16(0x284),
476 REG16(0x280),
477 REG16(0x27c),
478 REG16(0x278),
479 REG16(0x274),
480 REG16(0x270),
481
482 LRI(3, POSTED),
483 REG(0x1b0),
484 REG16(0x5a8),
485 REG16(0x5ac),
486
487 NOP(6),
488 LRI(1, 0),
489 REG(0x0c8),
490 NOP(3 + 9 + 1),
491
492 LRI(51, POSTED),
493 REG16(0x588),
494 REG16(0x588),
495 REG16(0x588),
496 REG16(0x588),
497 REG16(0x588),
498 REG16(0x588),
499 REG(0x028),
500 REG(0x09c),
501 REG(0x0c0),
502 REG(0x178),
503 REG(0x17c),
504 REG16(0x358),
505 REG(0x170),
506 REG(0x150),
507 REG(0x154),
508 REG(0x158),
509 REG16(0x41c),
510 REG16(0x600),
511 REG16(0x604),
512 REG16(0x608),
513 REG16(0x60c),
514 REG16(0x610),
515 REG16(0x614),
516 REG16(0x618),
517 REG16(0x61c),
518 REG16(0x620),
519 REG16(0x624),
520 REG16(0x628),
521 REG16(0x62c),
522 REG16(0x630),
523 REG16(0x634),
524 REG16(0x638),
525 REG16(0x63c),
526 REG16(0x640),
527 REG16(0x644),
528 REG16(0x648),
529 REG16(0x64c),
530 REG16(0x650),
531 REG16(0x654),
532 REG16(0x658),
533 REG16(0x65c),
534 REG16(0x660),
535 REG16(0x664),
536 REG16(0x668),
537 REG16(0x66c),
538 REG16(0x670),
539 REG16(0x674),
540 REG16(0x678),
541 REG16(0x67c),
542 REG(0x068),
543 REG(0x084),
544 NOP(1),
545
546 END
547};
548
549static const u8 xehp_rcs_offsets[] = {
550 NOP(1),
551 LRI(13, POSTED),
552 REG16(0x244),
553 REG(0x034),
554 REG(0x030),
555 REG(0x038),
556 REG(0x03c),
557 REG(0x168),
558 REG(0x140),
559 REG(0x110),
560 REG(0x1c0),
561 REG(0x1c4),
562 REG(0x1c8),
563 REG(0x180),
564 REG16(0x2b4),
565
566 NOP(5),
567 LRI(9, POSTED),
568 REG16(0x3a8),
569 REG16(0x28c),
570 REG16(0x288),
571 REG16(0x284),
572 REG16(0x280),
573 REG16(0x27c),
574 REG16(0x278),
575 REG16(0x274),
576 REG16(0x270),
577
578 LRI(3, POSTED),
579 REG(0x1b0),
580 REG16(0x5a8),
581 REG16(0x5ac),
582
583 NOP(6),
584 LRI(1, 0),
585 REG(0x0c8),
586
587 END
588};
589
590static const u8 dg2_rcs_offsets[] = {
591 NOP(1),
592 LRI(15, POSTED),
593 REG16(0x244),
594 REG(0x034),
595 REG(0x030),
596 REG(0x038),
597 REG(0x03c),
598 REG(0x168),
599 REG(0x140),
600 REG(0x110),
601 REG(0x1c0),
602 REG(0x1c4),
603 REG(0x1c8),
604 REG(0x180),
605 REG16(0x2b4),
606 REG(0x120),
607 REG(0x124),
608
609 NOP(1),
610 LRI(9, POSTED),
611 REG16(0x3a8),
612 REG16(0x28c),
613 REG16(0x288),
614 REG16(0x284),
615 REG16(0x280),
616 REG16(0x27c),
617 REG16(0x278),
618 REG16(0x274),
619 REG16(0x270),
620
621 LRI(3, POSTED),
622 REG(0x1b0),
623 REG16(0x5a8),
624 REG16(0x5ac),
625
626 NOP(6),
627 LRI(1, 0),
628 REG(0x0c8),
629
630 END
631};
632
633static const u8 mtl_rcs_offsets[] = {
634 NOP(1),
635 LRI(15, POSTED),
636 REG16(0x244),
637 REG(0x034),
638 REG(0x030),
639 REG(0x038),
640 REG(0x03c),
641 REG(0x168),
642 REG(0x140),
643 REG(0x110),
644 REG(0x1c0),
645 REG(0x1c4),
646 REG(0x1c8),
647 REG(0x180),
648 REG16(0x2b4),
649 REG(0x120),
650 REG(0x124),
651
652 NOP(1),
653 LRI(9, POSTED),
654 REG16(0x3a8),
655 REG16(0x28c),
656 REG16(0x288),
657 REG16(0x284),
658 REG16(0x280),
659 REG16(0x27c),
660 REG16(0x278),
661 REG16(0x274),
662 REG16(0x270),
663
664 NOP(2),
665 LRI(2, POSTED),
666 REG16(0x5a8),
667 REG16(0x5ac),
668
669 NOP(6),
670 LRI(1, 0),
671 REG(0x0c8),
672
673 END
674};
675
676#undef END
677#undef REG16
678#undef REG
679#undef LRI
680#undef NOP
681
682static const u8 *reg_offsets(const struct intel_engine_cs *engine)
683{
684 /*
685 * The gen12+ lists only have the registers we program in the basic
686 * default state. We rely on the context image using relative
687	 * addressing to automatically fix up the register state between the
688	 * physical engines of a virtual engine.
689 */
690 GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
691 !intel_engine_has_relative_mmio(engine));
692
693 if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
694 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
695 return mtl_rcs_offsets;
696 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
697 return dg2_rcs_offsets;
698 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
699 return xehp_rcs_offsets;
700 else if (GRAPHICS_VER(engine->i915) >= 12)
701 return gen12_rcs_offsets;
702 else if (GRAPHICS_VER(engine->i915) >= 11)
703 return gen11_rcs_offsets;
704 else if (GRAPHICS_VER(engine->i915) >= 9)
705 return gen9_rcs_offsets;
706 else
707 return gen8_rcs_offsets;
708 } else {
709 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
710 return dg2_xcs_offsets;
711 else if (GRAPHICS_VER(engine->i915) >= 12)
712 return gen12_xcs_offsets;
713 else if (GRAPHICS_VER(engine->i915) >= 9)
714 return gen9_xcs_offsets;
715 else
716 return gen8_xcs_offsets;
717 }
718}
719
720static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
721{
722 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
723 return 0x70;
724 else if (GRAPHICS_VER(engine->i915) >= 12)
725 return 0x60;
726 else if (GRAPHICS_VER(engine->i915) >= 9)
727 return 0x54;
728 else if (engine->class == RENDER_CLASS)
729 return 0x58;
730 else
731 return -1;
732}
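
/*
 * lrc_ring_mi_mode() above and the lrc_ring_*() helpers that follow return
 * the dword index, within the LRC register state, of the LRI offset slot for
 * a given register; the register's value lives at index + 1, and -1 means the
 * register is not part of this engine's context layout. Typical use,
 * mirroring __reset_stop_ring() further down:
 *
 *	x = lrc_ring_mi_mode(engine);
 *	if (x != -1) {
 *		regs[x + 1] &= ~STOP_RING;
 *		regs[x + 1] |= STOP_RING << 16;
 *	}
 */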
733
734static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
735{
736 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
737 return 0x80;
738 else if (GRAPHICS_VER(engine->i915) >= 12)
739 return 0x70;
740 else if (GRAPHICS_VER(engine->i915) >= 9)
741 return 0x64;
742 else if (GRAPHICS_VER(engine->i915) >= 8 &&
743 engine->class == RENDER_CLASS)
744 return 0xc4;
745 else
746 return -1;
747}
748
749static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
750{
751 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
752 return 0x84;
753 else if (GRAPHICS_VER(engine->i915) >= 12)
754 return 0x74;
755 else if (GRAPHICS_VER(engine->i915) >= 9)
756 return 0x68;
757 else if (engine->class == RENDER_CLASS)
758 return 0xd8;
759 else
760 return -1;
761}
762
763static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
764{
765 if (GRAPHICS_VER(engine->i915) >= 12)
766 return 0x12;
767 else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
768 return 0x18;
769 else
770 return -1;
771}
772
773static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
774{
775 int x;
776
777 x = lrc_ring_wa_bb_per_ctx(engine);
778 if (x < 0)
779 return x;
780
781 return x + 2;
782}
783
784static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
785{
786 int x;
787
788 x = lrc_ring_indirect_ptr(engine);
789 if (x < 0)
790 return x;
791
792 return x + 2;
793}
794
795static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
796{
797
798 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
799 /*
800 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
801 * simply to match the RCS context image layout.
802 */
803 return 0xc6;
804 else if (engine->class != RENDER_CLASS)
805 return -1;
806 else if (GRAPHICS_VER(engine->i915) >= 12)
807 return 0xb6;
808 else if (GRAPHICS_VER(engine->i915) >= 11)
809 return 0xaa;
810 else
811 return -1;
812}
813
814static u32
815lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
816{
817 if (GRAPHICS_VER(engine->i915) >= 12)
818 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
819 else if (GRAPHICS_VER(engine->i915) >= 11)
820 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
821 else if (GRAPHICS_VER(engine->i915) >= 9)
822 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
823 else if (GRAPHICS_VER(engine->i915) >= 8)
824 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
825
826 GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
827
828 return 0;
829}
830
831static void
832lrc_setup_indirect_ctx(u32 *regs,
833 const struct intel_engine_cs *engine,
834 u32 ctx_bb_ggtt_addr,
835 u32 size)
836{
837 GEM_BUG_ON(!size);
838 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
839 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
840 regs[lrc_ring_indirect_ptr(engine) + 1] =
841 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
842
843 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
844 regs[lrc_ring_indirect_offset(engine) + 1] =
845 lrc_ring_indirect_offset_default(engine) << 6;
846}
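
/*
 * Example of the packing above (illustrative numbers): an indirect context
 * batch of 192 bytes placed at GGTT address 0x1000 yields
 * regs[lrc_ring_indirect_ptr(engine) + 1] = 0x1000 | (192 / CACHELINE_BYTES)
 * = 0x1003, i.e. the cacheline-aligned address with the length in cachelines
 * packed into the low bits.
 */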
847
848static void init_common_regs(u32 * const regs,
849 const struct intel_context *ce,
850 const struct intel_engine_cs *engine,
851 bool inhibit)
852{
853 u32 ctl;
854 int loc;
855
856 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
857 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
858 if (inhibit)
859 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
860 if (GRAPHICS_VER(engine->i915) < 11)
861 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
862 CTX_CTRL_RS_CTX_ENABLE);
863 regs[CTX_CONTEXT_CONTROL] = ctl;
864
865 regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
866
867 loc = lrc_ring_bb_offset(engine);
868 if (loc != -1)
869 regs[loc + 1] = 0;
870}
871
872static void init_wa_bb_regs(u32 * const regs,
873 const struct intel_engine_cs *engine)
874{
875 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
876
877 if (wa_ctx->per_ctx.size) {
878 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
879
880 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
881 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
882 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
883 }
884
885 if (wa_ctx->indirect_ctx.size) {
886 lrc_setup_indirect_ctx(regs, engine,
887 i915_ggtt_offset(wa_ctx->vma) +
888 wa_ctx->indirect_ctx.offset,
889 wa_ctx->indirect_ctx.size);
890 }
891}
892
893static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
894{
895 if (i915_vm_is_4lvl(&ppgtt->vm)) {
896 /* 64b PPGTT (48bit canonical)
897 * PDP0_DESCRIPTOR contains the base address to PML4 and
898 * other PDP Descriptors are ignored.
899 */
900 ASSIGN_CTX_PML4(ppgtt, regs);
901 } else {
902 ASSIGN_CTX_PDP(ppgtt, regs, 3);
903 ASSIGN_CTX_PDP(ppgtt, regs, 2);
904 ASSIGN_CTX_PDP(ppgtt, regs, 1);
905 ASSIGN_CTX_PDP(ppgtt, regs, 0);
906 }
907}
908
909static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
910{
911 if (i915_is_ggtt(vm))
912 return i915_vm_to_ggtt(vm)->alias;
913 else
914 return i915_vm_to_ppgtt(vm);
915}
916
917static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
918{
919 int x;
920
921 x = lrc_ring_mi_mode(engine);
922 if (x != -1) {
923 regs[x + 1] &= ~STOP_RING;
924 regs[x + 1] |= STOP_RING << 16;
925 }
926}
927
928static void __lrc_init_regs(u32 *regs,
929 const struct intel_context *ce,
930 const struct intel_engine_cs *engine,
931 bool inhibit)
932{
933 /*
934 * A context is actually a big batch buffer with several
935 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
936 * values we are setting here are only for the first context restore:
937 * on a subsequent save, the GPU will recreate this batchbuffer with new
938 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
939 * we are not initializing here).
940 *
941 * Must keep consistent with virtual_update_register_offsets().
942 */
943
944 if (inhibit)
945 memset(regs, 0, PAGE_SIZE);
946
947 set_offsets(regs, reg_offsets(engine), engine, inhibit);
948
949 init_common_regs(regs, ce, engine, inhibit);
950 init_ppgtt_regs(regs, vm_alias(ce->vm));
951
952 init_wa_bb_regs(regs, engine);
953
954 __reset_stop_ring(regs, engine);
955}
956
957void lrc_init_regs(const struct intel_context *ce,
958 const struct intel_engine_cs *engine,
959 bool inhibit)
960{
961 __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
962}
963
964void lrc_reset_regs(const struct intel_context *ce,
965 const struct intel_engine_cs *engine)
966{
967 __reset_stop_ring(ce->lrc_reg_state, engine);
968}
969
970static void
971set_redzone(void *vaddr, const struct intel_engine_cs *engine)
972{
973 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
974 return;
975
976 vaddr += engine->context_size;
977
978 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
979}
980
981static void
982check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
983{
984 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
985 return;
986
987 vaddr += engine->context_size;
988
989 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
990 drm_err_once(&engine->i915->drm,
991 "%s context redzone overwritten!\n",
992 engine->name);
993}
994
995static u32 context_wa_bb_offset(const struct intel_context *ce)
996{
997 return PAGE_SIZE * ce->wa_bb_page;
998}
999
1000static u32 *context_indirect_bb(const struct intel_context *ce)
1001{
1002 void *ptr;
1003
1004 GEM_BUG_ON(!ce->wa_bb_page);
1005
1006 ptr = ce->lrc_reg_state;
1007 ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1008 ptr += context_wa_bb_offset(ce);
1009
1010 return ptr;
1011}
1012
1013void lrc_init_state(struct intel_context *ce,
1014 struct intel_engine_cs *engine,
1015 void *state)
1016{
1017 bool inhibit = true;
1018
1019 set_redzone(state, engine);
1020
1021 if (engine->default_state) {
1022 shmem_read(engine->default_state, 0,
1023 state, engine->context_size);
1024 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
1025 inhibit = false;
1026 }
1027
1028 /* Clear the ppHWSP (inc. per-context counters) */
1029 memset(state, 0, PAGE_SIZE);
1030
1031 /* Clear the indirect wa and storage */
1032 if (ce->wa_bb_page)
1033 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1034
1035 /*
1036 * The second page of the context object contains some registers which
1037 * must be set up prior to the first execution.
1038 */
1039 __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1040}
1041
1042u32 lrc_indirect_bb(const struct intel_context *ce)
1043{
1044 return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1045}
1046
1047static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1048{
1049 /* If predication is active, this will be noop'ed */
1050 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1051 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1052 *cs++ = 0;
1053 *cs++ = 0; /* No predication */
1054
1055 /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1056 *cs++ = MI_BATCH_BUFFER_END | BIT(15);
1057 *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1058
1059 /* Instructions are no longer predicated (disabled), we can proceed */
1060 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1061 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1062 *cs++ = 0;
1063 *cs++ = 1; /* enable predication before the next BB */
1064
1065 *cs++ = MI_BATCH_BUFFER_END;
1066 GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1067
1068 return cs;
1069}
1070
1071static struct i915_vma *
1072__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1073{
1074 struct drm_i915_gem_object *obj;
1075 struct i915_vma *vma;
1076 u32 context_size;
1077
1078 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1079
1080 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1081 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1082
1083 if (GRAPHICS_VER(engine->i915) >= 12) {
1084 ce->wa_bb_page = context_size / PAGE_SIZE;
1085 context_size += PAGE_SIZE;
1086 }
1087
1088 if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1089 ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1090 context_size += PARENT_SCRATCH_SIZE;
1091 }
1092
1093 obj = i915_gem_object_create_lmem(engine->i915, context_size,
1094 I915_BO_ALLOC_PM_VOLATILE);
1095 if (IS_ERR(obj))
1096 obj = i915_gem_object_create_shmem(engine->i915, context_size);
1097 if (IS_ERR(obj))
1098 return ERR_CAST(obj);
1099
1100 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1101 if (IS_ERR(vma)) {
1102 i915_gem_object_put(obj);
1103 return vma;
1104 }
1105
1106 return vma;
1107}
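
/*
 * Resulting backing-object layout, in the order the sizes are accumulated
 * above (a sketch, not authoritative): the HW context image
 * (engine->context_size, page aligned), an optional redzone page in
 * CONFIG_DRM_I915_DEBUG_GEM builds, an optional indirect wa_bb page on
 * gen12+, and an optional parent scratch area for a GuC parallel-submission
 * parent context.
 */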
1108
1109static struct intel_timeline *
1110pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1111{
1112 struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1113
1114 return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1115}
1116
1117int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1118{
1119 struct intel_ring *ring;
1120 struct i915_vma *vma;
1121 int err;
1122
1123 GEM_BUG_ON(ce->state);
1124
1125 vma = __lrc_alloc_state(ce, engine);
1126 if (IS_ERR(vma))
1127 return PTR_ERR(vma);
1128
1129 ring = intel_engine_create_ring(engine, ce->ring_size);
1130 if (IS_ERR(ring)) {
1131 err = PTR_ERR(ring);
1132 goto err_vma;
1133 }
1134
1135 if (!page_mask_bits(ce->timeline)) {
1136 struct intel_timeline *tl;
1137
1138 /*
1139 * Use the static global HWSP for the kernel context, and
1140 * a dynamically allocated cacheline for everyone else.
1141 */
1142 if (unlikely(ce->timeline))
1143 tl = pinned_timeline(ce, engine);
1144 else
1145 tl = intel_timeline_create(engine->gt);
1146 if (IS_ERR(tl)) {
1147 err = PTR_ERR(tl);
1148 goto err_ring;
1149 }
1150
1151 ce->timeline = tl;
1152 }
1153
1154 ce->ring = ring;
1155 ce->state = vma;
1156
1157 return 0;
1158
1159err_ring:
1160 intel_ring_put(ring);
1161err_vma:
1162 i915_vma_put(vma);
1163 return err;
1164}
1165
1166void lrc_reset(struct intel_context *ce)
1167{
1168 GEM_BUG_ON(!intel_context_is_pinned(ce));
1169
1170 intel_ring_reset(ce->ring, ce->ring->emit);
1171
1172 /* Scrub away the garbage */
1173 lrc_init_regs(ce, ce->engine, true);
1174 ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1175}
1176
1177int
1178lrc_pre_pin(struct intel_context *ce,
1179 struct intel_engine_cs *engine,
1180 struct i915_gem_ww_ctx *ww,
1181 void **vaddr)
1182{
1183 GEM_BUG_ON(!ce->state);
1184 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1185
1186 *vaddr = i915_gem_object_pin_map(ce->state->obj,
1187 i915_coherent_map_type(ce->engine->i915,
1188 ce->state->obj,
1189 false) |
1190 I915_MAP_OVERRIDE);
1191
1192 return PTR_ERR_OR_ZERO(*vaddr);
1193}
1194
1195int
1196lrc_pin(struct intel_context *ce,
1197 struct intel_engine_cs *engine,
1198 void *vaddr)
1199{
1200 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1201
1202 if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1203 lrc_init_state(ce, engine, vaddr);
1204
1205 ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1206 return 0;
1207}
1208
1209void lrc_unpin(struct intel_context *ce)
1210{
1211 if (unlikely(ce->parallel.last_rq)) {
1212 i915_request_put(ce->parallel.last_rq);
1213 ce->parallel.last_rq = NULL;
1214 }
1215 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1216 ce->engine);
1217}
1218
1219void lrc_post_unpin(struct intel_context *ce)
1220{
1221 i915_gem_object_unpin_map(ce->state->obj);
1222}
1223
1224void lrc_fini(struct intel_context *ce)
1225{
1226 if (!ce->state)
1227 return;
1228
1229 intel_ring_put(fetch_and_zero(&ce->ring));
1230 i915_vma_put(fetch_and_zero(&ce->state));
1231}
1232
1233void lrc_destroy(struct kref *kref)
1234{
1235 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1236
1237 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1238 GEM_BUG_ON(intel_context_is_pinned(ce));
1239
1240 lrc_fini(ce);
1241
1242 intel_context_fini(ce);
1243 intel_context_free(ce);
1244}
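
/*
 * Typical lifecycle of the helpers above, as suggested by their pairing:
 * lrc_alloc() creates the state vma, ring and timeline; lrc_pre_pin() maps
 * the state object and lrc_pin() initialises the image on first use and
 * computes ce->lrc.lrca; lrc_unpin() drops any parallel request reference and
 * checks the debug redzone, while lrc_post_unpin() drops the mapping;
 * lrc_fini()/lrc_destroy() release everything once the context is idle.
 */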
1245
1246static u32 *
1247gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1248{
1249 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1250 MI_SRM_LRM_GLOBAL_GTT |
1251 MI_LRI_LRM_CS_MMIO;
1252 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1253 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1254 CTX_TIMESTAMP * sizeof(u32);
1255 *cs++ = 0;
1256
1257 *cs++ = MI_LOAD_REGISTER_REG |
1258 MI_LRR_SOURCE_CS_MMIO |
1259 MI_LRI_LRM_CS_MMIO;
1260 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1261 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1262
1263 *cs++ = MI_LOAD_REGISTER_REG |
1264 MI_LRR_SOURCE_CS_MMIO |
1265 MI_LRI_LRM_CS_MMIO;
1266 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1267 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1268
1269 return cs;
1270}
1271
1272static u32 *
1273gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1274{
1275 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1276
1277 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1278 MI_SRM_LRM_GLOBAL_GTT |
1279 MI_LRI_LRM_CS_MMIO;
1280 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1281 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1282 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1283 *cs++ = 0;
1284
1285 return cs;
1286}
1287
1288static u32 *
1289gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1290{
1291 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1292
1293 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1294 MI_SRM_LRM_GLOBAL_GTT |
1295 MI_LRI_LRM_CS_MMIO;
1296 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1297 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1298 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1299 *cs++ = 0;
1300
1301 *cs++ = MI_LOAD_REGISTER_REG |
1302 MI_LRR_SOURCE_CS_MMIO |
1303 MI_LRI_LRM_CS_MMIO;
1304 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1305 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1306
1307 return cs;
1308}
1309
1310/*
1311 * On DG2 during context restore of a preempted context in GPGPU mode,
1312 * an RCS restore hang is detected. This is extremely timing dependent.
1313 * To address this, the SW wabb below is implemented for DG2 A steppings.
1314 */
1315static u32 *
1316dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
1317{
1318 *cs++ = MI_LOAD_REGISTER_IMM(1);
1319 *cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
1320 *cs++ = 0x21;
1321
1322 *cs++ = MI_LOAD_REGISTER_REG;
1323 *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1324 *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);
1325
1326 *cs++ = MI_LOAD_REGISTER_REG;
1327 *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1328 *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);
1329
1330 return cs;
1331}
1332
1333/*
1334 * The bspec's tuning guide asks us to program a vertical watermark value of
1335 * 0x3FF. However this register is not saved/restored properly by the
1336 * hardware, so we're required to apply the desired value via INDIRECT_CTX
1337 * batch buffer to ensure the value takes effect properly. All other bits
1338 * in this register should remain at 0 (the hardware default).
1339 */
1340static u32 *
1341dg2_emit_draw_watermark_setting(u32 *cs)
1342{
1343 *cs++ = MI_LOAD_REGISTER_IMM(1);
1344 *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1345 *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1346
1347 return cs;
1348}
1349
1350static u32 *
1351gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1352{
1353 cs = gen12_emit_timestamp_wa(ce, cs);
1354 cs = gen12_emit_cmd_buf_wa(ce, cs);
1355 cs = gen12_emit_restore_scratch(ce, cs);
1356
1357 /* Wa_22011450934:dg2 */
1358 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
1359 IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
1360 cs = dg2_emit_rcs_hang_wabb(ce, cs);
1361
1362 /* Wa_16013000631:dg2 */
1363 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1364 IS_DG2_G11(ce->engine->i915))
1365 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1366
1367 /* hsdes: 1809175790 */
1368 if (!HAS_FLAT_CCS(ce->engine->i915))
1369 cs = gen12_emit_aux_table_inv(ce->engine->gt,
1370 cs, GEN12_GFX_CCS_AUX_NV);
1371
1372 /* Wa_16014892111 */
1373 if (IS_DG2(ce->engine->i915))
1374 cs = dg2_emit_draw_watermark_setting(cs);
1375
1376 return cs;
1377}
1378
1379static u32 *
1380gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1381{
1382 cs = gen12_emit_timestamp_wa(ce, cs);
1383 cs = gen12_emit_restore_scratch(ce, cs);
1384
1385 /* Wa_16013000631:dg2 */
1386 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1387 IS_DG2_G11(ce->engine->i915))
1388 if (ce->engine->class == COMPUTE_CLASS)
1389 cs = gen8_emit_pipe_control(cs,
1390 PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1391 0);
1392
1393 /* hsdes: 1809175790 */
1394 if (!HAS_FLAT_CCS(ce->engine->i915)) {
1395 if (ce->engine->class == VIDEO_DECODE_CLASS)
1396 cs = gen12_emit_aux_table_inv(ce->engine->gt,
1397 cs, GEN12_VD0_AUX_NV);
1398 else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
1399 cs = gen12_emit_aux_table_inv(ce->engine->gt,
1400 cs, GEN12_VE0_AUX_NV);
1401 }
1402
1403 return cs;
1404}
1405
1406static void
1407setup_indirect_ctx_bb(const struct intel_context *ce,
1408 const struct intel_engine_cs *engine,
1409 u32 *(*emit)(const struct intel_context *, u32 *))
1410{
1411 u32 * const start = context_indirect_bb(ce);
1412 u32 *cs;
1413
1414 cs = emit(ce, start);
1415 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1416 while ((unsigned long)cs % CACHELINE_BYTES)
1417 *cs++ = MI_NOOP;
1418
1419 GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1420 setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1421
1422 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1423 lrc_indirect_bb(ce),
1424 (cs - start) * sizeof(*cs));
1425}
1426
1427/*
1428 * The context descriptor encodes various attributes of a context,
1429 * including its GTT address and some flags. Because it's fairly
1430 * expensive to calculate, we'll just do it once and cache the result,
1431 * which remains valid until the context is unpinned.
1432 *
1433 * This is what a descriptor looks like, from LSB to MSB::
1434 *
1435 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
1436 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
1437 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
1438 * bits 53-54: mbz, reserved for use by hardware
1439 * bits 55-63: group ID, currently unused and set to 0
1440 *
1441 * Starting from Gen11, the upper dword of the descriptor has a new format:
1442 *
1443 * bits 32-36: reserved
1444 * bits 37-47: SW context ID
1445 * bits 48-53: engine instance
1446 * bit 54: mbz, reserved for use by hardware
1447 * bits 55-60: SW counter
1448 * bits 61-63: engine class
1449 *
1450 * On Xe_HP, the upper dword of the descriptor has a new format:
1451 *
1452 * bits 32-37: virtual function number
1453 * bit 38: mbz, reserved for use by hardware
1454 * bits 39-54: SW context ID
1455 * bits 55-57: reserved
1456 * bits 58-63: SW counter
1457 *
1458 * engine info, SW context ID and SW counter need to form a unique number
1459 * (Context ID) per lrc.
1460 */
1461static u32 lrc_descriptor(const struct intel_context *ce)
1462{
1463 u32 desc;
1464
1465 desc = INTEL_LEGACY_32B_CONTEXT;
1466 if (i915_vm_is_4lvl(ce->vm))
1467 desc = INTEL_LEGACY_64B_CONTEXT;
1468 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1469
1470 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1471 if (GRAPHICS_VER(ce->vm->i915) == 8)
1472 desc |= GEN8_CTX_L3LLC_COHERENT;
1473
1474 return i915_ggtt_offset(ce->state) | desc;
1475}
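
/*
 * lrc_descriptor() above only assembles the lower dword (addressing-mode
 * flags + LRCA). The gen11+ fields documented in the comment above (SW
 * context ID, engine instance, SW counter, engine class) live in the upper
 * dword and are filled in elsewhere. Purely as an illustration of that bit
 * layout (the field variables here are hypothetical):
 *
 *	u64 desc = lrc_descriptor(ce);
 *
 *	desc |= (u64)sw_ctx_id    << 37;	(bits 37-47)
 *	desc |= (u64)instance     << 48;	(bits 48-53)
 *	desc |= (u64)sw_counter   << 55;	(bits 55-60)
 *	desc |= (u64)engine_class << 61;	(bits 61-63)
 */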
1476
1477u32 lrc_update_regs(const struct intel_context *ce,
1478 const struct intel_engine_cs *engine,
1479 u32 head)
1480{
1481 struct intel_ring *ring = ce->ring;
1482 u32 *regs = ce->lrc_reg_state;
1483
1484 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1485 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1486
1487 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1488 regs[CTX_RING_HEAD] = head;
1489 regs[CTX_RING_TAIL] = ring->tail;
1490 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1491
1492 /* RPCS */
1493 if (engine->class == RENDER_CLASS) {
1494 regs[CTX_R_PWR_CLK_STATE] =
1495 intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1496
1497 i915_oa_init_reg_state(ce, engine);
1498 }
1499
1500 if (ce->wa_bb_page) {
1501 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1502
1503 fn = gen12_emit_indirect_ctx_xcs;
1504 if (ce->engine->class == RENDER_CLASS)
1505 fn = gen12_emit_indirect_ctx_rcs;
1506
1507 /* Mutually exclusive wrt to global indirect bb */
1508 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1509 setup_indirect_ctx_bb(ce, engine, fn);
1510 }
1511
1512 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1513}
1514
1515void lrc_update_offsets(struct intel_context *ce,
1516 struct intel_engine_cs *engine)
1517{
1518 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1519}
1520
1521void lrc_check_regs(const struct intel_context *ce,
1522 const struct intel_engine_cs *engine,
1523 const char *when)
1524{
1525 const struct intel_ring *ring = ce->ring;
1526 u32 *regs = ce->lrc_reg_state;
1527 bool valid = true;
1528 int x;
1529
1530 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1531 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1532 engine->name,
1533 regs[CTX_RING_START],
1534 i915_ggtt_offset(ring->vma));
1535 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1536 valid = false;
1537 }
1538
1539 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1540 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1541 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1542 engine->name,
1543 regs[CTX_RING_CTL],
1544 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1545 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1546 valid = false;
1547 }
1548
1549 x = lrc_ring_mi_mode(engine);
1550 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1551 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1552 engine->name, regs[x + 1]);
1553 regs[x + 1] &= ~STOP_RING;
1554 regs[x + 1] |= STOP_RING << 16;
1555 valid = false;
1556 }
1557
1558 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1559}
1560
1561/*
1562 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1563 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1564 * but there is a slight complication as this is applied in WA batch where the
1565 * values are only initialized once, so we cannot read the register value at the
1566 * beginning and reuse it later; hence we save its value to memory, upload a
1567 * constant value with bit21 set and then restore it from the saved value.
1568 * To simplify the WA, a constant value is formed by using the default value
1569 * of this register. This shouldn't be a problem because we are only modifying
1570 * it for a short period and this batch is non-preemptible. We can of course
1571 * use additional instructions that read the actual value of the register
1572 * at that time and set our bit of interest but it makes the WA complicated.
1573 *
1574 * This WA is also required for Gen9 so extracting as a function avoids
1575 * code duplication.
1576 */
1577static u32 *
1578gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1579{
1580 /* NB no one else is allowed to scribble over scratch + 256! */
1581 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1582 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1583 *batch++ = intel_gt_scratch_offset(engine->gt,
1584 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1585 *batch++ = 0;
1586
1587 *batch++ = MI_LOAD_REGISTER_IMM(1);
1588 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1589 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1590
1591 batch = gen8_emit_pipe_control(batch,
1592 PIPE_CONTROL_CS_STALL |
1593 PIPE_CONTROL_DC_FLUSH_ENABLE,
1594 0);
1595
1596 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1597 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1598 *batch++ = intel_gt_scratch_offset(engine->gt,
1599 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1600 *batch++ = 0;
1601
1602 return batch;
1603}
1604
/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on some criteria. At the moment this batch always starts at the beginning of
 * the page and we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END, so it adds NOOPs
 * as padding to make it cacheline aligned. MI_BATCH_BUFFER_END will be added
 * to the per-ctx batch, and both of them together make a complete batch
 * buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX.
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

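/*
 * Emit a single MI_LOAD_REGISTER_IMM packet programming @count register
 * (offset, value) pairs, followed by an MI_NOOP. For example, count == 2
 * emits the dwords:
 *
 *	MI_LOAD_REGISTER_IMM(2)
 *	i915_mmio_reg_offset(reg0), value0
 *	i915_mmio_reg_offset(reg1), value1
 *	MI_NOOP
 *
 * The GEM_BUG_ON below restricts callers to between 1 and 63 pairs.
 */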
static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * The EU pool configuration is set up along with the golden
		 * context during context initialization. The value depends on
		 * the device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled, especially for 2x6 devices.
		 * However, it is safe to load the default 3x6 configuration
		 * instead of masking off the corresponding bits, because the
		 * HW ignores bits of a disabled subslice and drops down to
		 * the appropriate config. See render_state_setup() in
		 * i915_gem_render_state.c for the possible configurations;
		 * to avoid duplication they are not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

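/*
 * Allocate the backing object and GGTT vma for the per-engine context
 * workaround batches. The buffer is only created here; it is pinned,
 * filled and flushed later from lrc_init_wa_ctx().
 */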
static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

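/*
 * Build the per-engine context workaround batches: pick the indirect-ctx
 * and per-ctx emitters for this graphics version, pin the wa_ctx buffer and
 * write the batches into it, recording each batch's offset and size for the
 * context image. Failure here is not fatal; the engine simply runs without
 * the workaround batches.
 */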
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (GRAPHICS_VER(engine->i915) >= 11 ||
	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
		return;

	if (GRAPHICS_VER(engine->i915) == 9) {
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	} else if (GRAPHICS_VER(engine->i915) == 8) {
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches, nothing critical
		 * enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

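	/*
	 * Pin and map the buffer under a ww transaction: if the object lock
	 * or the GGTT pin returns -EDEADLK, the err path below backs off and
	 * retries from here.
	 */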
	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

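/*
 * Selftest-only bookkeeping: record how often, and by how much, the sampled
 * context runtime went backwards. Compiled out unless
 * CONFIG_DRM_I915_SELFTEST is enabled.
 */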
static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	stats->runtime.num_underflow++;
	stats->runtime.max_underflow =
		max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}

static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

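/*
 * Fold the CTX_TIMESTAMP delta since the last sample into the context's
 * runtime statistics (moving average and running total). A negative delta,
 * e.g. after the saved timestamp has been reset, is dropped and only
 * recorded as an underflow.
 */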
void lrc_update_runtime(struct intel_context *ce)
{
	struct intel_context_stats *stats = &ce->stats;
	u32 old;
	s32 dt;

	old = stats->runtime.last;
	stats->runtime.last = lrc_get_runtime(ce);
	dt = stats->runtime.last - old;
	if (!dt)
		return;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, stats->runtime.last, dt);
		st_runtime_underflow(stats, dt);
		return;
	}

	ewma_runtime_add(&stats->runtime.avg, dt);
	stats->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif