   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2014 Intel Corporation
   4 */
   5
   6#include "gem/i915_gem_lmem.h"
   7
   8#include "gen8_engine_cs.h"
   9#include "i915_drv.h"
  10#include "i915_perf.h"
  11#include "intel_engine.h"
  12#include "intel_gpu_commands.h"
  13#include "intel_gt.h"
  14#include "intel_lrc.h"
  15#include "intel_lrc_reg.h"
  16#include "intel_ring.h"
  17#include "shmem_utils.h"
  18
  19static void set_offsets(u32 *regs,
  20			const u8 *data,
  21			const struct intel_engine_cs *engine,
  22			bool close)
  23#define NOP(x) (BIT(7) | (x))
  24#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
  25#define POSTED BIT(0)
  26#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
  27#define REG16(x) \
  28	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
  29	(((x) >> 2) & 0x7f)
  30#define END 0
  31{
  32	const u32 base = engine->mmio_base;
  33
  34	while (*data) {
  35		u8 count, flags;
  36
  37		if (*data & BIT(7)) { /* skip */
  38			count = *data++ & ~BIT(7);
  39			regs += count;
  40			continue;
  41		}
  42
  43		count = *data & 0x3f;
  44		flags = *data >> 6;
  45		data++;
  46
  47		*regs = MI_LOAD_REGISTER_IMM(count);
  48		if (flags & POSTED)
  49			*regs |= MI_LRI_FORCE_POSTED;
  50		if (GRAPHICS_VER(engine->i915) >= 11)
  51			*regs |= MI_LRI_LRM_CS_MMIO;
  52		regs++;
  53
  54		GEM_BUG_ON(!count);
  55		do {
  56			u32 offset = 0;
  57			u8 v;
  58
  59			do {
  60				v = *data++;
  61				offset <<= 7;
  62				offset |= v & ~BIT(7);
  63			} while (v & BIT(7));
  64
  65			regs[0] = base + (offset << 2);
  66			regs += 2;
  67		} while (--count);
  68	}
  69
  70	if (close) {
  71		/* Close the batch; used mainly by live_lrc_layout() */
  72		*regs = MI_BATCH_BUFFER_END;
  73		if (GRAPHICS_VER(engine->i915) >= 10)
  74			*regs |= BIT(0);
  75	}
  76}
  77
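/*
 * The tables below use a compact byte encoding consumed by set_offsets():
 * NOP(n) skips n dwords, LRI(count, flags) opens an MI_LOAD_REGISTER_IMM
 * block of count registers, and REG()/REG16() encode a register offset
 * (relative to the engine's mmio base) in one or two 7-bit chunks.
 * As a worked example, LRI(11, 0) is stored as 0x0b and REG16(0x244) as
 * 0x81 0x11; set_offsets() decodes this back into MI_LOAD_REGISTER_IMM(11)
 * followed by the register address mmio_base + 0x244.
 */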
  78static const u8 gen8_xcs_offsets[] = {
  79	NOP(1),
  80	LRI(11, 0),
  81	REG16(0x244),
  82	REG(0x034),
  83	REG(0x030),
  84	REG(0x038),
  85	REG(0x03c),
  86	REG(0x168),
  87	REG(0x140),
  88	REG(0x110),
  89	REG(0x11c),
  90	REG(0x114),
  91	REG(0x118),
  92
  93	NOP(9),
  94	LRI(9, 0),
  95	REG16(0x3a8),
  96	REG16(0x28c),
  97	REG16(0x288),
  98	REG16(0x284),
  99	REG16(0x280),
 100	REG16(0x27c),
 101	REG16(0x278),
 102	REG16(0x274),
 103	REG16(0x270),
 104
 105	NOP(13),
 106	LRI(2, 0),
 107	REG16(0x200),
 108	REG(0x028),
 109
 110	END
 111};
 112
 113static const u8 gen9_xcs_offsets[] = {
 114	NOP(1),
 115	LRI(14, POSTED),
 116	REG16(0x244),
 117	REG(0x034),
 118	REG(0x030),
 119	REG(0x038),
 120	REG(0x03c),
 121	REG(0x168),
 122	REG(0x140),
 123	REG(0x110),
 124	REG(0x11c),
 125	REG(0x114),
 126	REG(0x118),
 127	REG(0x1c0),
 128	REG(0x1c4),
 129	REG(0x1c8),
 130
 131	NOP(3),
 132	LRI(9, POSTED),
 133	REG16(0x3a8),
 134	REG16(0x28c),
 135	REG16(0x288),
 136	REG16(0x284),
 137	REG16(0x280),
 138	REG16(0x27c),
 139	REG16(0x278),
 140	REG16(0x274),
 141	REG16(0x270),
 142
 143	NOP(13),
 144	LRI(1, POSTED),
 145	REG16(0x200),
 146
 147	NOP(13),
 148	LRI(44, POSTED),
 149	REG(0x028),
 150	REG(0x09c),
 151	REG(0x0c0),
 152	REG(0x178),
 153	REG(0x17c),
 154	REG16(0x358),
 155	REG(0x170),
 156	REG(0x150),
 157	REG(0x154),
 158	REG(0x158),
 159	REG16(0x41c),
 160	REG16(0x600),
 161	REG16(0x604),
 162	REG16(0x608),
 163	REG16(0x60c),
 164	REG16(0x610),
 165	REG16(0x614),
 166	REG16(0x618),
 167	REG16(0x61c),
 168	REG16(0x620),
 169	REG16(0x624),
 170	REG16(0x628),
 171	REG16(0x62c),
 172	REG16(0x630),
 173	REG16(0x634),
 174	REG16(0x638),
 175	REG16(0x63c),
 176	REG16(0x640),
 177	REG16(0x644),
 178	REG16(0x648),
 179	REG16(0x64c),
 180	REG16(0x650),
 181	REG16(0x654),
 182	REG16(0x658),
 183	REG16(0x65c),
 184	REG16(0x660),
 185	REG16(0x664),
 186	REG16(0x668),
 187	REG16(0x66c),
 188	REG16(0x670),
 189	REG16(0x674),
 190	REG16(0x678),
 191	REG16(0x67c),
 192	REG(0x068),
 193
 194	END
 195};
 196
 197static const u8 gen12_xcs_offsets[] = {
 198	NOP(1),
 199	LRI(13, POSTED),
 200	REG16(0x244),
 201	REG(0x034),
 202	REG(0x030),
 203	REG(0x038),
 204	REG(0x03c),
 205	REG(0x168),
 206	REG(0x140),
 207	REG(0x110),
 208	REG(0x1c0),
 209	REG(0x1c4),
 210	REG(0x1c8),
 211	REG(0x180),
 212	REG16(0x2b4),
 213
 214	NOP(5),
 215	LRI(9, POSTED),
 216	REG16(0x3a8),
 217	REG16(0x28c),
 218	REG16(0x288),
 219	REG16(0x284),
 220	REG16(0x280),
 221	REG16(0x27c),
 222	REG16(0x278),
 223	REG16(0x274),
 224	REG16(0x270),
 225
 226	END
 227};
 228
 229static const u8 gen8_rcs_offsets[] = {
 230	NOP(1),
 231	LRI(14, POSTED),
 232	REG16(0x244),
 233	REG(0x034),
 234	REG(0x030),
 235	REG(0x038),
 236	REG(0x03c),
 237	REG(0x168),
 238	REG(0x140),
 239	REG(0x110),
 240	REG(0x11c),
 241	REG(0x114),
 242	REG(0x118),
 243	REG(0x1c0),
 244	REG(0x1c4),
 245	REG(0x1c8),
 246
 247	NOP(3),
 248	LRI(9, POSTED),
 249	REG16(0x3a8),
 250	REG16(0x28c),
 251	REG16(0x288),
 252	REG16(0x284),
 253	REG16(0x280),
 254	REG16(0x27c),
 255	REG16(0x278),
 256	REG16(0x274),
 257	REG16(0x270),
 258
 259	NOP(13),
 260	LRI(1, 0),
 261	REG(0x0c8),
 262
 263	END
 264};
 265
 266static const u8 gen9_rcs_offsets[] = {
 267	NOP(1),
 268	LRI(14, POSTED),
 269	REG16(0x244),
 270	REG(0x34),
 271	REG(0x30),
 272	REG(0x38),
 273	REG(0x3c),
 274	REG(0x168),
 275	REG(0x140),
 276	REG(0x110),
 277	REG(0x11c),
 278	REG(0x114),
 279	REG(0x118),
 280	REG(0x1c0),
 281	REG(0x1c4),
 282	REG(0x1c8),
 283
 284	NOP(3),
 285	LRI(9, POSTED),
 286	REG16(0x3a8),
 287	REG16(0x28c),
 288	REG16(0x288),
 289	REG16(0x284),
 290	REG16(0x280),
 291	REG16(0x27c),
 292	REG16(0x278),
 293	REG16(0x274),
 294	REG16(0x270),
 295
 296	NOP(13),
 297	LRI(1, 0),
 298	REG(0xc8),
 299
 300	NOP(13),
 301	LRI(44, POSTED),
 302	REG(0x28),
 303	REG(0x9c),
 304	REG(0xc0),
 305	REG(0x178),
 306	REG(0x17c),
 307	REG16(0x358),
 308	REG(0x170),
 309	REG(0x150),
 310	REG(0x154),
 311	REG(0x158),
 312	REG16(0x41c),
 313	REG16(0x600),
 314	REG16(0x604),
 315	REG16(0x608),
 316	REG16(0x60c),
 317	REG16(0x610),
 318	REG16(0x614),
 319	REG16(0x618),
 320	REG16(0x61c),
 321	REG16(0x620),
 322	REG16(0x624),
 323	REG16(0x628),
 324	REG16(0x62c),
 325	REG16(0x630),
 326	REG16(0x634),
 327	REG16(0x638),
 328	REG16(0x63c),
 329	REG16(0x640),
 330	REG16(0x644),
 331	REG16(0x648),
 332	REG16(0x64c),
 333	REG16(0x650),
 334	REG16(0x654),
 335	REG16(0x658),
 336	REG16(0x65c),
 337	REG16(0x660),
 338	REG16(0x664),
 339	REG16(0x668),
 340	REG16(0x66c),
 341	REG16(0x670),
 342	REG16(0x674),
 343	REG16(0x678),
 344	REG16(0x67c),
 345	REG(0x68),
 346
 347	END
 348};
 349
 350static const u8 gen11_rcs_offsets[] = {
 351	NOP(1),
 352	LRI(15, POSTED),
 353	REG16(0x244),
 354	REG(0x034),
 355	REG(0x030),
 356	REG(0x038),
 357	REG(0x03c),
 358	REG(0x168),
 359	REG(0x140),
 360	REG(0x110),
 361	REG(0x11c),
 362	REG(0x114),
 363	REG(0x118),
 364	REG(0x1c0),
 365	REG(0x1c4),
 366	REG(0x1c8),
 367	REG(0x180),
 368
 369	NOP(1),
 370	LRI(9, POSTED),
 371	REG16(0x3a8),
 372	REG16(0x28c),
 373	REG16(0x288),
 374	REG16(0x284),
 375	REG16(0x280),
 376	REG16(0x27c),
 377	REG16(0x278),
 378	REG16(0x274),
 379	REG16(0x270),
 380
 381	LRI(1, POSTED),
 382	REG(0x1b0),
 383
 384	NOP(10),
 385	LRI(1, 0),
 386	REG(0x0c8),
 387
 388	END
 389};
 390
 391static const u8 gen12_rcs_offsets[] = {
 392	NOP(1),
 393	LRI(13, POSTED),
 394	REG16(0x244),
 395	REG(0x034),
 396	REG(0x030),
 397	REG(0x038),
 398	REG(0x03c),
 399	REG(0x168),
 400	REG(0x140),
 401	REG(0x110),
 402	REG(0x1c0),
 403	REG(0x1c4),
 404	REG(0x1c8),
 405	REG(0x180),
 406	REG16(0x2b4),
 407
 408	NOP(5),
 409	LRI(9, POSTED),
 410	REG16(0x3a8),
 411	REG16(0x28c),
 412	REG16(0x288),
 413	REG16(0x284),
 414	REG16(0x280),
 415	REG16(0x27c),
 416	REG16(0x278),
 417	REG16(0x274),
 418	REG16(0x270),
 419
 420	LRI(3, POSTED),
 421	REG(0x1b0),
 422	REG16(0x5a8),
 423	REG16(0x5ac),
 424
 425	NOP(6),
 426	LRI(1, 0),
 427	REG(0x0c8),
 428	NOP(3 + 9 + 1),
 429
 430	LRI(51, POSTED),
 431	REG16(0x588),
 432	REG16(0x588),
 433	REG16(0x588),
 434	REG16(0x588),
 435	REG16(0x588),
 436	REG16(0x588),
 437	REG(0x028),
 438	REG(0x09c),
 439	REG(0x0c0),
 440	REG(0x178),
 441	REG(0x17c),
 442	REG16(0x358),
 443	REG(0x170),
 444	REG(0x150),
 445	REG(0x154),
 446	REG(0x158),
 447	REG16(0x41c),
 448	REG16(0x600),
 449	REG16(0x604),
 450	REG16(0x608),
 451	REG16(0x60c),
 452	REG16(0x610),
 453	REG16(0x614),
 454	REG16(0x618),
 455	REG16(0x61c),
 456	REG16(0x620),
 457	REG16(0x624),
 458	REG16(0x628),
 459	REG16(0x62c),
 460	REG16(0x630),
 461	REG16(0x634),
 462	REG16(0x638),
 463	REG16(0x63c),
 464	REG16(0x640),
 465	REG16(0x644),
 466	REG16(0x648),
 467	REG16(0x64c),
 468	REG16(0x650),
 469	REG16(0x654),
 470	REG16(0x658),
 471	REG16(0x65c),
 472	REG16(0x660),
 473	REG16(0x664),
 474	REG16(0x668),
 475	REG16(0x66c),
 476	REG16(0x670),
 477	REG16(0x674),
 478	REG16(0x678),
 479	REG16(0x67c),
 480	REG(0x068),
 481	REG(0x084),
 482	NOP(1),
 483
 484	END
 485};
 486
 487#undef END
 488#undef REG16
 489#undef REG
 490#undef LRI
 491#undef NOP
 492
 493static const u8 *reg_offsets(const struct intel_engine_cs *engine)
 494{
 495	/*
 496	 * The gen12+ lists only have the registers we program in the basic
 497	 * default state. We rely on the context image using relative
 498	 * addressing to automatically fix up the register state between the
 499	 * physical engines of a virtual engine.
 500	 */
 501	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
 502		   !intel_engine_has_relative_mmio(engine));
 503
 504	if (engine->class == RENDER_CLASS) {
 505		if (GRAPHICS_VER(engine->i915) >= 12)
 506			return gen12_rcs_offsets;
 507		else if (GRAPHICS_VER(engine->i915) >= 11)
 508			return gen11_rcs_offsets;
 509		else if (GRAPHICS_VER(engine->i915) >= 9)
 510			return gen9_rcs_offsets;
 511		else
 512			return gen8_rcs_offsets;
 513	} else {
 514		if (GRAPHICS_VER(engine->i915) >= 12)
 515			return gen12_xcs_offsets;
 516		else if (GRAPHICS_VER(engine->i915) >= 9)
 517			return gen9_xcs_offsets;
 518		else
 519			return gen8_xcs_offsets;
 520	}
 521}
 522
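/*
 * The lrc_ring_*() helpers below return the dword offset within the context
 * register state of the LRI slot holding a given register's address; the
 * register value itself lives at that offset + 1. A return of -1 means the
 * register is not part of the default context layout for this engine/gen.
 */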
 523static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 524{
 525	if (GRAPHICS_VER(engine->i915) >= 12)
 526		return 0x60;
 527	else if (GRAPHICS_VER(engine->i915) >= 9)
 528		return 0x54;
 529	else if (engine->class == RENDER_CLASS)
 530		return 0x58;
 531	else
 532		return -1;
 533}
 534
 535static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 536{
 537	if (GRAPHICS_VER(engine->i915) >= 12)
 538		return 0x74;
 539	else if (GRAPHICS_VER(engine->i915) >= 9)
 540		return 0x68;
 541	else if (engine->class == RENDER_CLASS)
 542		return 0xd8;
 543	else
 544		return -1;
 545}
 546
 547static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 548{
 549	if (GRAPHICS_VER(engine->i915) >= 12)
 550		return 0x12;
 551	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 552		return 0x18;
 553	else
 554		return -1;
 555}
 556
 557static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 558{
 559	int x;
 560
 561	x = lrc_ring_wa_bb_per_ctx(engine);
 562	if (x < 0)
 563		return x;
 564
 565	return x + 2;
 566}
 567
 568static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 569{
 570	int x;
 571
 572	x = lrc_ring_indirect_ptr(engine);
 573	if (x < 0)
 574		return x;
 575
 576	return x + 2;
 577}
 578
 579static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 580{
 581	if (engine->class != RENDER_CLASS)
 582		return -1;
 583
 584	if (GRAPHICS_VER(engine->i915) >= 12)
 585		return 0xb6;
 586	else if (GRAPHICS_VER(engine->i915) >= 11)
 587		return 0xaa;
 588	else
 589		return -1;
 590}
 591
 592static u32
 593lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 594{
 595	switch (GRAPHICS_VER(engine->i915)) {
 596	default:
 597		MISSING_CASE(GRAPHICS_VER(engine->i915));
 598		fallthrough;
 599	case 12:
 600		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 601	case 11:
 602		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 603	case 10:
 604		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 605	case 9:
 606		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 607	case 8:
 608		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 609	}
 610}
 611
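/*
 * Program the context image to run an indirect (workaround) batch during
 * context restore: the INDIRECT_CTX slot packs the batch's ggtt address with
 * its size in cachelines in the low bits, and INDIRECT_CTX_OFFSET is loaded
 * with the per-gen default offset shifted into the register's offset field.
 */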
 612static void
 613lrc_setup_indirect_ctx(u32 *regs,
 614		       const struct intel_engine_cs *engine,
 615		       u32 ctx_bb_ggtt_addr,
 616		       u32 size)
 617{
 618	GEM_BUG_ON(!size);
 619	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
 620	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
 621	regs[lrc_ring_indirect_ptr(engine) + 1] =
 622		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
 623
 624	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
 625	regs[lrc_ring_indirect_offset(engine) + 1] =
 626		lrc_ring_indirect_offset_default(engine) << 6;
 627}
 628
 629static void init_common_regs(u32 * const regs,
 630			     const struct intel_context *ce,
 631			     const struct intel_engine_cs *engine,
 632			     bool inhibit)
 633{
 634	u32 ctl;
 635
 636	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
 637	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 638	if (inhibit)
 639		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
 640	if (GRAPHICS_VER(engine->i915) < 11)
 641		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
 642					   CTX_CTRL_RS_CTX_ENABLE);
 643	regs[CTX_CONTEXT_CONTROL] = ctl;
 644
 645	regs[CTX_TIMESTAMP] = ce->runtime.last;
 646}
 647
 648static void init_wa_bb_regs(u32 * const regs,
 649			    const struct intel_engine_cs *engine)
 650{
 651	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
 652
 653	if (wa_ctx->per_ctx.size) {
 654		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
 655
 656		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
 657		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
 658			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
 659	}
 660
 661	if (wa_ctx->indirect_ctx.size) {
 662		lrc_setup_indirect_ctx(regs, engine,
 663				       i915_ggtt_offset(wa_ctx->vma) +
 664				       wa_ctx->indirect_ctx.offset,
 665				       wa_ctx->indirect_ctx.size);
 666	}
 667}
 668
 669static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
 670{
 671	if (i915_vm_is_4lvl(&ppgtt->vm)) {
 672		/* 64b PPGTT (48bit canonical)
 673		 * PDP0_DESCRIPTOR contains the base address of the PML4;
 674		 * the other PDP descriptors are ignored.
 675		 */
 676		ASSIGN_CTX_PML4(ppgtt, regs);
 677	} else {
 678		ASSIGN_CTX_PDP(ppgtt, regs, 3);
 679		ASSIGN_CTX_PDP(ppgtt, regs, 2);
 680		ASSIGN_CTX_PDP(ppgtt, regs, 1);
 681		ASSIGN_CTX_PDP(ppgtt, regs, 0);
 682	}
 683}
 684
 685static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
 686{
 687	if (i915_is_ggtt(vm))
 688		return i915_vm_to_ggtt(vm)->alias;
 689	else
 690		return i915_vm_to_ppgtt(vm);
 691}
 692
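/*
 * RING_MI_MODE is a masked register: the upper 16 bits select which of the
 * lower 16 bits are written. Clearing STOP_RING in the value while setting
 * STOP_RING << 16 in the mask ensures the ring is not left stopped when this
 * context image is restored.
 */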
 693static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
 694{
 695	int x;
 696
 697	x = lrc_ring_mi_mode(engine);
 698	if (x != -1) {
 699		regs[x + 1] &= ~STOP_RING;
 700		regs[x + 1] |= STOP_RING << 16;
 701	}
 702}
 703
 704static void __lrc_init_regs(u32 *regs,
 705			    const struct intel_context *ce,
 706			    const struct intel_engine_cs *engine,
 707			    bool inhibit)
 708{
 709	/*
 710	 * A context is actually a big batch buffer with several
 711	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
 712	 * values we are setting here are only for the first context restore:
 713	 * on a subsequent save, the GPU will recreate this batch buffer with new
 714	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
 715	 * we are not initializing here).
 716	 *
 717	 * Must keep consistent with virtual_update_register_offsets().
 718	 */
 719
 720	if (inhibit)
 721		memset(regs, 0, PAGE_SIZE);
 722
 723	set_offsets(regs, reg_offsets(engine), engine, inhibit);
 724
 725	init_common_regs(regs, ce, engine, inhibit);
 726	init_ppgtt_regs(regs, vm_alias(ce->vm));
 727
 728	init_wa_bb_regs(regs, engine);
 729
 730	__reset_stop_ring(regs, engine);
 731}
 732
 733void lrc_init_regs(const struct intel_context *ce,
 734		   const struct intel_engine_cs *engine,
 735		   bool inhibit)
 736{
 737	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
 738}
 739
 740void lrc_reset_regs(const struct intel_context *ce,
 741		    const struct intel_engine_cs *engine)
 742{
 743	__reset_stop_ring(ce->lrc_reg_state, engine);
 744}
 745
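/*
 * With CONFIG_DRM_I915_DEBUG_GEM enabled, an extra page beyond the engine's
 * context_size is filled with the CONTEXT_REDZONE pattern when the state is
 * initialised and checked again at unpin, to catch the hardware or driver
 * writing past the end of the context image.
 */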
 746static void
 747set_redzone(void *vaddr, const struct intel_engine_cs *engine)
 748{
 749	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 750		return;
 751
 752	vaddr += engine->context_size;
 753
 754	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
 755}
 756
 757static void
 758check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
 759{
 760	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 761		return;
 762
 763	vaddr += engine->context_size;
 764
 765	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
 766		drm_err_once(&engine->i915->drm,
 767			     "%s context redzone overwritten!\n",
 768			     engine->name);
 769}
 770
 771void lrc_init_state(struct intel_context *ce,
 772		    struct intel_engine_cs *engine,
 773		    void *state)
 774{
 775	bool inhibit = true;
 776
 777	set_redzone(state, engine);
 778
 779	if (engine->default_state) {
 780		shmem_read(engine->default_state, 0,
 781			   state, engine->context_size);
 782		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
 783		inhibit = false;
 784	}
 785
 786	/* Clear the ppHWSP (inc. per-context counters) */
 787	memset(state, 0, PAGE_SIZE);
 788
 789	/*
 790	 * The second page of the context object contains some registers which
 791	 * must be set up prior to the first execution.
 792	 */
 793	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
 794}
 795
 796static struct i915_vma *
 797__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
 798{
 799	struct drm_i915_gem_object *obj;
 800	struct i915_vma *vma;
 801	u32 context_size;
 802
 803	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
 804
 805	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 806		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
 807
 808	if (GRAPHICS_VER(engine->i915) == 12) {
 809		ce->wa_bb_page = context_size / PAGE_SIZE;
 810		context_size += PAGE_SIZE;
 811	}
 812
 813	obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
 814	if (IS_ERR(obj))
 815		obj = i915_gem_object_create_shmem(engine->i915, context_size);
 816	if (IS_ERR(obj))
 817		return ERR_CAST(obj);
 818
 819	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
 820	if (IS_ERR(vma)) {
 821		i915_gem_object_put(obj);
 822		return vma;
 823	}
 824
 825	return vma;
 826}
 827
 828static struct intel_timeline *
 829pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
 830{
 831	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
 832
 833	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
 834}
 835
 836int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
 837{
 838	struct intel_ring *ring;
 839	struct i915_vma *vma;
 840	int err;
 841
 842	GEM_BUG_ON(ce->state);
 843
 844	vma = __lrc_alloc_state(ce, engine);
 845	if (IS_ERR(vma))
 846		return PTR_ERR(vma);
 847
 848	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
 849	if (IS_ERR(ring)) {
 850		err = PTR_ERR(ring);
 851		goto err_vma;
 852	}
 853
 854	if (!page_mask_bits(ce->timeline)) {
 855		struct intel_timeline *tl;
 856
 857		/*
 858		 * Use the static global HWSP for the kernel context, and
 859		 * a dynamically allocated cacheline for everyone else.
 860		 */
 861		if (unlikely(ce->timeline))
 862			tl = pinned_timeline(ce, engine);
 863		else
 864			tl = intel_timeline_create(engine->gt);
 865		if (IS_ERR(tl)) {
 866			err = PTR_ERR(tl);
 867			goto err_ring;
 868		}
 869
 870		ce->timeline = tl;
 871	}
 872
 873	ce->ring = ring;
 874	ce->state = vma;
 875
 876	return 0;
 877
 878err_ring:
 879	intel_ring_put(ring);
 880err_vma:
 881	i915_vma_put(vma);
 882	return err;
 883}
 884
 885void lrc_reset(struct intel_context *ce)
 886{
 887	GEM_BUG_ON(!intel_context_is_pinned(ce));
 888
 889	intel_ring_reset(ce->ring, ce->ring->emit);
 890
 891	/* Scrub away the garbage */
 892	lrc_init_regs(ce, ce->engine, true);
 893	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
 894}
 895
 896int
 897lrc_pre_pin(struct intel_context *ce,
 898	    struct intel_engine_cs *engine,
 899	    struct i915_gem_ww_ctx *ww,
 900	    void **vaddr)
 901{
 902	GEM_BUG_ON(!ce->state);
 903	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
 904
 905	*vaddr = i915_gem_object_pin_map(ce->state->obj,
 906					 i915_coherent_map_type(ce->engine->i915,
 907								ce->state->obj,
 908								false) |
 909					 I915_MAP_OVERRIDE);
 910
 911	return PTR_ERR_OR_ZERO(*vaddr);
 912}
 913
 914int
 915lrc_pin(struct intel_context *ce,
 916	struct intel_engine_cs *engine,
 917	void *vaddr)
 918{
 919	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
 920
 921	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
 922		lrc_init_state(ce, engine, vaddr);
 923
 924	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
 925	return 0;
 926}
 927
 928void lrc_unpin(struct intel_context *ce)
 929{
 930	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
 931		      ce->engine);
 932}
 933
 934void lrc_post_unpin(struct intel_context *ce)
 935{
 936	i915_gem_object_unpin_map(ce->state->obj);
 937}
 938
 939void lrc_fini(struct intel_context *ce)
 940{
 941	if (!ce->state)
 942		return;
 943
 944	intel_ring_put(fetch_and_zero(&ce->ring));
 945	i915_vma_put(fetch_and_zero(&ce->state));
 946}
 947
 948void lrc_destroy(struct kref *kref)
 949{
 950	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
 951
 952	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
 953	GEM_BUG_ON(intel_context_is_pinned(ce));
 954
 955	lrc_fini(ce);
 956
 957	intel_context_fini(ce);
 958	intel_context_free(ce);
 959}
 960
 961static u32 *
 962gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
 963{
 964	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
 965		MI_SRM_LRM_GLOBAL_GTT |
 966		MI_LRI_LRM_CS_MMIO;
 967	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
 968	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
 969		CTX_TIMESTAMP * sizeof(u32);
 970	*cs++ = 0;
 971
 972	*cs++ = MI_LOAD_REGISTER_REG |
 973		MI_LRR_SOURCE_CS_MMIO |
 974		MI_LRI_LRM_CS_MMIO;
 975	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
 976	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
 977
 978	*cs++ = MI_LOAD_REGISTER_REG |
 979		MI_LRR_SOURCE_CS_MMIO |
 980		MI_LRI_LRM_CS_MMIO;
 981	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
 982	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
 983
 984	return cs;
 985}
 986
 987static u32 *
 988gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
 989{
 990	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
 991
 992	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
 993		MI_SRM_LRM_GLOBAL_GTT |
 994		MI_LRI_LRM_CS_MMIO;
 995	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
 996	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
 997		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
 998	*cs++ = 0;
 999
1000	return cs;
1001}
1002
1003static u32 *
1004gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1005{
1006	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1007
1008	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1009		MI_SRM_LRM_GLOBAL_GTT |
1010		MI_LRI_LRM_CS_MMIO;
1011	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1012	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1013		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1014	*cs++ = 0;
1015
1016	*cs++ = MI_LOAD_REGISTER_REG |
1017		MI_LRR_SOURCE_CS_MMIO |
1018		MI_LRI_LRM_CS_MMIO;
1019	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1020	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1021
1022	return cs;
1023}
1024
1025static u32 *
1026gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1027{
1028	cs = gen12_emit_timestamp_wa(ce, cs);
1029	cs = gen12_emit_cmd_buf_wa(ce, cs);
1030	cs = gen12_emit_restore_scratch(ce, cs);
1031
1032	return cs;
1033}
1034
1035static u32 *
1036gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1037{
1038	cs = gen12_emit_timestamp_wa(ce, cs);
1039	cs = gen12_emit_restore_scratch(ce, cs);
1040
1041	return cs;
1042}
1043
1044static u32 context_wa_bb_offset(const struct intel_context *ce)
1045{
1046	return PAGE_SIZE * ce->wa_bb_page;
1047}
1048
1049static u32 *context_indirect_bb(const struct intel_context *ce)
1050{
1051	void *ptr;
1052
1053	GEM_BUG_ON(!ce->wa_bb_page);
1054
1055	ptr = ce->lrc_reg_state;
1056	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1057	ptr += context_wa_bb_offset(ce);
1058
1059	return ptr;
1060}
1061
1062static void
1063setup_indirect_ctx_bb(const struct intel_context *ce,
1064		      const struct intel_engine_cs *engine,
1065		      u32 *(*emit)(const struct intel_context *, u32 *))
1066{
1067	u32 * const start = context_indirect_bb(ce);
1068	u32 *cs;
1069
1070	cs = emit(ce, start);
1071	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1072	while ((unsigned long)cs % CACHELINE_BYTES)
1073		*cs++ = MI_NOOP;
1074
1075	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1076			       i915_ggtt_offset(ce->state) +
1077			       context_wa_bb_offset(ce),
1078			       (cs - start) * sizeof(*cs));
1079}
1080
1081/*
1082 * The context descriptor encodes various attributes of a context,
1083 * including its GTT address and some flags. Because it's fairly
1084 * expensive to calculate, we'll just do it once and cache the result,
1085 * which remains valid until the context is unpinned.
1086 *
1087 * This is what a descriptor looks like, from LSB to MSB::
1088 *
1089 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1090 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1091 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1092 *      bits 53-54:    mbz, reserved for use by hardware
1093 *      bits 55-63:    group ID, currently unused and set to 0
1094 *
1095 * Starting from Gen11, the upper dword of the descriptor has a new format:
1096 *
1097 *      bits 32-36:    reserved
1098 *      bits 37-47:    SW context ID
1099 *      bits 48-53:    engine instance
1100 *      bit 54:        mbz, reserved for use by hardware
1101 *      bits 55-60:    SW counter
1102 *      bits 61-63:    engine class
1103 *
1104 * engine info, SW context ID and SW counter need to form a unique number
1105 * (Context ID) per lrc.
1106 */
1107static u32 lrc_descriptor(const struct intel_context *ce)
1108{
1109	u32 desc;
1110
1111	desc = INTEL_LEGACY_32B_CONTEXT;
1112	if (i915_vm_is_4lvl(ce->vm))
1113		desc = INTEL_LEGACY_64B_CONTEXT;
1114	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1115
1116	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1117	if (GRAPHICS_VER(ce->vm->i915) == 8)
1118		desc |= GEN8_CTX_L3LLC_COHERENT;
1119
1120	return i915_ggtt_offset(ce->state) | desc;
1121}
1122
1123u32 lrc_update_regs(const struct intel_context *ce,
1124		    const struct intel_engine_cs *engine,
1125		    u32 head)
1126{
1127	struct intel_ring *ring = ce->ring;
1128	u32 *regs = ce->lrc_reg_state;
1129
1130	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1131	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1132
1133	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1134	regs[CTX_RING_HEAD] = head;
1135	regs[CTX_RING_TAIL] = ring->tail;
1136	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1137
1138	/* RPCS */
1139	if (engine->class == RENDER_CLASS) {
1140		regs[CTX_R_PWR_CLK_STATE] =
1141			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1142
1143		i915_oa_init_reg_state(ce, engine);
1144	}
1145
1146	if (ce->wa_bb_page) {
1147		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1148
1149		fn = gen12_emit_indirect_ctx_xcs;
1150		if (ce->engine->class == RENDER_CLASS)
1151			fn = gen12_emit_indirect_ctx_rcs;
1152
1153		/* Mutually exclusive with the global indirect bb */
1154		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1155		setup_indirect_ctx_bb(ce, engine, fn);
1156	}
1157
1158	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1159}
1160
1161void lrc_update_offsets(struct intel_context *ce,
1162			struct intel_engine_cs *engine)
1163{
1164	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1165}
1166
1167void lrc_check_regs(const struct intel_context *ce,
1168		    const struct intel_engine_cs *engine,
1169		    const char *when)
1170{
1171	const struct intel_ring *ring = ce->ring;
1172	u32 *regs = ce->lrc_reg_state;
1173	bool valid = true;
1174	int x;
1175
1176	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1177		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1178		       engine->name,
1179		       regs[CTX_RING_START],
1180		       i915_ggtt_offset(ring->vma));
1181		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1182		valid = false;
1183	}
1184
1185	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1186	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1187		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1188		       engine->name,
1189		       regs[CTX_RING_CTL],
1190		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1191		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1192		valid = false;
1193	}
1194
1195	x = lrc_ring_mi_mode(engine);
1196	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1197		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1198		       engine->name, regs[x + 1]);
1199		regs[x + 1] &= ~STOP_RING;
1200		regs[x + 1] |= STOP_RING << 16;
1201		valid = false;
1202	}
1203
1204	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1205}
1206
1207/*
1208 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
1209 * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
1210 * but there is a slight complication as this is applied in a WA batch where the
1211 * values are only initialized once, so we cannot take the register value at the
1212 * beginning and reuse it further; hence we save its value to memory, upload a
1213 * constant value with bit21 set and then restore it with the saved value.
1214 * To simplify the WA, a constant value is formed by using the default value
1215 * of this register. This shouldn't be a problem because we are only modifying
1216 * it for a short period and this batch is non-preemptible. We could of course
1217 * use additional instructions that read the actual value of the register
1218 * at that time and set our bit of interest, but that makes the WA complicated.
1219 *
1220 * This WA is also required for Gen9, so extracting it as a function avoids
1221 * code duplication.
1222 */
1223static u32 *
1224gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1225{
1226	/* NB no one else is allowed to scribble over scratch + 256! */
1227	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1228	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1229	*batch++ = intel_gt_scratch_offset(engine->gt,
1230					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1231	*batch++ = 0;
1232
1233	*batch++ = MI_LOAD_REGISTER_IMM(1);
1234	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1235	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1236
1237	batch = gen8_emit_pipe_control(batch,
1238				       PIPE_CONTROL_CS_STALL |
1239				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1240				       0);
1241
1242	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1243	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1244	*batch++ = intel_gt_scratch_offset(engine->gt,
1245					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1246	*batch++ = 0;
1247
1248	return batch;
1249}
1250
1251/*
1252 * Typically we only have one indirect_ctx and one per_ctx batch buffer, which
1253 * are initialized at the beginning and shared across all contexts, but this field
1254 * helps us to have multiple batches at different offsets and select them based
1255 * on some criteria. At the moment this batch always starts at the beginning of the
1256 * page and at this point we don't have multiple wa_ctx batch buffers.
1257 *
1258 * The number of WAs applied is not known at the beginning; we use this field
1259 * to return the number of DWORDS written.
1260 *
1261 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
1262 * so it adds NOOPs as padding to make it cacheline aligned.
1263 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
1264 * together make a complete batch buffer.
1265 */
1266static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1267{
1268	/* WaDisableCtxRestoreArbitration:bdw,chv */
1269	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1270
1271	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1272	if (IS_BROADWELL(engine->i915))
1273		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1274
1275	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1276	/* Actual scratch location is at 128 bytes offset */
1277	batch = gen8_emit_pipe_control(batch,
1278				       PIPE_CONTROL_FLUSH_L3 |
1279				       PIPE_CONTROL_STORE_DATA_INDEX |
1280				       PIPE_CONTROL_CS_STALL |
1281				       PIPE_CONTROL_QW_WRITE,
1282				       LRC_PPHWSP_SCRATCH_ADDR);
1283
1284	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1285
1286	/* Pad to end of cacheline */
1287	while ((unsigned long)batch % CACHELINE_BYTES)
1288		*batch++ = MI_NOOP;
1289
1290	/*
1291	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
1292	 * execution depends on the length specified in terms of cache lines
1293	 * in the CTX_RCS_INDIRECT_CTX register.
1294	 */
1295
1296	return batch;
1297}
1298
1299struct lri {
1300	i915_reg_t reg;
1301	u32 value;
1302};
1303
1304static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1305{
1306	GEM_BUG_ON(!count || count > 63);
1307
1308	*batch++ = MI_LOAD_REGISTER_IMM(count);
1309	do {
1310		*batch++ = i915_mmio_reg_offset(lri->reg);
1311		*batch++ = lri->value;
1312	} while (lri++, --count);
1313	*batch++ = MI_NOOP;
1314
1315	return batch;
1316}
1317
1318static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1319{
1320	static const struct lri lri[] = {
1321		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1322		{
1323			COMMON_SLICE_CHICKEN2,
1324			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1325				       0),
1326		},
1327
1328		/* BSpec: 11391 */
1329		{
1330			FF_SLICE_CHICKEN,
1331			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1332				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1333		},
1334
1335		/* BSpec: 11299 */
1336		{
1337			_3D_CHICKEN3,
1338			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1339				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1340		}
1341	};
1342
1343	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1344
1345	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1346	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1347
1348	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1349	batch = gen8_emit_pipe_control(batch,
1350				       PIPE_CONTROL_FLUSH_L3 |
1351				       PIPE_CONTROL_STORE_DATA_INDEX |
1352				       PIPE_CONTROL_CS_STALL |
1353				       PIPE_CONTROL_QW_WRITE,
1354				       LRC_PPHWSP_SCRATCH_ADDR);
1355
1356	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1357
1358	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1359	if (HAS_POOLED_EU(engine->i915)) {
1360		/*
1361		 * EU pool configuration is set up along with the golden context
1362		 * during context initialization. This value depends on the
1363		 * device type (2x6 or 3x6) and needs to be updated based
1364		 * on which subslice is disabled, especially for 2x6
1365		 * devices; however, it is safe to load the default
1366		 * configuration of a 3x6 device instead of masking off the
1367		 * corresponding bits because HW ignores bits of a disabled
1368		 * subslice and drops down to the appropriate config. Please
1369		 * see render_state_setup() in i915_gem_render_state.c for the
1370		 * possible configurations; to avoid duplication they are
1371		 * not shown here again.
1372		 */
1373		*batch++ = GEN9_MEDIA_POOL_STATE;
1374		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1375		*batch++ = 0x00777000;
1376		*batch++ = 0;
1377		*batch++ = 0;
1378		*batch++ = 0;
1379	}
1380
1381	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1382
1383	/* Pad to end of cacheline */
1384	while ((unsigned long)batch % CACHELINE_BYTES)
1385		*batch++ = MI_NOOP;
1386
1387	return batch;
1388}
1389
1390static u32 *
1391gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1392{
1393	int i;
1394
1395	/*
1396	 * WaPipeControlBefore3DStateSamplePattern: cnl
1397	 *
1398	 * Ensure the engine is idle prior to programming a
1399	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
1400	 */
1401	batch = gen8_emit_pipe_control(batch,
1402				       PIPE_CONTROL_CS_STALL,
1403				       0);
1404	/*
1405	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1406	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1407	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1408	 * confusing. Since gen8_emit_pipe_control() already advances the
1409	 * batch by 6 dwords, we advance the other 10 here, completing a
1410	 * cacheline. It's not clear if the workaround requires this padding
1411	 * before other commands, or if it's just the regular padding we would
1412	 * already have for the workaround bb, so leave it here for now.
1413	 */
1414	for (i = 0; i < 10; i++)
1415		*batch++ = MI_NOOP;
1416
1417	/* Pad to end of cacheline */
1418	while ((unsigned long)batch % CACHELINE_BYTES)
1419		*batch++ = MI_NOOP;
1420
1421	return batch;
1422}
1423
1424#define CTX_WA_BB_SIZE (PAGE_SIZE)
1425
1426static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1427{
1428	struct drm_i915_gem_object *obj;
1429	struct i915_vma *vma;
1430	int err;
1431
1432	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1433	if (IS_ERR(obj))
1434		return PTR_ERR(obj);
1435
1436	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1437	if (IS_ERR(vma)) {
1438		err = PTR_ERR(vma);
1439		goto err;
1440	}
1441
1442	engine->wa_ctx.vma = vma;
1443	return 0;
1444
1445err:
1446	i915_gem_object_put(obj);
1447	return err;
1448}
1449
1450void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1451{
1452	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1453}
1454
1455typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1456
1457void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1458{
1459	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1460	struct i915_wa_ctx_bb *wa_bb[] = {
1461		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1462	};
1463	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1464	struct i915_gem_ww_ctx ww;
1465	void *batch, *batch_ptr;
1466	unsigned int i;
1467	int err;
1468
1469	if (engine->class != RENDER_CLASS)
1470		return;
1471
1472	switch (GRAPHICS_VER(engine->i915)) {
1473	case 12:
1474	case 11:
1475		return;
1476	case 10:
1477		wa_bb_fn[0] = gen10_init_indirectctx_bb;
1478		wa_bb_fn[1] = NULL;
1479		break;
1480	case 9:
1481		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1482		wa_bb_fn[1] = NULL;
1483		break;
1484	case 8:
1485		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1486		wa_bb_fn[1] = NULL;
1487		break;
1488	default:
1489		MISSING_CASE(GRAPHICS_VER(engine->i915));
1490		return;
1491	}
1492
1493	err = lrc_create_wa_ctx(engine);
1494	if (err) {
1495		/*
1496		 * We continue even if we fail to initialize the WA batch
1497		 * because we only expect rare glitches, nothing critical
1498		 * enough to prevent us from using the GPU.
1499		 */
1500		drm_err(&engine->i915->drm,
1501			"Ignoring context switch w/a allocation error:%d\n",
1502			err);
1503		return;
1504	}
1505
1506	if (!engine->wa_ctx.vma)
1507		return;
1508
1509	i915_gem_ww_ctx_init(&ww, true);
1510retry:
1511	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1512	if (!err)
1513		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1514	if (err)
1515		goto err;
1516
1517	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1518	if (IS_ERR(batch)) {
1519		err = PTR_ERR(batch);
1520		goto err_unpin;
1521	}
1522
1523	/*
1524	 * Emit the two workaround batch buffers, recording the offset from the
1525	 * start of the workaround batch buffer object for each and their
1526	 * respective sizes.
1527	 */
1528	batch_ptr = batch;
1529	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1530		wa_bb[i]->offset = batch_ptr - batch;
1531		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1532						  CACHELINE_BYTES))) {
1533			err = -EINVAL;
1534			break;
1535		}
1536		if (wa_bb_fn[i])
1537			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1538		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1539	}
1540	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1541
1542	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1543	__i915_gem_object_release_map(wa_ctx->vma->obj);
1544
1545	/* Verify that we can handle failure to set up the wa_ctx */
1546	if (!err)
1547		err = i915_inject_probe_error(engine->i915, -ENODEV);
1548
1549err_unpin:
1550	if (err)
1551		i915_vma_unpin(wa_ctx->vma);
1552err:
1553	if (err == -EDEADLK) {
1554		err = i915_gem_ww_ctx_backoff(&ww);
1555		if (!err)
1556			goto retry;
1557	}
1558	i915_gem_ww_ctx_fini(&ww);
1559
1560	if (err) {
1561		i915_vma_put(engine->wa_ctx.vma);
1562
1563		/* Clear all flags to prevent further use */
1564		memset(wa_ctx, 0, sizeof(*wa_ctx));
1565	}
1566}
1567
1568static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1569{
1570#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1571	ce->runtime.num_underflow++;
1572	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1573#endif
1574}
1575
1576void lrc_update_runtime(struct intel_context *ce)
1577{
1578	u32 old;
1579	s32 dt;
1580
1581	if (intel_context_is_barrier(ce))
1582		return;
1583
1584	old = ce->runtime.last;
1585	ce->runtime.last = lrc_get_runtime(ce);
1586	dt = ce->runtime.last - old;
1587
1588	if (unlikely(dt < 0)) {
1589		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1590			 old, ce->runtime.last, dt);
1591		st_update_runtime_underflow(ce, dt);
1592		return;
1593	}
1594
1595	ewma_runtime_add(&ce->runtime.avg, dt);
1596	ce->runtime.total += dt;
1597}
1598
1599#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1600#include "selftest_lrc.c"
1601#endif