   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2014 Intel Corporation
   4 */
   5
   6#include "gem/i915_gem_lmem.h"
   7
   8#include "gen8_engine_cs.h"
   9#include "i915_drv.h"
  10#include "i915_perf.h"
  11#include "i915_reg.h"
  12#include "intel_context.h"
  13#include "intel_engine.h"
  14#include "intel_engine_regs.h"
  15#include "intel_gpu_commands.h"
  16#include "intel_gt.h"
  17#include "intel_gt_regs.h"
  18#include "intel_lrc.h"
  19#include "intel_lrc_reg.h"
  20#include "intel_ring.h"
  21#include "shmem_utils.h"
  22
  23/*
  24 * The per-platform tables are u8-encoded in @data. Decode @data and set the
   25 * register offsets and commands in @regs. The following encoding is used
  26 * for each byte. There are 2 steps: decoding commands and decoding addresses.
  27 *
  28 * Commands:
   29 * [7]: create NOPs - the number of NOPs is set in the lower bits
   30 * [6]: when creating an MI_LOAD_REGISTER_IMM command, allows setting
   31 *      MI_LRI_FORCE_POSTED
  32 * [5:0]: Number of NOPs or registers to set values to in case of
  33 *        MI_LOAD_REGISTER_IMM
  34 *
   35 * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command by "count"
   36 * number of registers. They are set using the REG/REG16 macros: the former is
   37 * used for offsets smaller than 0x200, the latter for offsets of 0x200 and
   38 * above. Those macros already set all the bits documented below correctly:
  39 *
   40 * [7]: when a register offset needs more than 7 bits, additional bytes follow,
   41 *      carrying the lower bits; this byte then holds the upper bits
  42 * [6:0]: Register offset, without considering the engine base.
  43 *
  44 * This function only tweaks the commands and register offsets. Values are not
  45 * filled out.
  46 */
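/*
 * Illustrative example (not taken from any real platform table): with the
 * NOP/LRI/REG/REG16 macros defined below, the sequence
 *
 *	NOP(1), LRI(2, POSTED), REG(0x034), REG16(0x244), END
 *
 * encodes to the bytes 0x81, 0x42, 0x0d, 0x81, 0x11, 0x00 and decodes as:
 *
 *	0x81       bit 7 set: skip one dword in @regs
 *	0x42       count = 2, flags = POSTED: emit MI_LOAD_REGISTER_IMM(2) with
 *	           MI_LRI_FORCE_POSTED (plus MI_LRI_LRM_CS_MMIO on gen11+)
 *	0x0d       single byte: regs[0] = base + (0x0d << 2) = base + 0x34
 *	0x81 0x11  continuation: offset = (0x01 << 7) | 0x11 = 0x91,
 *	           regs[0] = base + (0x91 << 2) = base + 0x244
 *	0x00       END, stop decoding
 *
 * Only the command dwords and register offsets are written; the value slots
 * are left untouched.
 */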
  47static void set_offsets(u32 *regs,
  48			const u8 *data,
  49			const struct intel_engine_cs *engine,
  50			bool close)
  51#define NOP(x) (BIT(7) | (x))
  52#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
  53#define POSTED BIT(0)
  54#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
  55#define REG16(x) \
  56	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
  57	(((x) >> 2) & 0x7f)
  58#define END 0
  59{
  60	const u32 base = engine->mmio_base;
  61
  62	while (*data) {
  63		u8 count, flags;
  64
  65		if (*data & BIT(7)) { /* skip */
  66			count = *data++ & ~BIT(7);
  67			regs += count;
  68			continue;
  69		}
  70
  71		count = *data & 0x3f;
  72		flags = *data >> 6;
  73		data++;
  74
  75		*regs = MI_LOAD_REGISTER_IMM(count);
  76		if (flags & POSTED)
  77			*regs |= MI_LRI_FORCE_POSTED;
  78		if (GRAPHICS_VER(engine->i915) >= 11)
  79			*regs |= MI_LRI_LRM_CS_MMIO;
  80		regs++;
  81
  82		GEM_BUG_ON(!count);
  83		do {
  84			u32 offset = 0;
  85			u8 v;
  86
  87			do {
  88				v = *data++;
  89				offset <<= 7;
  90				offset |= v & ~BIT(7);
  91			} while (v & BIT(7));
  92
  93			regs[0] = base + (offset << 2);
  94			regs += 2;
  95		} while (--count);
  96	}
  97
  98	if (close) {
  99		/* Close the batch; used mainly by live_lrc_layout() */
 100		*regs = MI_BATCH_BUFFER_END;
 101		if (GRAPHICS_VER(engine->i915) >= 11)
 102			*regs |= BIT(0);
 103	}
 104}
 105
 106static const u8 gen8_xcs_offsets[] = {
 107	NOP(1),
 108	LRI(11, 0),
 109	REG16(0x244),
 110	REG(0x034),
 111	REG(0x030),
 112	REG(0x038),
 113	REG(0x03c),
 114	REG(0x168),
 115	REG(0x140),
 116	REG(0x110),
 117	REG(0x11c),
 118	REG(0x114),
 119	REG(0x118),
 120
 121	NOP(9),
 122	LRI(9, 0),
 123	REG16(0x3a8),
 124	REG16(0x28c),
 125	REG16(0x288),
 126	REG16(0x284),
 127	REG16(0x280),
 128	REG16(0x27c),
 129	REG16(0x278),
 130	REG16(0x274),
 131	REG16(0x270),
 132
 133	NOP(13),
 134	LRI(2, 0),
 135	REG16(0x200),
 136	REG(0x028),
 137
 138	END
 139};
 140
 141static const u8 gen9_xcs_offsets[] = {
 142	NOP(1),
 143	LRI(14, POSTED),
 144	REG16(0x244),
 145	REG(0x034),
 146	REG(0x030),
 147	REG(0x038),
 148	REG(0x03c),
 149	REG(0x168),
 150	REG(0x140),
 151	REG(0x110),
 152	REG(0x11c),
 153	REG(0x114),
 154	REG(0x118),
 155	REG(0x1c0),
 156	REG(0x1c4),
 157	REG(0x1c8),
 158
 159	NOP(3),
 160	LRI(9, POSTED),
 161	REG16(0x3a8),
 162	REG16(0x28c),
 163	REG16(0x288),
 164	REG16(0x284),
 165	REG16(0x280),
 166	REG16(0x27c),
 167	REG16(0x278),
 168	REG16(0x274),
 169	REG16(0x270),
 170
 171	NOP(13),
 172	LRI(1, POSTED),
 173	REG16(0x200),
 174
 175	NOP(13),
 176	LRI(44, POSTED),
 177	REG(0x028),
 178	REG(0x09c),
 179	REG(0x0c0),
 180	REG(0x178),
 181	REG(0x17c),
 182	REG16(0x358),
 183	REG(0x170),
 184	REG(0x150),
 185	REG(0x154),
 186	REG(0x158),
 187	REG16(0x41c),
 188	REG16(0x600),
 189	REG16(0x604),
 190	REG16(0x608),
 191	REG16(0x60c),
 192	REG16(0x610),
 193	REG16(0x614),
 194	REG16(0x618),
 195	REG16(0x61c),
 196	REG16(0x620),
 197	REG16(0x624),
 198	REG16(0x628),
 199	REG16(0x62c),
 200	REG16(0x630),
 201	REG16(0x634),
 202	REG16(0x638),
 203	REG16(0x63c),
 204	REG16(0x640),
 205	REG16(0x644),
 206	REG16(0x648),
 207	REG16(0x64c),
 208	REG16(0x650),
 209	REG16(0x654),
 210	REG16(0x658),
 211	REG16(0x65c),
 212	REG16(0x660),
 213	REG16(0x664),
 214	REG16(0x668),
 215	REG16(0x66c),
 216	REG16(0x670),
 217	REG16(0x674),
 218	REG16(0x678),
 219	REG16(0x67c),
 220	REG(0x068),
 221
 222	END
 223};
 224
 225static const u8 gen12_xcs_offsets[] = {
 226	NOP(1),
 227	LRI(13, POSTED),
 228	REG16(0x244),
 229	REG(0x034),
 230	REG(0x030),
 231	REG(0x038),
 232	REG(0x03c),
 233	REG(0x168),
 234	REG(0x140),
 235	REG(0x110),
 236	REG(0x1c0),
 237	REG(0x1c4),
 238	REG(0x1c8),
 239	REG(0x180),
 240	REG16(0x2b4),
 241
 242	NOP(5),
 243	LRI(9, POSTED),
 244	REG16(0x3a8),
 245	REG16(0x28c),
 246	REG16(0x288),
 247	REG16(0x284),
 248	REG16(0x280),
 249	REG16(0x27c),
 250	REG16(0x278),
 251	REG16(0x274),
 252	REG16(0x270),
 253
 254	END
 255};
 256
 257static const u8 dg2_xcs_offsets[] = {
 258	NOP(1),
 259	LRI(15, POSTED),
 260	REG16(0x244),
 261	REG(0x034),
 262	REG(0x030),
 263	REG(0x038),
 264	REG(0x03c),
 265	REG(0x168),
 266	REG(0x140),
 267	REG(0x110),
 268	REG(0x1c0),
 269	REG(0x1c4),
 270	REG(0x1c8),
 271	REG(0x180),
 272	REG16(0x2b4),
 273	REG(0x120),
 274	REG(0x124),
 275
 276	NOP(1),
 277	LRI(9, POSTED),
 278	REG16(0x3a8),
 279	REG16(0x28c),
 280	REG16(0x288),
 281	REG16(0x284),
 282	REG16(0x280),
 283	REG16(0x27c),
 284	REG16(0x278),
 285	REG16(0x274),
 286	REG16(0x270),
 287
 288	END
 289};
 290
 291static const u8 gen8_rcs_offsets[] = {
 292	NOP(1),
 293	LRI(14, POSTED),
 294	REG16(0x244),
 295	REG(0x034),
 296	REG(0x030),
 297	REG(0x038),
 298	REG(0x03c),
 299	REG(0x168),
 300	REG(0x140),
 301	REG(0x110),
 302	REG(0x11c),
 303	REG(0x114),
 304	REG(0x118),
 305	REG(0x1c0),
 306	REG(0x1c4),
 307	REG(0x1c8),
 308
 309	NOP(3),
 310	LRI(9, POSTED),
 311	REG16(0x3a8),
 312	REG16(0x28c),
 313	REG16(0x288),
 314	REG16(0x284),
 315	REG16(0x280),
 316	REG16(0x27c),
 317	REG16(0x278),
 318	REG16(0x274),
 319	REG16(0x270),
 320
 321	NOP(13),
 322	LRI(1, 0),
 323	REG(0x0c8),
 324
 325	END
 326};
 327
 328static const u8 gen9_rcs_offsets[] = {
 329	NOP(1),
 330	LRI(14, POSTED),
 331	REG16(0x244),
 332	REG(0x34),
 333	REG(0x30),
 334	REG(0x38),
 335	REG(0x3c),
 336	REG(0x168),
 337	REG(0x140),
 338	REG(0x110),
 339	REG(0x11c),
 340	REG(0x114),
 341	REG(0x118),
 342	REG(0x1c0),
 343	REG(0x1c4),
 344	REG(0x1c8),
 345
 346	NOP(3),
 347	LRI(9, POSTED),
 348	REG16(0x3a8),
 349	REG16(0x28c),
 350	REG16(0x288),
 351	REG16(0x284),
 352	REG16(0x280),
 353	REG16(0x27c),
 354	REG16(0x278),
 355	REG16(0x274),
 356	REG16(0x270),
 357
 358	NOP(13),
 359	LRI(1, 0),
 360	REG(0xc8),
 361
 362	NOP(13),
 363	LRI(44, POSTED),
 364	REG(0x28),
 365	REG(0x9c),
 366	REG(0xc0),
 367	REG(0x178),
 368	REG(0x17c),
 369	REG16(0x358),
 370	REG(0x170),
 371	REG(0x150),
 372	REG(0x154),
 373	REG(0x158),
 374	REG16(0x41c),
 375	REG16(0x600),
 376	REG16(0x604),
 377	REG16(0x608),
 378	REG16(0x60c),
 379	REG16(0x610),
 380	REG16(0x614),
 381	REG16(0x618),
 382	REG16(0x61c),
 383	REG16(0x620),
 384	REG16(0x624),
 385	REG16(0x628),
 386	REG16(0x62c),
 387	REG16(0x630),
 388	REG16(0x634),
 389	REG16(0x638),
 390	REG16(0x63c),
 391	REG16(0x640),
 392	REG16(0x644),
 393	REG16(0x648),
 394	REG16(0x64c),
 395	REG16(0x650),
 396	REG16(0x654),
 397	REG16(0x658),
 398	REG16(0x65c),
 399	REG16(0x660),
 400	REG16(0x664),
 401	REG16(0x668),
 402	REG16(0x66c),
 403	REG16(0x670),
 404	REG16(0x674),
 405	REG16(0x678),
 406	REG16(0x67c),
 407	REG(0x68),
 408
 409	END
 410};
 411
 412static const u8 gen11_rcs_offsets[] = {
 413	NOP(1),
 414	LRI(15, POSTED),
 415	REG16(0x244),
 416	REG(0x034),
 417	REG(0x030),
 418	REG(0x038),
 419	REG(0x03c),
 420	REG(0x168),
 421	REG(0x140),
 422	REG(0x110),
 423	REG(0x11c),
 424	REG(0x114),
 425	REG(0x118),
 426	REG(0x1c0),
 427	REG(0x1c4),
 428	REG(0x1c8),
 429	REG(0x180),
 430
 431	NOP(1),
 432	LRI(9, POSTED),
 433	REG16(0x3a8),
 434	REG16(0x28c),
 435	REG16(0x288),
 436	REG16(0x284),
 437	REG16(0x280),
 438	REG16(0x27c),
 439	REG16(0x278),
 440	REG16(0x274),
 441	REG16(0x270),
 442
 443	LRI(1, POSTED),
 444	REG(0x1b0),
 445
 446	NOP(10),
 447	LRI(1, 0),
 448	REG(0x0c8),
 449
 450	END
 451};
 452
 453static const u8 gen12_rcs_offsets[] = {
 454	NOP(1),
 455	LRI(13, POSTED),
 456	REG16(0x244),
 457	REG(0x034),
 458	REG(0x030),
 459	REG(0x038),
 460	REG(0x03c),
 461	REG(0x168),
 462	REG(0x140),
 463	REG(0x110),
 464	REG(0x1c0),
 465	REG(0x1c4),
 466	REG(0x1c8),
 467	REG(0x180),
 468	REG16(0x2b4),
 469
 470	NOP(5),
 471	LRI(9, POSTED),
 472	REG16(0x3a8),
 473	REG16(0x28c),
 474	REG16(0x288),
 475	REG16(0x284),
 476	REG16(0x280),
 477	REG16(0x27c),
 478	REG16(0x278),
 479	REG16(0x274),
 480	REG16(0x270),
 481
 482	LRI(3, POSTED),
 483	REG(0x1b0),
 484	REG16(0x5a8),
 485	REG16(0x5ac),
 486
 487	NOP(6),
 488	LRI(1, 0),
 489	REG(0x0c8),
 490	NOP(3 + 9 + 1),
 491
 492	LRI(51, POSTED),
 493	REG16(0x588),
 494	REG16(0x588),
 495	REG16(0x588),
 496	REG16(0x588),
 497	REG16(0x588),
 498	REG16(0x588),
 499	REG(0x028),
 500	REG(0x09c),
 501	REG(0x0c0),
 502	REG(0x178),
 503	REG(0x17c),
 504	REG16(0x358),
 505	REG(0x170),
 506	REG(0x150),
 507	REG(0x154),
 508	REG(0x158),
 509	REG16(0x41c),
 510	REG16(0x600),
 511	REG16(0x604),
 512	REG16(0x608),
 513	REG16(0x60c),
 514	REG16(0x610),
 515	REG16(0x614),
 516	REG16(0x618),
 517	REG16(0x61c),
 518	REG16(0x620),
 519	REG16(0x624),
 520	REG16(0x628),
 521	REG16(0x62c),
 522	REG16(0x630),
 523	REG16(0x634),
 524	REG16(0x638),
 525	REG16(0x63c),
 526	REG16(0x640),
 527	REG16(0x644),
 528	REG16(0x648),
 529	REG16(0x64c),
 530	REG16(0x650),
 531	REG16(0x654),
 532	REG16(0x658),
 533	REG16(0x65c),
 534	REG16(0x660),
 535	REG16(0x664),
 536	REG16(0x668),
 537	REG16(0x66c),
 538	REG16(0x670),
 539	REG16(0x674),
 540	REG16(0x678),
 541	REG16(0x67c),
 542	REG(0x068),
 543	REG(0x084),
 544	NOP(1),
 545
 546	END
 547};
 548
 549static const u8 xehp_rcs_offsets[] = {
 550	NOP(1),
 551	LRI(13, POSTED),
 552	REG16(0x244),
 553	REG(0x034),
 554	REG(0x030),
 555	REG(0x038),
 556	REG(0x03c),
 557	REG(0x168),
 558	REG(0x140),
 559	REG(0x110),
 560	REG(0x1c0),
 561	REG(0x1c4),
 562	REG(0x1c8),
 563	REG(0x180),
 564	REG16(0x2b4),
 565
 566	NOP(5),
 567	LRI(9, POSTED),
 568	REG16(0x3a8),
 569	REG16(0x28c),
 570	REG16(0x288),
 571	REG16(0x284),
 572	REG16(0x280),
 573	REG16(0x27c),
 574	REG16(0x278),
 575	REG16(0x274),
 576	REG16(0x270),
 577
 578	LRI(3, POSTED),
 579	REG(0x1b0),
 580	REG16(0x5a8),
 581	REG16(0x5ac),
 582
 583	NOP(6),
 584	LRI(1, 0),
 585	REG(0x0c8),
 586
 587	END
 588};
 589
 590static const u8 dg2_rcs_offsets[] = {
 591	NOP(1),
 592	LRI(15, POSTED),
 593	REG16(0x244),
 594	REG(0x034),
 595	REG(0x030),
 596	REG(0x038),
 597	REG(0x03c),
 598	REG(0x168),
 599	REG(0x140),
 600	REG(0x110),
 601	REG(0x1c0),
 602	REG(0x1c4),
 603	REG(0x1c8),
 604	REG(0x180),
 605	REG16(0x2b4),
 606	REG(0x120),
 607	REG(0x124),
 608
 609	NOP(1),
 610	LRI(9, POSTED),
 611	REG16(0x3a8),
 612	REG16(0x28c),
 613	REG16(0x288),
 614	REG16(0x284),
 615	REG16(0x280),
 616	REG16(0x27c),
 617	REG16(0x278),
 618	REG16(0x274),
 619	REG16(0x270),
 620
 621	LRI(3, POSTED),
 622	REG(0x1b0),
 623	REG16(0x5a8),
 624	REG16(0x5ac),
 625
 626	NOP(6),
 627	LRI(1, 0),
 628	REG(0x0c8),
 629
 630	END
 631};
 632
 633static const u8 mtl_rcs_offsets[] = {
 634	NOP(1),
 635	LRI(15, POSTED),
 636	REG16(0x244),
 637	REG(0x034),
 638	REG(0x030),
 639	REG(0x038),
 640	REG(0x03c),
 641	REG(0x168),
 642	REG(0x140),
 643	REG(0x110),
 644	REG(0x1c0),
 645	REG(0x1c4),
 646	REG(0x1c8),
 647	REG(0x180),
 648	REG16(0x2b4),
 649	REG(0x120),
 650	REG(0x124),
 651
 652	NOP(1),
 653	LRI(9, POSTED),
 654	REG16(0x3a8),
 655	REG16(0x28c),
 656	REG16(0x288),
 657	REG16(0x284),
 658	REG16(0x280),
 659	REG16(0x27c),
 660	REG16(0x278),
 661	REG16(0x274),
 662	REG16(0x270),
 663
 664	NOP(2),
 665	LRI(2, POSTED),
 666	REG16(0x5a8),
 667	REG16(0x5ac),
 668
 669	NOP(6),
 670	LRI(1, 0),
 671	REG(0x0c8),
 672
 673	END
 674};
 675
 676#undef END
 677#undef REG16
 678#undef REG
 679#undef LRI
 680#undef NOP
 681
 682static const u8 *reg_offsets(const struct intel_engine_cs *engine)
 683{
 684	/*
 685	 * The gen12+ lists only have the registers we program in the basic
 686	 * default state. We rely on the context image using relative
  687	 * addressing to automatically fix up the register state between the
  688	 * physical engines backing a virtual engine.
 689	 */
 690	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
 691		   !intel_engine_has_relative_mmio(engine));
 692
 693	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
 694		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
 695			return mtl_rcs_offsets;
 696		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 697			return dg2_rcs_offsets;
 698		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 699			return xehp_rcs_offsets;
 700		else if (GRAPHICS_VER(engine->i915) >= 12)
 701			return gen12_rcs_offsets;
 702		else if (GRAPHICS_VER(engine->i915) >= 11)
 703			return gen11_rcs_offsets;
 704		else if (GRAPHICS_VER(engine->i915) >= 9)
 705			return gen9_rcs_offsets;
 706		else
 707			return gen8_rcs_offsets;
 708	} else {
 709		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 710			return dg2_xcs_offsets;
 711		else if (GRAPHICS_VER(engine->i915) >= 12)
 712			return gen12_xcs_offsets;
 713		else if (GRAPHICS_VER(engine->i915) >= 9)
 714			return gen9_xcs_offsets;
 715		else
 716			return gen8_xcs_offsets;
 717	}
 718}
 719
 720static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 721{
 722	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 723		return 0x70;
 724	else if (GRAPHICS_VER(engine->i915) >= 12)
 725		return 0x60;
 726	else if (GRAPHICS_VER(engine->i915) >= 9)
 727		return 0x54;
 728	else if (engine->class == RENDER_CLASS)
 729		return 0x58;
 730	else
 731		return -1;
 732}
 733
 734static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
 735{
 736	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 737		return 0x80;
 738	else if (GRAPHICS_VER(engine->i915) >= 12)
 739		return 0x70;
 740	else if (GRAPHICS_VER(engine->i915) >= 9)
 741		return 0x64;
 742	else if (GRAPHICS_VER(engine->i915) >= 8 &&
 743		 engine->class == RENDER_CLASS)
 744		return 0xc4;
 745	else
 746		return -1;
 747}
 748
 749static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 750{
 751	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 752		return 0x84;
 753	else if (GRAPHICS_VER(engine->i915) >= 12)
 754		return 0x74;
 755	else if (GRAPHICS_VER(engine->i915) >= 9)
 756		return 0x68;
 757	else if (engine->class == RENDER_CLASS)
 758		return 0xd8;
 759	else
 760		return -1;
 761}
 762
 763static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 764{
 765	if (GRAPHICS_VER(engine->i915) >= 12)
 766		return 0x12;
 767	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 768		return 0x18;
 769	else
 770		return -1;
 771}
 772
 773static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 774{
 775	int x;
 776
 777	x = lrc_ring_wa_bb_per_ctx(engine);
 778	if (x < 0)
 779		return x;
 780
 781	return x + 2;
 782}
 783
 784static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 785{
 786	int x;
 787
 788	x = lrc_ring_indirect_ptr(engine);
 789	if (x < 0)
 790		return x;
 791
 792	return x + 2;
 793}
 794
 795static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 796{
 797
 798	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
 799		/*
 800		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
 801		 * simply to match the RCS context image layout.
 802		 */
 803		return 0xc6;
 804	else if (engine->class != RENDER_CLASS)
 805		return -1;
 806	else if (GRAPHICS_VER(engine->i915) >= 12)
 807		return 0xb6;
 808	else if (GRAPHICS_VER(engine->i915) >= 11)
 809		return 0xaa;
 810	else
 811		return -1;
 812}
 813
 814static u32
 815lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 816{
 817	if (GRAPHICS_VER(engine->i915) >= 12)
 818		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 819	else if (GRAPHICS_VER(engine->i915) >= 11)
 820		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 821	else if (GRAPHICS_VER(engine->i915) >= 9)
 822		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 823	else if (GRAPHICS_VER(engine->i915) >= 8)
 824		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 825
 826	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
 827
 828	return 0;
 829}
 830
 831static void
 832lrc_setup_indirect_ctx(u32 *regs,
 833		       const struct intel_engine_cs *engine,
 834		       u32 ctx_bb_ggtt_addr,
 835		       u32 size)
 836{
 837	GEM_BUG_ON(!size);
 838	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
 839	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
 840	regs[lrc_ring_indirect_ptr(engine) + 1] =
 841		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
 842
 843	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
 844	regs[lrc_ring_indirect_offset(engine) + 1] =
 845		lrc_ring_indirect_offset_default(engine) << 6;
 846}
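/*
 * For illustration only (the address and size are made up): with
 * CACHELINE_BYTES == 64, a 192-byte indirect context batch placed at GGTT
 * offset 0x10000 results in
 *
 *	regs[lrc_ring_indirect_ptr(engine) + 1]    = 0x10000 | (192 / 64) = 0x10003
 *	regs[lrc_ring_indirect_offset(engine) + 1] = <platform default> << 6
 *
 * i.e. the batch address and its size in cachelines share a single dword, and
 * the offset slot only carries the per-platform default shifted into bits 6+.
 */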
 847
 848static void init_common_regs(u32 * const regs,
 849			     const struct intel_context *ce,
 850			     const struct intel_engine_cs *engine,
 851			     bool inhibit)
 852{
 853	u32 ctl;
 854	int loc;
 855
 856	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
 857	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 858	if (inhibit)
 859		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
 860	if (GRAPHICS_VER(engine->i915) < 11)
 861		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
 862					   CTX_CTRL_RS_CTX_ENABLE);
 863	regs[CTX_CONTEXT_CONTROL] = ctl;
 864
 865	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
 866
 867	loc = lrc_ring_bb_offset(engine);
 868	if (loc != -1)
 869		regs[loc + 1] = 0;
 870}
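/*
 * The ctl value built in init_common_regs() above uses i915's masked-register
 * convention: the upper 16 bits of the written dword select which of the lower
 * 16 bits actually take effect. As a reminder (purely illustrative values):
 *
 *	_MASKED_BIT_ENABLE(BIT(3))  == 0x00080008   set bit 3
 *	_MASKED_BIT_DISABLE(BIT(3)) == 0x00080000   clear bit 3
 *
 * so enables and disables for several bits can be combined in a single dword
 * without disturbing the unselected bits of the register.
 */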
 871
 872static void init_wa_bb_regs(u32 * const regs,
 873			    const struct intel_engine_cs *engine)
 874{
 875	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
 876
 877	if (wa_ctx->per_ctx.size) {
 878		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
 879
 880		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
 881		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
 882			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
 883	}
 884
 885	if (wa_ctx->indirect_ctx.size) {
 886		lrc_setup_indirect_ctx(regs, engine,
 887				       i915_ggtt_offset(wa_ctx->vma) +
 888				       wa_ctx->indirect_ctx.offset,
 889				       wa_ctx->indirect_ctx.size);
 890	}
 891}
 892
 893static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
 894{
 895	if (i915_vm_is_4lvl(&ppgtt->vm)) {
 896		/* 64b PPGTT (48bit canonical)
 897		 * PDP0_DESCRIPTOR contains the base address to PML4 and
 898		 * other PDP Descriptors are ignored.
 899		 */
 900		ASSIGN_CTX_PML4(ppgtt, regs);
 901	} else {
 902		ASSIGN_CTX_PDP(ppgtt, regs, 3);
 903		ASSIGN_CTX_PDP(ppgtt, regs, 2);
 904		ASSIGN_CTX_PDP(ppgtt, regs, 1);
 905		ASSIGN_CTX_PDP(ppgtt, regs, 0);
 906	}
 907}
 908
 909static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
 910{
 911	if (i915_is_ggtt(vm))
 912		return i915_vm_to_ggtt(vm)->alias;
 913	else
 914		return i915_vm_to_ppgtt(vm);
 915}
 916
 917static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
 918{
 919	int x;
 920
 921	x = lrc_ring_mi_mode(engine);
 922	if (x != -1) {
 923		regs[x + 1] &= ~STOP_RING;
 924		regs[x + 1] |= STOP_RING << 16;
 925	}
 926}
 927
 928static void __lrc_init_regs(u32 *regs,
 929			    const struct intel_context *ce,
 930			    const struct intel_engine_cs *engine,
 931			    bool inhibit)
 932{
 933	/*
 934	 * A context is actually a big batch buffer with several
 935	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
 936	 * values we are setting here are only for the first context restore:
 937	 * on a subsequent save, the GPU will recreate this batchbuffer with new
 938	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
 939	 * we are not initializing here).
 940	 *
 941	 * Must keep consistent with virtual_update_register_offsets().
 942	 */
 943
 944	if (inhibit)
 945		memset(regs, 0, PAGE_SIZE);
 946
 947	set_offsets(regs, reg_offsets(engine), engine, inhibit);
 948
 949	init_common_regs(regs, ce, engine, inhibit);
 950	init_ppgtt_regs(regs, vm_alias(ce->vm));
 951
 952	init_wa_bb_regs(regs, engine);
 953
 954	__reset_stop_ring(regs, engine);
 955}
 956
 957void lrc_init_regs(const struct intel_context *ce,
 958		   const struct intel_engine_cs *engine,
 959		   bool inhibit)
 960{
 961	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
 962}
 963
 964void lrc_reset_regs(const struct intel_context *ce,
 965		    const struct intel_engine_cs *engine)
 966{
 967	__reset_stop_ring(ce->lrc_reg_state, engine);
 968}
 969
 970static void
 971set_redzone(void *vaddr, const struct intel_engine_cs *engine)
 972{
 973	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 974		return;
 975
 976	vaddr += engine->context_size;
 977
 978	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
 979}
 980
 981static void
 982check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
 983{
 984	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 985		return;
 986
 987	vaddr += engine->context_size;
 988
 989	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
 990		drm_err_once(&engine->i915->drm,
 991			     "%s context redzone overwritten!\n",
 992			     engine->name);
 993}
 994
 995static u32 context_wa_bb_offset(const struct intel_context *ce)
 996{
 997	return PAGE_SIZE * ce->wa_bb_page;
 998}
 999
1000static u32 *context_indirect_bb(const struct intel_context *ce)
1001{
1002	void *ptr;
1003
1004	GEM_BUG_ON(!ce->wa_bb_page);
1005
1006	ptr = ce->lrc_reg_state;
1007	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1008	ptr += context_wa_bb_offset(ce);
1009
1010	return ptr;
1011}
1012
1013void lrc_init_state(struct intel_context *ce,
1014		    struct intel_engine_cs *engine,
1015		    void *state)
1016{
1017	bool inhibit = true;
1018
1019	set_redzone(state, engine);
1020
1021	if (engine->default_state) {
1022		shmem_read(engine->default_state, 0,
1023			   state, engine->context_size);
1024		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
1025		inhibit = false;
1026	}
1027
1028	/* Clear the ppHWSP (inc. per-context counters) */
1029	memset(state, 0, PAGE_SIZE);
1030
1031	/* Clear the indirect wa and storage */
1032	if (ce->wa_bb_page)
1033		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1034
1035	/*
1036	 * The second page of the context object contains some registers which
1037	 * must be set up prior to the first execution.
1038	 */
1039	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1040}
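/*
 * A rough sketch of the resulting image layout, assuming the usual 4K pages
 * (exact offsets are platform dependent, see __lrc_alloc_state() below):
 *
 *	page 0                    per-process HWSP, cleared above
 *	LRC_STATE_OFFSET          register state, set up by __lrc_init_regs()
 *	ce->wa_bb_page * 4K       indirect wa batch + scratch, gen12+ only
 *	engine->context_size      redzone page, CONFIG_DRM_I915_DEBUG_GEM only
 */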
1041
1042u32 lrc_indirect_bb(const struct intel_context *ce)
1043{
1044	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1045}
1046
1047static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1048{
1049	/* If predication is active, this will be noop'ed */
1050	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1051	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1052	*cs++ = 0;
1053	*cs++ = 0; /* No predication */
1054
1055	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1056	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
1057	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1058
1059	/* Instructions are no longer predicated (disabled), we can proceed */
1060	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1061	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1062	*cs++ = 0;
1063	*cs++ = 1; /* enable predication before the next BB */
1064
1065	*cs++ = MI_BATCH_BUFFER_END;
1066	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1067
1068	return cs;
1069}
1070
1071static struct i915_vma *
1072__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1073{
1074	struct drm_i915_gem_object *obj;
1075	struct i915_vma *vma;
1076	u32 context_size;
1077
1078	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1079
1080	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1081		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1082
1083	if (GRAPHICS_VER(engine->i915) >= 12) {
1084		ce->wa_bb_page = context_size / PAGE_SIZE;
1085		context_size += PAGE_SIZE;
1086	}
1087
1088	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1089		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1090		context_size += PARENT_SCRATCH_SIZE;
1091	}
1092
1093	obj = i915_gem_object_create_lmem(engine->i915, context_size,
1094					  I915_BO_ALLOC_PM_VOLATILE);
1095	if (IS_ERR(obj))
1096		obj = i915_gem_object_create_shmem(engine->i915, context_size);
1097	if (IS_ERR(obj))
1098		return ERR_CAST(obj);
1099
1100	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1101	if (IS_ERR(vma)) {
1102		i915_gem_object_put(obj);
1103		return vma;
1104	}
1105
1106	return vma;
1107}
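/*
 * As a worked example of the sizing above (hypothetical numbers): a gen12
 * engine with a 20K engine->context_size, CONFIG_DRM_I915_DEBUG_GEM enabled
 * and a parent context under GuC submission ends up with
 *
 *	round_up(20K, 4K) = 20K    base context image
 *	+ 4K                       redzone
 *	+ 4K                       wa_bb page (ce->wa_bb_page = 6)
 *	+ PARENT_SCRATCH_SIZE      parallel submission scratch
 *
 * backed by lmem when available, with a shmem fallback.
 */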
1108
1109static struct intel_timeline *
1110pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1111{
1112	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1113
1114	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1115}
1116
1117int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1118{
1119	struct intel_ring *ring;
1120	struct i915_vma *vma;
1121	int err;
1122
1123	GEM_BUG_ON(ce->state);
1124
1125	vma = __lrc_alloc_state(ce, engine);
1126	if (IS_ERR(vma))
1127		return PTR_ERR(vma);
1128
1129	ring = intel_engine_create_ring(engine, ce->ring_size);
1130	if (IS_ERR(ring)) {
1131		err = PTR_ERR(ring);
1132		goto err_vma;
1133	}
1134
1135	if (!page_mask_bits(ce->timeline)) {
1136		struct intel_timeline *tl;
1137
1138		/*
1139		 * Use the static global HWSP for the kernel context, and
1140		 * a dynamically allocated cacheline for everyone else.
1141		 */
1142		if (unlikely(ce->timeline))
1143			tl = pinned_timeline(ce, engine);
1144		else
1145			tl = intel_timeline_create(engine->gt);
1146		if (IS_ERR(tl)) {
1147			err = PTR_ERR(tl);
1148			goto err_ring;
1149		}
1150
1151		ce->timeline = tl;
1152	}
1153
1154	ce->ring = ring;
1155	ce->state = vma;
1156
1157	return 0;
1158
1159err_ring:
1160	intel_ring_put(ring);
1161err_vma:
1162	i915_vma_put(vma);
1163	return err;
1164}
1165
1166void lrc_reset(struct intel_context *ce)
1167{
1168	GEM_BUG_ON(!intel_context_is_pinned(ce));
1169
1170	intel_ring_reset(ce->ring, ce->ring->emit);
1171
1172	/* Scrub away the garbage */
1173	lrc_init_regs(ce, ce->engine, true);
1174	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1175}
1176
1177int
1178lrc_pre_pin(struct intel_context *ce,
1179	    struct intel_engine_cs *engine,
1180	    struct i915_gem_ww_ctx *ww,
1181	    void **vaddr)
1182{
1183	GEM_BUG_ON(!ce->state);
1184	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1185
1186	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1187					 i915_coherent_map_type(ce->engine->i915,
1188								ce->state->obj,
1189								false) |
1190					 I915_MAP_OVERRIDE);
1191
1192	return PTR_ERR_OR_ZERO(*vaddr);
1193}
1194
1195int
1196lrc_pin(struct intel_context *ce,
1197	struct intel_engine_cs *engine,
1198	void *vaddr)
1199{
1200	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1201
1202	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1203		lrc_init_state(ce, engine, vaddr);
1204
1205	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1206	return 0;
1207}
1208
1209void lrc_unpin(struct intel_context *ce)
1210{
1211	if (unlikely(ce->parallel.last_rq)) {
1212		i915_request_put(ce->parallel.last_rq);
1213		ce->parallel.last_rq = NULL;
1214	}
1215	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1216		      ce->engine);
1217}
1218
1219void lrc_post_unpin(struct intel_context *ce)
1220{
1221	i915_gem_object_unpin_map(ce->state->obj);
1222}
1223
1224void lrc_fini(struct intel_context *ce)
1225{
1226	if (!ce->state)
1227		return;
1228
1229	intel_ring_put(fetch_and_zero(&ce->ring));
1230	i915_vma_put(fetch_and_zero(&ce->state));
1231}
1232
1233void lrc_destroy(struct kref *kref)
1234{
1235	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1236
1237	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1238	GEM_BUG_ON(intel_context_is_pinned(ce));
1239
1240	lrc_fini(ce);
1241
1242	intel_context_fini(ce);
1243	intel_context_free(ce);
1244}
1245
1246static u32 *
1247gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1248{
1249	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1250		MI_SRM_LRM_GLOBAL_GTT |
1251		MI_LRI_LRM_CS_MMIO;
1252	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1253	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1254		CTX_TIMESTAMP * sizeof(u32);
1255	*cs++ = 0;
1256
1257	*cs++ = MI_LOAD_REGISTER_REG |
1258		MI_LRR_SOURCE_CS_MMIO |
1259		MI_LRI_LRM_CS_MMIO;
1260	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1261	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1262
1263	*cs++ = MI_LOAD_REGISTER_REG |
1264		MI_LRR_SOURCE_CS_MMIO |
1265		MI_LRI_LRM_CS_MMIO;
1266	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1267	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1268
1269	return cs;
1270}
1271
1272static u32 *
1273gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1274{
1275	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1276
1277	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1278		MI_SRM_LRM_GLOBAL_GTT |
1279		MI_LRI_LRM_CS_MMIO;
1280	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1281	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1282		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1283	*cs++ = 0;
1284
1285	return cs;
1286}
1287
1288static u32 *
1289gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1290{
1291	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1292
1293	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1294		MI_SRM_LRM_GLOBAL_GTT |
1295		MI_LRI_LRM_CS_MMIO;
1296	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1297	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1298		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1299	*cs++ = 0;
1300
1301	*cs++ = MI_LOAD_REGISTER_REG |
1302		MI_LRR_SOURCE_CS_MMIO |
1303		MI_LRI_LRM_CS_MMIO;
1304	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1305	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1306
1307	return cs;
1308}
1309
1310/*
 1311 * On DG2, a hang can be detected during context restore of a preempted
 1312 * context in GPGPU mode. This is extremely timing dependent. To work
 1313 * around it, the SW workaround batch below is applied on DG2 A steppings.
1314 */
1315static u32 *
1316dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
1317{
1318	*cs++ = MI_LOAD_REGISTER_IMM(1);
1319	*cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
1320	*cs++ = 0x21;
1321
1322	*cs++ = MI_LOAD_REGISTER_REG;
1323	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1324	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);
1325
1326	*cs++ = MI_LOAD_REGISTER_REG;
1327	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1328	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);
1329
1330	return cs;
1331}
1332
1333/*
1334 * The bspec's tuning guide asks us to program a vertical watermark value of
1335 * 0x3FF.  However this register is not saved/restored properly by the
1336 * hardware, so we're required to apply the desired value via INDIRECT_CTX
1337 * batch buffer to ensure the value takes effect properly.  All other bits
1338 * in this register should remain at 0 (the hardware default).
1339 */
1340static u32 *
1341dg2_emit_draw_watermark_setting(u32 *cs)
1342{
1343	*cs++ = MI_LOAD_REGISTER_IMM(1);
1344	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1345	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1346
1347	return cs;
1348}
1349
1350static u32 *
1351gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1352{
1353	cs = gen12_emit_timestamp_wa(ce, cs);
1354	cs = gen12_emit_cmd_buf_wa(ce, cs);
1355	cs = gen12_emit_restore_scratch(ce, cs);
1356
1357	/* Wa_22011450934:dg2 */
1358	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
1359	    IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
1360		cs = dg2_emit_rcs_hang_wabb(ce, cs);
1361
1362	/* Wa_16013000631:dg2 */
1363	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1364	    IS_DG2_G11(ce->engine->i915))
1365		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1366
1367	/* hsdes: 1809175790 */
1368	if (!HAS_FLAT_CCS(ce->engine->i915))
1369		cs = gen12_emit_aux_table_inv(ce->engine->gt,
1370					      cs, GEN12_GFX_CCS_AUX_NV);
1371
1372	/* Wa_16014892111 */
1373	if (IS_DG2(ce->engine->i915))
1374		cs = dg2_emit_draw_watermark_setting(cs);
1375
1376	return cs;
1377}
1378
1379static u32 *
1380gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1381{
1382	cs = gen12_emit_timestamp_wa(ce, cs);
1383	cs = gen12_emit_restore_scratch(ce, cs);
1384
1385	/* Wa_16013000631:dg2 */
1386	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1387	    IS_DG2_G11(ce->engine->i915))
1388		if (ce->engine->class == COMPUTE_CLASS)
1389			cs = gen8_emit_pipe_control(cs,
1390						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1391						    0);
1392
1393	/* hsdes: 1809175790 */
1394	if (!HAS_FLAT_CCS(ce->engine->i915)) {
1395		if (ce->engine->class == VIDEO_DECODE_CLASS)
1396			cs = gen12_emit_aux_table_inv(ce->engine->gt,
1397						      cs, GEN12_VD0_AUX_NV);
1398		else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
1399			cs = gen12_emit_aux_table_inv(ce->engine->gt,
1400						      cs, GEN12_VE0_AUX_NV);
1401	}
1402
1403	return cs;
1404}
1405
1406static void
1407setup_indirect_ctx_bb(const struct intel_context *ce,
1408		      const struct intel_engine_cs *engine,
1409		      u32 *(*emit)(const struct intel_context *, u32 *))
1410{
1411	u32 * const start = context_indirect_bb(ce);
1412	u32 *cs;
1413
1414	cs = emit(ce, start);
1415	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1416	while ((unsigned long)cs % CACHELINE_BYTES)
1417		*cs++ = MI_NOOP;
1418
1419	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1420	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1421
1422	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1423			       lrc_indirect_bb(ce),
1424			       (cs - start) * sizeof(*cs));
1425}
1426
1427/*
1428 * The context descriptor encodes various attributes of a context,
1429 * including its GTT address and some flags. Because it's fairly
1430 * expensive to calculate, we'll just do it once and cache the result,
1431 * which remains valid until the context is unpinned.
1432 *
1433 * This is what a descriptor looks like, from LSB to MSB::
1434 *
1435 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1436 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1437 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1438 *      bits 53-54:    mbz, reserved for use by hardware
1439 *      bits 55-63:    group ID, currently unused and set to 0
1440 *
1441 * Starting from Gen11, the upper dword of the descriptor has a new format:
1442 *
1443 *      bits 32-36:    reserved
1444 *      bits 37-47:    SW context ID
 1445 *      bits 48-53:    engine instance
1446 *      bit 54:        mbz, reserved for use by hardware
1447 *      bits 55-60:    SW counter
1448 *      bits 61-63:    engine class
1449 *
1450 * On Xe_HP, the upper dword of the descriptor has a new format:
1451 *
1452 *      bits 32-37:    virtual function number
1453 *      bit 38:        mbz, reserved for use by hardware
1454 *      bits 39-54:    SW context ID
1455 *      bits 55-57:    reserved
1456 *      bits 58-63:    SW counter
1457 *
1458 * engine info, SW context ID and SW counter need to form a unique number
1459 * (Context ID) per lrc.
1460 */
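/*
 * As a purely illustrative example of the lower dword: a legacy 64b context
 * whose state object sits at GGTT offset 0x00a01000 yields
 * 0x00a01000 | <GEN8_CTX_* flags>, since the state is page aligned, the LRCA
 * occupies bits 12-31 and the flags stay within bits 0-11. The upper dword
 * (SW context ID, engine instance, etc.) is filled in by the submission
 * backend; lrc_descriptor() below only computes the lower 32 bits.
 */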
1461static u32 lrc_descriptor(const struct intel_context *ce)
1462{
1463	u32 desc;
1464
1465	desc = INTEL_LEGACY_32B_CONTEXT;
1466	if (i915_vm_is_4lvl(ce->vm))
1467		desc = INTEL_LEGACY_64B_CONTEXT;
1468	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1469
1470	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1471	if (GRAPHICS_VER(ce->vm->i915) == 8)
1472		desc |= GEN8_CTX_L3LLC_COHERENT;
1473
1474	return i915_ggtt_offset(ce->state) | desc;
1475}
1476
1477u32 lrc_update_regs(const struct intel_context *ce,
1478		    const struct intel_engine_cs *engine,
1479		    u32 head)
1480{
1481	struct intel_ring *ring = ce->ring;
1482	u32 *regs = ce->lrc_reg_state;
1483
1484	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1485	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1486
1487	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1488	regs[CTX_RING_HEAD] = head;
1489	regs[CTX_RING_TAIL] = ring->tail;
1490	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1491
1492	/* RPCS */
1493	if (engine->class == RENDER_CLASS) {
1494		regs[CTX_R_PWR_CLK_STATE] =
1495			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1496
1497		i915_oa_init_reg_state(ce, engine);
1498	}
1499
1500	if (ce->wa_bb_page) {
1501		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1502
1503		fn = gen12_emit_indirect_ctx_xcs;
1504		if (ce->engine->class == RENDER_CLASS)
1505			fn = gen12_emit_indirect_ctx_rcs;
1506
 1507		/* Mutually exclusive with the global indirect bb */
1508		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1509		setup_indirect_ctx_bb(ce, engine, fn);
1510	}
1511
1512	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1513}
1514
1515void lrc_update_offsets(struct intel_context *ce,
1516			struct intel_engine_cs *engine)
1517{
1518	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1519}
1520
1521void lrc_check_regs(const struct intel_context *ce,
1522		    const struct intel_engine_cs *engine,
1523		    const char *when)
1524{
1525	const struct intel_ring *ring = ce->ring;
1526	u32 *regs = ce->lrc_reg_state;
1527	bool valid = true;
1528	int x;
1529
1530	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1531		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1532		       engine->name,
1533		       regs[CTX_RING_START],
1534		       i915_ggtt_offset(ring->vma));
1535		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1536		valid = false;
1537	}
1538
1539	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1540	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1541		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1542		       engine->name,
1543		       regs[CTX_RING_CTL],
1544		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1545		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1546		valid = false;
1547	}
1548
1549	x = lrc_ring_mi_mode(engine);
1550	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1551		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1552		       engine->name, regs[x + 1]);
1553		regs[x + 1] &= ~STOP_RING;
1554		regs[x + 1] |= STOP_RING << 16;
1555		valid = false;
1556	}
1557
1558	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1559}
1560
1561/*
1562 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
 1563 * the PIPE_CONTROL instruction. This is required for the flush to happen correctly,
 1564 * but there is a slight complication as this is applied in a WA batch where the
 1565 * values are only initialized once, so we cannot read the register value at the
 1566 * beginning and reuse it further; hence we save its value to memory, upload a
 1567 * constant value with bit21 set and then restore it back with the saved value.
 1568 * To simplify the WA, a constant value is formed by using the default value
 1569 * of this register. This shouldn't be a problem because we are only modifying
 1570 * it for a short period and this batch is non-preemptible. We could of course
 1571 * use additional instructions that read the actual value of the register
 1572 * at that time and set our bit of interest, but that makes the WA complicated.
1573 *
1574 * This WA is also required for Gen9 so extracting as a function avoids
1575 * code duplication.
1576 */
1577static u32 *
1578gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1579{
1580	/* NB no one else is allowed to scribble over scratch + 256! */
1581	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1582	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1583	*batch++ = intel_gt_scratch_offset(engine->gt,
1584					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1585	*batch++ = 0;
1586
1587	*batch++ = MI_LOAD_REGISTER_IMM(1);
1588	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1589	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1590
1591	batch = gen8_emit_pipe_control(batch,
1592				       PIPE_CONTROL_CS_STALL |
1593				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1594				       0);
1595
1596	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1597	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1598	*batch++ = intel_gt_scratch_offset(engine->gt,
1599					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1600	*batch++ = 0;
1601
1602	return batch;
1603}
1604
1605/*
 1606 * Typically we only have one indirect_ctx and one per_ctx batch buffer which are
 1607 * initialized at the beginning and shared across all contexts, but this field
 1608 * helps us to have multiple batches at different offsets and select them based
 1609 * on some criteria. At the moment this batch always starts at the beginning of
 1610 * the page and we don't have multiple wa_ctx batch buffers.
 1611 *
 1612 * The number of WAs applied is not known at the beginning; we use this field
 1613 * to return the number of DWORDs written.
 1614 *
 1615 * Note that this batch does not contain MI_BATCH_BUFFER_END,
 1616 * so it adds NOOPs as padding to make it cacheline aligned.
 1617 * MI_BATCH_BUFFER_END will be added to the per_ctx batch and the two together
 1618 * make a complete batch buffer.
1619 */
1620static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1621{
1622	/* WaDisableCtxRestoreArbitration:bdw,chv */
1623	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1624
1625	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1626	if (IS_BROADWELL(engine->i915))
1627		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1628
1629	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1630	/* Actual scratch location is at 128 bytes offset */
1631	batch = gen8_emit_pipe_control(batch,
1632				       PIPE_CONTROL_FLUSH_L3 |
1633				       PIPE_CONTROL_STORE_DATA_INDEX |
1634				       PIPE_CONTROL_CS_STALL |
1635				       PIPE_CONTROL_QW_WRITE,
1636				       LRC_PPHWSP_SCRATCH_ADDR);
1637
1638	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1639
1640	/* Pad to end of cacheline */
1641	while ((unsigned long)batch % CACHELINE_BYTES)
1642		*batch++ = MI_NOOP;
1643
1644	/*
1645	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1646	 * execution depends on the length specified in terms of cache lines
1647	 * in the register CTX_RCS_INDIRECT_CTX
1648	 */
1649
1650	return batch;
1651}
1652
1653struct lri {
1654	i915_reg_t reg;
1655	u32 value;
1656};
1657
1658static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1659{
1660	GEM_BUG_ON(!count || count > 63);
1661
1662	*batch++ = MI_LOAD_REGISTER_IMM(count);
1663	do {
1664		*batch++ = i915_mmio_reg_offset(lri->reg);
1665		*batch++ = lri->value;
1666	} while (lri++, --count);
1667	*batch++ = MI_NOOP;
1668
1669	return batch;
1670}
1671
1672static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1673{
1674	static const struct lri lri[] = {
1675		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1676		{
1677			COMMON_SLICE_CHICKEN2,
1678			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1679				       0),
1680		},
1681
1682		/* BSpec: 11391 */
1683		{
1684			FF_SLICE_CHICKEN,
1685			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1686				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1687		},
1688
1689		/* BSpec: 11299 */
1690		{
1691			_3D_CHICKEN3,
1692			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1693				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1694		}
1695	};
1696
1697	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1698
1699	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1700	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1701
1702	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1703	batch = gen8_emit_pipe_control(batch,
1704				       PIPE_CONTROL_FLUSH_L3 |
1705				       PIPE_CONTROL_STORE_DATA_INDEX |
1706				       PIPE_CONTROL_CS_STALL |
1707				       PIPE_CONTROL_QW_WRITE,
1708				       LRC_PPHWSP_SCRATCH_ADDR);
1709
1710	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1711
1712	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1713	if (HAS_POOLED_EU(engine->i915)) {
1714		/*
 1715		 * EU pool configuration is set up along with the golden context
 1716		 * during context initialization. This value depends on the
 1717		 * device type (2x6 or 3x6) and needs to be updated based
 1718		 * on which subslice is disabled, especially for 2x6
 1719		 * devices. However, it is safe to load the default
 1720		 * configuration of a 3x6 device instead of masking off the
 1721		 * corresponding bits because the HW ignores bits of a disabled
 1722		 * subslice and drops down to the appropriate config. Please
 1723		 * see render_state_setup() in i915_gem_render_state.c for
 1724		 * possible configurations; to avoid duplication they are
 1725		 * not shown here again.
1726		 */
1727		*batch++ = GEN9_MEDIA_POOL_STATE;
1728		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1729		*batch++ = 0x00777000;
1730		*batch++ = 0;
1731		*batch++ = 0;
1732		*batch++ = 0;
1733	}
1734
1735	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1736
1737	/* Pad to end of cacheline */
1738	while ((unsigned long)batch % CACHELINE_BYTES)
1739		*batch++ = MI_NOOP;
1740
1741	return batch;
1742}
1743
1744#define CTX_WA_BB_SIZE (PAGE_SIZE)
1745
1746static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1747{
1748	struct drm_i915_gem_object *obj;
1749	struct i915_vma *vma;
1750	int err;
1751
1752	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1753	if (IS_ERR(obj))
1754		return PTR_ERR(obj);
1755
1756	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1757	if (IS_ERR(vma)) {
1758		err = PTR_ERR(vma);
1759		goto err;
1760	}
1761
1762	engine->wa_ctx.vma = vma;
1763	return 0;
1764
1765err:
1766	i915_gem_object_put(obj);
1767	return err;
1768}
1769
1770void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1771{
1772	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1773}
1774
1775typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1776
1777void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1778{
1779	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1780	struct i915_wa_ctx_bb *wa_bb[] = {
1781		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1782	};
1783	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1784	struct i915_gem_ww_ctx ww;
1785	void *batch, *batch_ptr;
1786	unsigned int i;
1787	int err;
1788
1789	if (GRAPHICS_VER(engine->i915) >= 11 ||
1790	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1791		return;
1792
1793	if (GRAPHICS_VER(engine->i915) == 9) {
1794		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1795		wa_bb_fn[1] = NULL;
1796	} else if (GRAPHICS_VER(engine->i915) == 8) {
1797		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1798		wa_bb_fn[1] = NULL;
1799	}
1800
1801	err = lrc_create_wa_ctx(engine);
1802	if (err) {
1803		/*
 1804		 * We continue even if we fail to initialize the WA batch
 1805		 * because we only expect rare glitches and nothing
 1806		 * critical enough to prevent us from using the GPU.
1807		 */
1808		drm_err(&engine->i915->drm,
1809			"Ignoring context switch w/a allocation error:%d\n",
1810			err);
1811		return;
1812	}
1813
1814	if (!engine->wa_ctx.vma)
1815		return;
1816
1817	i915_gem_ww_ctx_init(&ww, true);
1818retry:
1819	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1820	if (!err)
1821		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1822	if (err)
1823		goto err;
1824
1825	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1826	if (IS_ERR(batch)) {
1827		err = PTR_ERR(batch);
1828		goto err_unpin;
1829	}
1830
1831	/*
1832	 * Emit the two workaround batch buffers, recording the offset from the
1833	 * start of the workaround batch buffer object for each and their
1834	 * respective sizes.
1835	 */
1836	batch_ptr = batch;
1837	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1838		wa_bb[i]->offset = batch_ptr - batch;
1839		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1840						  CACHELINE_BYTES))) {
1841			err = -EINVAL;
1842			break;
1843		}
1844		if (wa_bb_fn[i])
1845			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1846		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1847	}
1848	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1849
1850	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1851	__i915_gem_object_release_map(wa_ctx->vma->obj);
1852
1853	/* Verify that we can handle failure to setup the wa_ctx */
1854	if (!err)
1855		err = i915_inject_probe_error(engine->i915, -ENODEV);
1856
1857err_unpin:
1858	if (err)
1859		i915_vma_unpin(wa_ctx->vma);
1860err:
1861	if (err == -EDEADLK) {
1862		err = i915_gem_ww_ctx_backoff(&ww);
1863		if (!err)
1864			goto retry;
1865	}
1866	i915_gem_ww_ctx_fini(&ww);
1867
1868	if (err) {
1869		i915_vma_put(engine->wa_ctx.vma);
1870
1871		/* Clear all flags to prevent further use */
1872		memset(wa_ctx, 0, sizeof(*wa_ctx));
1873	}
1874}
1875
1876static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1877{
1878#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1879	stats->runtime.num_underflow++;
1880	stats->runtime.max_underflow =
1881		max_t(u32, stats->runtime.max_underflow, -dt);
1882#endif
1883}
1884
1885static u32 lrc_get_runtime(const struct intel_context *ce)
1886{
1887	/*
1888	 * We can use either ppHWSP[16] which is recorded before the context
1889	 * switch (and so excludes the cost of context switches) or use the
1890	 * value from the context image itself, which is saved/restored earlier
1891	 * and so includes the cost of the save.
1892	 */
1893	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1894}
1895
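/*
 * The saved CTX_TIMESTAMP is a 32-bit counter, so the delta below is computed
 * with wrapping unsigned arithmetic and then interpreted as signed. For
 * example (made-up values): old = 0xfffffff0 and new = 0x00000010 gives
 * dt = 0x20, i.e. 32 ticks despite the wrap; a genuinely negative dt therefore
 * means the value in the context image went backwards (see
 * st_runtime_underflow() above).
 */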
1896void lrc_update_runtime(struct intel_context *ce)
1897{
1898	struct intel_context_stats *stats = &ce->stats;
1899	u32 old;
1900	s32 dt;
1901
1902	old = stats->runtime.last;
1903	stats->runtime.last = lrc_get_runtime(ce);
1904	dt = stats->runtime.last - old;
1905	if (!dt)
1906		return;
1907
1908	if (unlikely(dt < 0)) {
1909		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1910			 old, stats->runtime.last, dt);
1911		st_runtime_underflow(stats, dt);
1912		return;
1913	}
1914
1915	ewma_runtime_add(&stats->runtime.avg, dt);
1916	stats->runtime.total += dt;
1917}
1918
1919#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1920#include "selftest_lrc.c"
1921#endif