   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2021 Intel Corporation
   4 */
   5
   6#include "xe_lrc.h"
   7
   8#include "instructions/xe_mi_commands.h"
   9#include "instructions/xe_gfxpipe_commands.h"
  10#include "regs/xe_engine_regs.h"
  11#include "regs/xe_gpu_commands.h"
  12#include "regs/xe_lrc_layout.h"
  13#include "xe_bb.h"
  14#include "xe_bo.h"
  15#include "xe_device.h"
  16#include "xe_drm_client.h"
  17#include "xe_exec_queue_types.h"
  18#include "xe_gt.h"
  19#include "xe_gt_printk.h"
  20#include "xe_hw_fence.h"
  21#include "xe_map.h"
  22#include "xe_vm.h"
  23
  24#define LRC_VALID				(1 << 0)
  25#define LRC_PRIVILEGE				(1 << 8)
  26#define LRC_ADDRESSING_MODE_SHIFT		3
  27#define LRC_LEGACY_64B_CONTEXT			3
  28
  29#define ENGINE_CLASS_SHIFT			61
  30#define ENGINE_INSTANCE_SHIFT			48
  31
  32static struct xe_device *
  33lrc_to_xe(struct xe_lrc *lrc)
  34{
  35	return gt_to_xe(lrc->fence_ctx.gt);
  36}
  37
  38size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
  39{
  40	switch (class) {
  41	case XE_ENGINE_CLASS_RENDER:
  42		if (GRAPHICS_VER(xe) >= 20)
  43			return 4 * SZ_4K;
  44		else
  45			return 14 * SZ_4K;
  46	case XE_ENGINE_CLASS_COMPUTE:
  47		/* 14 pages since graphics_ver == 11 */
  48		if (GRAPHICS_VER(xe) >= 20)
  49			return 3 * SZ_4K;
  50		else
  51			return 14 * SZ_4K;
  52	default:
  53		WARN(1, "Unknown engine class: %d", class);
  54		fallthrough;
  55	case XE_ENGINE_CLASS_COPY:
  56	case XE_ENGINE_CLASS_VIDEO_DECODE:
  57	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
  58	case XE_ENGINE_CLASS_OTHER:
  59		return 2 * SZ_4K;
  60	}
  61}
  62
  63/*
   64 * The per-platform tables are u8-encoded in @data. Decode @data and write the
   65 * resulting commands and register offsets into @regs. The following encoding is
   66 * used for each byte; decoding happens in two steps: commands, then addresses.
  67 *
  68 * Commands:
  69 * [7]: create NOPs - number of NOPs are set in lower bits
  70 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
  71 *      MI_LRI_FORCE_POSTED
  72 * [5:0]: Number of NOPs or registers to set values to in case of
  73 *        MI_LOAD_REGISTER_IMM
  74 *
  75 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
  76 * number of registers. They are set by using the REG/REG16 macros: the former
  77 * is used for offsets smaller than 0x200 while the latter is for values bigger
  78 * than that. Those macros already set all the bits documented below correctly:
  79 *
   80 * [7]: Set when the register offset needs more than 7 bits; the following
   81 *      byte(s) then carry the lower bits
   82 * [6:0]: Register offset (in dwords), without considering the engine base.
  83 *
  84 * This function only tweaks the commands and register offsets. Values are not
  85 * filled out.
  86 */
  87static void set_offsets(u32 *regs,
  88			const u8 *data,
  89			const struct xe_hw_engine *hwe)
  90#define NOP(x) (BIT(7) | (x))
  91#define LRI(count, flags) ((flags) << 6 | (count) | \
  92			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
  93#define POSTED BIT(0)
  94#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
  95#define REG16(x) \
  96	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
  97	(((x) >> 2) & 0x7f)
  98#define END 0
  99{
 100	const u32 base = hwe->mmio_base;
 101
 102	while (*data) {
 103		u8 count, flags;
 104
 105		if (*data & BIT(7)) { /* skip */
 106			count = *data++ & ~BIT(7);
 107			regs += count;
 108			continue;
 109		}
 110
 111		count = *data & 0x3f;
 112		flags = *data >> 6;
 113		data++;
 114
 115		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
 116		if (flags & POSTED)
 117			*regs |= MI_LRI_FORCE_POSTED;
 118		*regs |= MI_LRI_LRM_CS_MMIO;
 119		regs++;
 120
 121		xe_gt_assert(hwe->gt, count);
 122		do {
 123			u32 offset = 0;
 124			u8 v;
 125
 126			do {
 127				v = *data++;
 128				offset <<= 7;
 129				offset |= v & ~BIT(7);
 130			} while (v & BIT(7));
 131
 132			regs[0] = base + (offset << 2);
 133			regs += 2;
 134		} while (--count);
 135	}
 136
 137	*regs = MI_BATCH_BUFFER_END | BIT(0);
 138}
 139
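/*
 * Worked example (for illustration only): in gen12_xcs_offsets below,
 * NOP(1) tells set_offsets() to skip one dword, and LRI(13, POSTED)
 * makes it emit
 *
 *	MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(13) |
 *	MI_LRI_FORCE_POSTED | MI_LRI_LRM_CS_MMIO
 *
 * followed by 13 offset/value pairs.  The first entry, REG16(0x244),
 * decodes to hwe->mmio_base + 0x244 in the offset slot; the value dword
 * is left untouched, as noted above.
 */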
 140static const u8 gen12_xcs_offsets[] = {
 141	NOP(1),
 142	LRI(13, POSTED),
 143	REG16(0x244),
 144	REG(0x034),
 145	REG(0x030),
 146	REG(0x038),
 147	REG(0x03c),
 148	REG(0x168),
 149	REG(0x140),
 150	REG(0x110),
 151	REG(0x1c0),
 152	REG(0x1c4),
 153	REG(0x1c8),
 154	REG(0x180),
 155	REG16(0x2b4),
 156
 157	NOP(5),
 158	LRI(9, POSTED),
 159	REG16(0x3a8),
 160	REG16(0x28c),
 161	REG16(0x288),
 162	REG16(0x284),
 163	REG16(0x280),
 164	REG16(0x27c),
 165	REG16(0x278),
 166	REG16(0x274),
 167	REG16(0x270),
 168
 169	END
 170};
 171
 172static const u8 dg2_xcs_offsets[] = {
 173	NOP(1),
 174	LRI(15, POSTED),
 175	REG16(0x244),
 176	REG(0x034),
 177	REG(0x030),
 178	REG(0x038),
 179	REG(0x03c),
 180	REG(0x168),
 181	REG(0x140),
 182	REG(0x110),
 183	REG(0x1c0),
 184	REG(0x1c4),
 185	REG(0x1c8),
 186	REG(0x180),
 187	REG16(0x2b4),
 188	REG(0x120),
 189	REG(0x124),
 190
 191	NOP(1),
 192	LRI(9, POSTED),
 193	REG16(0x3a8),
 194	REG16(0x28c),
 195	REG16(0x288),
 196	REG16(0x284),
 197	REG16(0x280),
 198	REG16(0x27c),
 199	REG16(0x278),
 200	REG16(0x274),
 201	REG16(0x270),
 202
 203	END
 204};
 205
 206static const u8 gen12_rcs_offsets[] = {
 207	NOP(1),
 208	LRI(13, POSTED),
 209	REG16(0x244),
 210	REG(0x034),
 211	REG(0x030),
 212	REG(0x038),
 213	REG(0x03c),
 214	REG(0x168),
 215	REG(0x140),
 216	REG(0x110),
 217	REG(0x1c0),
 218	REG(0x1c4),
 219	REG(0x1c8),
 220	REG(0x180),
 221	REG16(0x2b4),
 222
 223	NOP(5),
 224	LRI(9, POSTED),
 225	REG16(0x3a8),
 226	REG16(0x28c),
 227	REG16(0x288),
 228	REG16(0x284),
 229	REG16(0x280),
 230	REG16(0x27c),
 231	REG16(0x278),
 232	REG16(0x274),
 233	REG16(0x270),
 234
 235	LRI(3, POSTED),
 236	REG(0x1b0),
 237	REG16(0x5a8),
 238	REG16(0x5ac),
 239
 240	NOP(6),
 241	LRI(1, 0),
 242	REG(0x0c8),
 243	NOP(3 + 9 + 1),
 244
 245	LRI(51, POSTED),
 246	REG16(0x588),
 247	REG16(0x588),
 248	REG16(0x588),
 249	REG16(0x588),
 250	REG16(0x588),
 251	REG16(0x588),
 252	REG(0x028),
 253	REG(0x09c),
 254	REG(0x0c0),
 255	REG(0x178),
 256	REG(0x17c),
 257	REG16(0x358),
 258	REG(0x170),
 259	REG(0x150),
 260	REG(0x154),
 261	REG(0x158),
 262	REG16(0x41c),
 263	REG16(0x600),
 264	REG16(0x604),
 265	REG16(0x608),
 266	REG16(0x60c),
 267	REG16(0x610),
 268	REG16(0x614),
 269	REG16(0x618),
 270	REG16(0x61c),
 271	REG16(0x620),
 272	REG16(0x624),
 273	REG16(0x628),
 274	REG16(0x62c),
 275	REG16(0x630),
 276	REG16(0x634),
 277	REG16(0x638),
 278	REG16(0x63c),
 279	REG16(0x640),
 280	REG16(0x644),
 281	REG16(0x648),
 282	REG16(0x64c),
 283	REG16(0x650),
 284	REG16(0x654),
 285	REG16(0x658),
 286	REG16(0x65c),
 287	REG16(0x660),
 288	REG16(0x664),
 289	REG16(0x668),
 290	REG16(0x66c),
 291	REG16(0x670),
 292	REG16(0x674),
 293	REG16(0x678),
 294	REG16(0x67c),
 295	REG(0x068),
 296	REG(0x084),
 297	NOP(1),
 298
 299	END
 300};
 301
 302static const u8 xehp_rcs_offsets[] = {
 303	NOP(1),
 304	LRI(13, POSTED),
 305	REG16(0x244),
 306	REG(0x034),
 307	REG(0x030),
 308	REG(0x038),
 309	REG(0x03c),
 310	REG(0x168),
 311	REG(0x140),
 312	REG(0x110),
 313	REG(0x1c0),
 314	REG(0x1c4),
 315	REG(0x1c8),
 316	REG(0x180),
 317	REG16(0x2b4),
 318
 319	NOP(5),
 320	LRI(9, POSTED),
 321	REG16(0x3a8),
 322	REG16(0x28c),
 323	REG16(0x288),
 324	REG16(0x284),
 325	REG16(0x280),
 326	REG16(0x27c),
 327	REG16(0x278),
 328	REG16(0x274),
 329	REG16(0x270),
 330
 331	LRI(3, POSTED),
 332	REG(0x1b0),
 333	REG16(0x5a8),
 334	REG16(0x5ac),
 335
 336	NOP(6),
 337	LRI(1, 0),
 338	REG(0x0c8),
 339
 340	END
 341};
 342
 343static const u8 dg2_rcs_offsets[] = {
 344	NOP(1),
 345	LRI(15, POSTED),
 346	REG16(0x244),
 347	REG(0x034),
 348	REG(0x030),
 349	REG(0x038),
 350	REG(0x03c),
 351	REG(0x168),
 352	REG(0x140),
 353	REG(0x110),
 354	REG(0x1c0),
 355	REG(0x1c4),
 356	REG(0x1c8),
 357	REG(0x180),
 358	REG16(0x2b4),
 359	REG(0x120),
 360	REG(0x124),
 361
 362	NOP(1),
 363	LRI(9, POSTED),
 364	REG16(0x3a8),
 365	REG16(0x28c),
 366	REG16(0x288),
 367	REG16(0x284),
 368	REG16(0x280),
 369	REG16(0x27c),
 370	REG16(0x278),
 371	REG16(0x274),
 372	REG16(0x270),
 373
 374	LRI(3, POSTED),
 375	REG(0x1b0),
 376	REG16(0x5a8),
 377	REG16(0x5ac),
 378
 379	NOP(6),
 380	LRI(1, 0),
 381	REG(0x0c8),
 382
 383	END
 384};
 385
 386static const u8 mtl_rcs_offsets[] = {
 387	NOP(1),
 388	LRI(15, POSTED),
 389	REG16(0x244),
 390	REG(0x034),
 391	REG(0x030),
 392	REG(0x038),
 393	REG(0x03c),
 394	REG(0x168),
 395	REG(0x140),
 396	REG(0x110),
 397	REG(0x1c0),
 398	REG(0x1c4),
 399	REG(0x1c8),
 400	REG(0x180),
 401	REG16(0x2b4),
 402	REG(0x120),
 403	REG(0x124),
 404
 405	NOP(1),
 406	LRI(9, POSTED),
 407	REG16(0x3a8),
 408	REG16(0x28c),
 409	REG16(0x288),
 410	REG16(0x284),
 411	REG16(0x280),
 412	REG16(0x27c),
 413	REG16(0x278),
 414	REG16(0x274),
 415	REG16(0x270),
 416
 417	NOP(2),
 418	LRI(2, POSTED),
 419	REG16(0x5a8),
 420	REG16(0x5ac),
 421
 422	NOP(6),
 423	LRI(1, 0),
 424	REG(0x0c8),
 425
 426	END
 427};
 428
 429#define XE2_CTX_COMMON \
 430	NOP(1),                 /* [0x00] */ \
 431	LRI(15, POSTED),        /* [0x01] */ \
 432	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
 433	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
 434	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
 435	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
 436	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
 437	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
 438	REG(0x140),             /* [0x0e] BB_ADDR */ \
 439	REG(0x110),             /* [0x10] BB_STATE */ \
 440	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
 441	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
 442	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
 443	REG(0x180),             /* [0x18] CCID */ \
 444	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
 445	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
 446	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
 447	\
 448	NOP(1),                 /* [0x20] */ \
 449	LRI(9, POSTED),         /* [0x21] */ \
 450	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
 451	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
 452	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
 453	REG16(0x284),           /* [0x28] dummy reg */ \
 454	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
 455	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
 456	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
 457	REG16(0x274),           /* [0x30] PTBP_UDW */ \
 458	REG16(0x270)            /* [0x32] PTBP_LDW */
 459
 460static const u8 xe2_rcs_offsets[] = {
 461	XE2_CTX_COMMON,
 462
 463	NOP(2),                 /* [0x34] */
 464	LRI(2, POSTED),         /* [0x36] */
 465	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
 466	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */
 467
 468	NOP(6),                 /* [0x41] */
 469	LRI(1, 0),              /* [0x47] */
 470	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */
 471
 472	END
 473};
 474
 475static const u8 xe2_bcs_offsets[] = {
 476	XE2_CTX_COMMON,
 477
 478	NOP(4 + 8 + 1),         /* [0x34] */
 479	LRI(2, POSTED),         /* [0x41] */
 480	REG16(0x200),           /* [0x42] BCS_SWCTRL */
 481	REG16(0x204),           /* [0x44] BLIT_CCTL */
 482
 483	END
 484};
 485
 486static const u8 xe2_xcs_offsets[] = {
 487	XE2_CTX_COMMON,
 488
 489	END
 490};
 491
 492#undef END
 493#undef REG16
 494#undef REG
 495#undef LRI
 496#undef NOP
 497
 498static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
 499{
 500	if (class == XE_ENGINE_CLASS_RENDER) {
 501		if (GRAPHICS_VER(xe) >= 20)
 502			return xe2_rcs_offsets;
 503		else if (GRAPHICS_VERx100(xe) >= 1270)
 504			return mtl_rcs_offsets;
 505		else if (GRAPHICS_VERx100(xe) >= 1255)
 506			return dg2_rcs_offsets;
 507		else if (GRAPHICS_VERx100(xe) >= 1250)
 508			return xehp_rcs_offsets;
 509		else
 510			return gen12_rcs_offsets;
 511	} else if (class == XE_ENGINE_CLASS_COPY) {
 512		if (GRAPHICS_VER(xe) >= 20)
 513			return xe2_bcs_offsets;
 514		else
 515			return gen12_xcs_offsets;
 516	} else {
 517		if (GRAPHICS_VER(xe) >= 20)
 518			return xe2_xcs_offsets;
 519		else if (GRAPHICS_VERx100(xe) >= 1255)
 520			return dg2_xcs_offsets;
 521		else
 522			return gen12_xcs_offsets;
 523	}
 524}
 525
 526static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
 527{
 528	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH) |
 529				    _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
 530				    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
 531
 532	/* TODO: Timestamp */
 533}
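/*
 * For reference, CTX_CONTEXT_CONTROL is a masked register: the
 * _MASKED_BIT_ENABLE()/_MASKED_BIT_DISABLE() helpers put the value bit in
 * the low 16 bits and the matching write-enable bit in the high 16 bits,
 * so only the bits named above are modified when the register is loaded
 * from the context image.
 */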
 534
 535static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
 536{
 537	struct xe_device *xe = gt_to_xe(hwe->gt);
 538
 539	if (GRAPHICS_VERx100(xe) >= 1250)
 540		return 0x70;
 541	else
 542		return 0x60;
 543}
 544
 545static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
 546{
 547	int x;
 548
 549	x = lrc_ring_mi_mode(hwe);
 550	regs[x + 1] &= ~STOP_RING;
 551	regs[x + 1] |= STOP_RING << 16;
 552}
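/*
 * The same masked-write pattern applies here: clearing STOP_RING in the
 * low half while setting STOP_RING << 16 in the high half asks the
 * hardware to clear the stop-ring bit, and nothing else, when MI_MODE is
 * restored from this context image.
 */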
 553
 554static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
 555{
 556	return 0;
 557}
 558
 559u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
 560{
 561	return lrc->ring.size;
 562}
 563
 564/* Make the magic macros work */
 565#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
 566
 567#define LRC_SEQNO_PPHWSP_OFFSET 512
 568#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
 569#define LRC_PARALLEL_PPHWSP_OFFSET 2048
 570#define LRC_PPHWSP_SIZE SZ_4K
 571
 572static size_t lrc_reg_size(struct xe_device *xe)
 573{
 574	if (GRAPHICS_VERx100(xe) >= 1250)
 575		return 96 * sizeof(u32);
 576	else
 577		return 80 * sizeof(u32);
 578}
 579
 580size_t xe_lrc_skip_size(struct xe_device *xe)
 581{
 582	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
 583}
 584
 585static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
 586{
 587	/* The seqno is stored in the driver-defined portion of PPHWSP */
 588	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
 589}
 590
 591static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
 592{
 593	/* The start seqno is stored in the driver-defined portion of PPHWSP */
 594	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
 595}
 596
 597static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
 598{
  599	/* The parallel area is stored in the driver-defined portion of PPHWSP */
 600	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
 601}
 602
 603static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
 604{
 605	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
 606}
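/*
 * Putting the offset helpers above together, lrc->bo is laid out as:
 *
 *	[0, ring.size)			ring buffer
 *	[ring.size, ring.size + 4K)	PPHWSP
 *		+512	seqno		(LRC_SEQNO_PPHWSP_OFFSET)
 *		+520	start seqno	(LRC_START_SEQNO_PPHWSP_OFFSET)
 *		+2048	parallel area	(LRC_PARALLEL_PPHWSP_OFFSET)
 *	[ring.size + 4K, ...)		context register state
 */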
 607
 608#define DECL_MAP_ADDR_HELPERS(elem) \
 609static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
 610{ \
 611	struct iosys_map map = lrc->bo->vmap; \
 612\
 613	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
 614	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
 615	return map; \
 616} \
 617static inline u32 __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
 618{ \
 619	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
 620} \
 621
 622DECL_MAP_ADDR_HELPERS(ring)
 623DECL_MAP_ADDR_HELPERS(pphwsp)
 624DECL_MAP_ADDR_HELPERS(seqno)
 625DECL_MAP_ADDR_HELPERS(regs)
 626DECL_MAP_ADDR_HELPERS(start_seqno)
 627DECL_MAP_ADDR_HELPERS(parallel)
 628
 629#undef DECL_MAP_ADDR_HELPERS
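/*
 * Each DECL_MAP_ADDR_HELPERS(elem) invocation above expands to two
 * helpers: __xe_lrc_<elem>_map(), returning an iosys_map pointing at that
 * element inside lrc->bo, and __xe_lrc_<elem>_ggtt_addr(), returning its
 * GGTT address.  Both forms are used throughout the rest of this file.
 */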
 630
 631u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
 632{
 633	return __xe_lrc_pphwsp_ggtt_addr(lrc);
 634}
 635
 636u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
 637{
 638	struct xe_device *xe = lrc_to_xe(lrc);
 639	struct iosys_map map;
 640
 641	map = __xe_lrc_regs_map(lrc);
 642	iosys_map_incr(&map, reg_nr * sizeof(u32));
 643	return xe_map_read32(xe, &map);
 644}
 645
 646void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
 647{
 648	struct xe_device *xe = lrc_to_xe(lrc);
 649	struct iosys_map map;
 650
 651	map = __xe_lrc_regs_map(lrc);
 652	iosys_map_incr(&map, reg_nr * sizeof(u32));
 653	xe_map_write32(xe, &map, val);
 654}
 655
 656static void *empty_lrc_data(struct xe_hw_engine *hwe)
 657{
 658	struct xe_device *xe = gt_to_xe(hwe->gt);
 659	void *data;
 660	u32 *regs;
 661
 662	data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL);
 663	if (!data)
 664		return NULL;
 665
  666	/* 1st page: per-process HW status page (PPHWSP) */
 667	regs = data + LRC_PPHWSP_SIZE;
 668	set_offsets(regs, reg_offsets(xe, hwe->class), hwe);
 669	set_context_control(regs, hwe);
 670	reset_stop_ring(regs, hwe);
 671
 672	return data;
 673}
 674
 675static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
 676{
 677	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);
 678
 679	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
 680	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
 681}
 682
 683#define PVC_CTX_ASID		(0x2e + 1)
 684#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
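/*
 * These appear to be dword indices into the context register state: 0x2e
 * and 0x2a are where the LRI tables above place the CS_CTX_ASID and
 * CS_ACC_CTR_THOLD register offsets, and the "+ 1" selects the value
 * dword that follows each offset.
 */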
 685
 686int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
 687		struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size)
 688{
 689	struct xe_gt *gt = hwe->gt;
 690	struct xe_tile *tile = gt_to_tile(gt);
 691	struct xe_device *xe = gt_to_xe(gt);
 692	struct iosys_map map;
 693	void *init_data = NULL;
 694	u32 arb_enable;
 695	int err;
 696
 697	lrc->flags = 0;
 698
 699	/*
 700	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
 701	 * via VM bind calls.
 702	 */
 703	lrc->bo = xe_bo_create_pin_map(xe, tile, vm,
 704				      ring_size + xe_lrc_size(xe, hwe->class),
 705				      ttm_bo_type_kernel,
 706				      XE_BO_CREATE_VRAM_IF_DGFX(tile) |
 707				      XE_BO_CREATE_GGTT_BIT);
 708	if (IS_ERR(lrc->bo))
 709		return PTR_ERR(lrc->bo);
 710
 711	lrc->tile = gt_to_tile(hwe->gt);
 712	lrc->ring.size = ring_size;
 713	lrc->ring.tail = 0;
 714
 715	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
 716			     hwe->fence_irq, hwe->name);
 717
 718	if (!gt->default_lrc[hwe->class]) {
 719		init_data = empty_lrc_data(hwe);
 720		if (!init_data) {
 721			err = -ENOMEM;
 722			goto err_lrc_finish;
 723		}
 724	}
 725
  726	/*
  727	 * Initialize the per-process HW status page and the LRC / context
  728	 * state to known values
  729	 */
 730	map = __xe_lrc_pphwsp_map(lrc);
 731	if (!init_data) {
 732		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
 733		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
 734				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
 735				 xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE);
 736	} else {
 737		xe_map_memcpy_to(xe, &map, 0, init_data,
 738				 xe_lrc_size(xe, hwe->class));
 739		kfree(init_data);
 740	}
 741
 742	if (vm) {
 743		xe_lrc_set_ppgtt(lrc, vm);
 744
 745		if (vm->xef)
 746			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
 747	}
 748
 749	xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
 750	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
 751	xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
 752	xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
 753			     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
 754	if (xe->info.has_asid && vm)
 755		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
 756
 757	lrc->desc = LRC_VALID;
 758	lrc->desc |= LRC_LEGACY_64B_CONTEXT << LRC_ADDRESSING_MODE_SHIFT;
 759	/* TODO: Priority */
 760
 761	/* While this appears to have something about privileged batches or
 762	 * some such, it really just means PPGTT mode.
 763	 */
 764	if (vm)
 765		lrc->desc |= LRC_PRIVILEGE;
 766
 767	if (GRAPHICS_VERx100(xe) < 1250) {
 768		lrc->desc |= (u64)hwe->instance << ENGINE_INSTANCE_SHIFT;
 769		lrc->desc |= (u64)hwe->class << ENGINE_CLASS_SHIFT;
 770	}
 771
 772	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 773	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));
 774
 775	map = __xe_lrc_seqno_map(lrc);
 776	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
 777
 778	map = __xe_lrc_start_seqno_map(lrc);
 779	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
 780
 781	return 0;
 782
 783err_lrc_finish:
 784	xe_lrc_finish(lrc);
 785	return err;
 786}
 787
 788void xe_lrc_finish(struct xe_lrc *lrc)
 789{
 790	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
 791	xe_bo_lock(lrc->bo, false);
 792	xe_bo_unpin(lrc->bo);
 793	xe_bo_unlock(lrc->bo);
 794	xe_bo_put(lrc->bo);
 795}
 796
 797void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
 798{
 799	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
 800}
 801
 802u32 xe_lrc_ring_head(struct xe_lrc *lrc)
 803{
 804	return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
 805}
 806
 807u32 xe_lrc_ring_space(struct xe_lrc *lrc)
 808{
 809	const u32 head = xe_lrc_ring_head(lrc);
 810	const u32 tail = lrc->ring.tail;
 811	const u32 size = lrc->ring.size;
 812
 813	return ((head - tail - 1) & (size - 1)) + 1;
 814}
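/*
 * Worked example of the expression above: with head == tail the whole
 * ring (size bytes) is reported free; with the tail wrapped around to
 * one dword behind the head, only 4 bytes are reported free.
 */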
 815
 816static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
 817				const void *data, size_t size)
 818{
 819	struct xe_device *xe = lrc_to_xe(lrc);
 820
 821	iosys_map_incr(&ring, lrc->ring.tail);
 822	xe_map_memcpy_to(xe, &ring, 0, data, size);
 823	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
 824}
 825
 826void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
 827{
 828	struct xe_device *xe = lrc_to_xe(lrc);
 829	struct iosys_map ring;
 830	u32 rhs;
 831	size_t aligned_size;
 832
 833	xe_assert(xe, IS_ALIGNED(size, 4));
 834	aligned_size = ALIGN(size, 8);
 835
 836	ring = __xe_lrc_ring_map(lrc);
 837
 838	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
 839	rhs = lrc->ring.size - lrc->ring.tail;
 840	if (size > rhs) {
 841		__xe_lrc_write_ring(lrc, ring, data, rhs);
 842		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
 843	} else {
 844		__xe_lrc_write_ring(lrc, ring, data, size);
 845	}
 846
 847	if (aligned_size > size) {
 848		u32 noop = MI_NOOP;
 849
 850		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
 851	}
 852}
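/*
 * Illustration of the wrap handling above: a 12-byte write with the tail
 * at ring.size - 8 copies 8 bytes up to the end of the ring, wraps,
 * copies the remaining 4 bytes at offset 0, and then appends one MI_NOOP
 * because ALIGN(12, 8) > 12, leaving the tail qword-aligned.
 */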
 853
 854u64 xe_lrc_descriptor(struct xe_lrc *lrc)
 855{
 856	return lrc->desc | xe_lrc_ggtt_addr(lrc);
 857}
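/*
 * Combining the pieces set up in xe_lrc_init(), a queue with a VM on a
 * pre-12.50 platform ends up submitting a descriptor of roughly:
 *
 *	__xe_lrc_pphwsp_ggtt_addr(lrc) | LRC_VALID | LRC_PRIVILEGE |
 *	(LRC_LEGACY_64B_CONTEXT << LRC_ADDRESSING_MODE_SHIFT) |
 *	((u64)hwe->class << ENGINE_CLASS_SHIFT) |
 *	((u64)hwe->instance << ENGINE_INSTANCE_SHIFT)
 */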
 858
 859u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
 860{
 861	return __xe_lrc_seqno_ggtt_addr(lrc);
 862}
 863
 864struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
 865{
 866	return &xe_hw_fence_create(&lrc->fence_ctx,
 867				   __xe_lrc_seqno_map(lrc))->dma;
 868}
 869
 870s32 xe_lrc_seqno(struct xe_lrc *lrc)
 871{
 872	struct iosys_map map = __xe_lrc_seqno_map(lrc);
 873
 874	return xe_map_read32(lrc_to_xe(lrc), &map);
 875}
 876
 877s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
 878{
 879	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);
 880
 881	return xe_map_read32(lrc_to_xe(lrc), &map);
 882}
 883
 884u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
 885{
 886	return __xe_lrc_start_seqno_ggtt_addr(lrc);
 887}
 888
 889u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
 890{
 891	return __xe_lrc_parallel_ggtt_addr(lrc);
 892}
 893
 894struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
 895{
 896	return __xe_lrc_parallel_map(lrc);
 897}
 898
 899static int instr_dw(u32 cmd_header)
 900{
 901	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
 902	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
 903	    GFXPIPE_SINGLE_DW_CMD(0, 0))
 904		return 1;
 905
 906	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
 907	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
 908		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;
 909
 910	/* Most instructions have the # of dwords (minus 2) in 7:0 */
 911	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
 912}
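/*
 * Example: for an ordinary GFXPIPE command the total length is the value
 * in bits 7:0 plus two, so a header with 0x05 in that field is treated as
 * a 7-dword instruction; the SINGLE_DW opcodes and 3DSTATE_SO_DECL_LIST
 * handled above are the only exceptions.
 */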
 913
 914static int dump_mi_command(struct drm_printer *p,
 915			   struct xe_gt *gt,
 916			   u32 *dw,
 917			   int remaining_dw)
 918{
 919	u32 inst_header = *dw;
 920	u32 numdw = instr_dw(inst_header);
 921	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
 922	int num_noop;
 923
 924	/* First check for commands that don't have/use a '# DW' field */
 925	switch (inst_header & MI_OPCODE) {
 926	case MI_NOOP:
 927		num_noop = 1;
 928		while (num_noop < remaining_dw &&
 929		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
 930			num_noop++;
 931		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
 932		return num_noop;
 933
 934	case MI_TOPOLOGY_FILTER:
 935		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
 936		return 1;
 937
 938	case MI_BATCH_BUFFER_END:
 939		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
 940		/* Return 'remaining_dw' to consume the rest of the LRC */
 941		return remaining_dw;
 942	}
 943
 944	/*
 945	 * Any remaining commands include a # of dwords.  We should make sure
 946	 * it doesn't exceed the remaining size of the LRC.
 947	 */
 948	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
 949		numdw = remaining_dw;
 950
 951	switch (inst_header & MI_OPCODE) {
 952	case MI_LOAD_REGISTER_IMM:
 953		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
 954			   inst_header, (numdw - 1) / 2);
 955		for (int i = 1; i < numdw; i += 2)
 956			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
 957		return numdw;
 958
 959	case MI_FORCE_WAKEUP:
 960		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
 961		return numdw;
 962
 963	default:
 964		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
 965			   inst_header, opcode, numdw);
 966		return numdw;
 967	}
 968}
 969
 970static int dump_gfxpipe_command(struct drm_printer *p,
 971				struct xe_gt *gt,
 972				u32 *dw,
 973				int remaining_dw)
 974{
 975	u32 numdw = instr_dw(*dw);
 976	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
 977	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
 978	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);
 979
 980	/*
 981	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
 982	 * remaining size of the LRC.
 983	 */
 984	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
 985		numdw = remaining_dw;
 986
 987	switch (*dw & GFXPIPE_MATCH_MASK) {
 988#define MATCH(cmd) \
 989	case cmd: \
 990		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
 991		return numdw
 992#define MATCH3D(cmd) \
 993	case CMD_##cmd: \
 994		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
 995		return numdw
 996
 997	MATCH(STATE_BASE_ADDRESS);
 998	MATCH(STATE_SIP);
 999	MATCH(GPGPU_CSR_BASE_ADDRESS);
1000	MATCH(STATE_COMPUTE_MODE);
1001	MATCH3D(3DSTATE_BTD);
1002
1003	MATCH3D(3DSTATE_VF_STATISTICS);
1004
1005	MATCH(PIPELINE_SELECT);
1006
1007	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
1008	MATCH3D(3DSTATE_CLEAR_PARAMS);
1009	MATCH3D(3DSTATE_DEPTH_BUFFER);
1010	MATCH3D(3DSTATE_STENCIL_BUFFER);
1011	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
1012	MATCH3D(3DSTATE_VERTEX_BUFFERS);
1013	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
1014	MATCH3D(3DSTATE_INDEX_BUFFER);
1015	MATCH3D(3DSTATE_VF);
1016	MATCH3D(3DSTATE_MULTISAMPLE);
1017	MATCH3D(3DSTATE_CC_STATE_POINTERS);
1018	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
1019	MATCH3D(3DSTATE_VS);
1020	MATCH3D(3DSTATE_GS);
1021	MATCH3D(3DSTATE_CLIP);
1022	MATCH3D(3DSTATE_SF);
1023	MATCH3D(3DSTATE_WM);
1024	MATCH3D(3DSTATE_CONSTANT_VS);
1025	MATCH3D(3DSTATE_CONSTANT_GS);
1026	MATCH3D(3DSTATE_SAMPLE_MASK);
1027	MATCH3D(3DSTATE_CONSTANT_HS);
1028	MATCH3D(3DSTATE_CONSTANT_DS);
1029	MATCH3D(3DSTATE_HS);
1030	MATCH3D(3DSTATE_TE);
1031	MATCH3D(3DSTATE_DS);
1032	MATCH3D(3DSTATE_STREAMOUT);
1033	MATCH3D(3DSTATE_SBE);
1034	MATCH3D(3DSTATE_PS);
1035	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
1036	MATCH3D(3DSTATE_CPS_POINTERS);
1037	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
1038	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
1039	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
1040	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
1041	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
1042	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
1043	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
1044	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
1045	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
1046	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
1047	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
1048	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
1049	MATCH3D(3DSTATE_VF_INSTANCING);
1050	MATCH3D(3DSTATE_VF_SGVS);
1051	MATCH3D(3DSTATE_VF_TOPOLOGY);
1052	MATCH3D(3DSTATE_WM_CHROMAKEY);
1053	MATCH3D(3DSTATE_PS_BLEND);
1054	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
1055	MATCH3D(3DSTATE_PS_EXTRA);
1056	MATCH3D(3DSTATE_RASTER);
1057	MATCH3D(3DSTATE_SBE_SWIZ);
1058	MATCH3D(3DSTATE_WM_HZ_OP);
1059	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
1060	MATCH3D(3DSTATE_VF_SGVS_2);
1061	MATCH3D(3DSTATE_VFG);
1062	MATCH3D(3DSTATE_URB_ALLOC_VS);
1063	MATCH3D(3DSTATE_URB_ALLOC_HS);
1064	MATCH3D(3DSTATE_URB_ALLOC_DS);
1065	MATCH3D(3DSTATE_URB_ALLOC_GS);
1066	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
1067	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
1068	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
1069	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
1070	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
1071	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
1072	MATCH3D(3DSTATE_AMFS);
1073	MATCH3D(3DSTATE_DEPTH_BOUNDS);
1074	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
1075	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
1076	MATCH3D(3DSTATE_MESH_CONTROL);
1077	MATCH3D(3DSTATE_MESH_DISTRIB);
1078	MATCH3D(3DSTATE_TASK_REDISTRIB);
1079	MATCH3D(3DSTATE_MESH_SHADER);
1080	MATCH3D(3DSTATE_MESH_SHADER_DATA);
1081	MATCH3D(3DSTATE_TASK_CONTROL);
1082	MATCH3D(3DSTATE_TASK_SHADER);
1083	MATCH3D(3DSTATE_TASK_SHADER_DATA);
1084	MATCH3D(3DSTATE_URB_ALLOC_MESH);
1085	MATCH3D(3DSTATE_URB_ALLOC_TASK);
1086	MATCH3D(3DSTATE_CLIP_MESH);
1087	MATCH3D(3DSTATE_SBE_MESH);
1088	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
1089
1090	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
1091	MATCH3D(3DSTATE_CHROMA_KEY);
1092	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
1093	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
1094	MATCH3D(3DSTATE_LINE_STIPPLE);
1095	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
1096	MATCH3D(3DSTATE_MONOFILTER_SIZE);
1097	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
1098	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
1099	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
1100	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
1101	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
1102	MATCH3D(3DSTATE_SO_DECL_LIST);
1103	MATCH3D(3DSTATE_SO_BUFFER);
1104	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
1105	MATCH3D(3DSTATE_SAMPLE_PATTERN);
1106	MATCH3D(3DSTATE_3D_MODE);
1107	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
1108	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
1109	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
1110
1111	default:
1112		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
1113			   *dw, pipeline, opcode, subopcode, numdw);
1114		return numdw;
1115	}
1116}
1117
1118void xe_lrc_dump_default(struct drm_printer *p,
1119			 struct xe_gt *gt,
1120			 enum xe_engine_class hwe_class)
1121{
1122	u32 *dw;
1123	int remaining_dw, num_dw;
1124
1125	if (!gt->default_lrc[hwe_class]) {
1126		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1127		return;
1128	}
1129
1130	/*
1131	 * Skip the beginning of the LRC since it contains the per-process
1132	 * hardware status page.
1133	 */
1134	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1135	remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4;
1136
1137	while (remaining_dw > 0) {
1138		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1139			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1140		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1141			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1142		} else {
1143			num_dw = min(instr_dw(*dw), remaining_dw);
1144			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
1145				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1146				   num_dw);
1147		}
1148
1149		dw += num_dw;
1150		remaining_dw -= num_dw;
1151	}
1152}
1153
1154struct instr_state {
1155	u32 instr;
1156	u16 num_dw;
1157};
1158
1159static const struct instr_state xe_hpg_svg_state[] = {
1160	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
1161	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
1162	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
1163	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
1164	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
1165	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
1166	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
1167	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
1168	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
1169	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
1170	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
1171	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
1172	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
1173	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
1174	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
1175	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
1176	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
1177	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
1178	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
1179	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
1180	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
1181	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
1182	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
1183	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
1184	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
1185	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
1186	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
1187	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
1188	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
1189	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
1190	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
1191	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
1192	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
1193	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
1194	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
1195	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
1196	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
1197	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
1198	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
1199	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
1200	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
1201	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
1202	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
1203	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
1204	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
1205	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
1206	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
1207	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
1208	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
1209	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
1210};
1211
1212void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1213{
1214	struct xe_gt *gt = q->hwe->gt;
1215	struct xe_device *xe = gt_to_xe(gt);
1216	const struct instr_state *state_table = NULL;
1217	int state_table_size = 0;
1218
1219	/*
1220	 * At the moment we only need to emit non-register state for the RCS
1221	 * engine.
1222	 */
1223	if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
1224		return;
1225
1226	switch (GRAPHICS_VERx100(xe)) {
1227	case 1255:
1228	case 1270 ... 2004:
1229		state_table = xe_hpg_svg_state;
1230		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1231		break;
1232	default:
1233		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1234			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1235		return;
1236	}
1237
1238	for (int i = 0; i < state_table_size; i++) {
1239		u32 instr = state_table[i].instr;
1240		u16 num_dw = state_table[i].num_dw;
1241		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1242
1243		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1244		xe_gt_assert(gt, num_dw != 0);
1245		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1246
1247		/*
1248		 * Xe2's SVG context is the same as the one on DG2 / MTL
1249		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1250		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1251		 * Just make the replacement here rather than defining a
1252		 * whole separate table for the single trivial change.
1253		 */
1254		if (GRAPHICS_VER(xe) >= 20 &&
1255		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1256			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1257
1258		bb->cs[bb->len] = instr;
1259		if (!is_single_dw)
1260			bb->cs[bb->len] |= (num_dw - 2);
1261
1262		bb->len += num_dw;
1263	}
1264}
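/*
 * Emission example for the loop above: CMD_3DSTATE_CONSTANT_VS with
 * num_dw == 11 is written as a single header dword ORed with 9
 * (num_dw - 2), and bb->len advances by 11 so the following 10 dwords of
 * the batch are reserved for the instruction's payload.
 */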