Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.1.
   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2014-2018 Intel Corporation
   4 */
   5
   6#include "i915_drv.h"
   7#include "i915_reg.h"
   8#include "intel_context.h"
   9#include "intel_engine_pm.h"
  10#include "intel_engine_regs.h"
  11#include "intel_gpu_commands.h"
  12#include "intel_gt.h"
  13#include "intel_gt_ccs_mode.h"
  14#include "intel_gt_mcr.h"
  15#include "intel_gt_print.h"
  16#include "intel_gt_regs.h"
  17#include "intel_ring.h"
  18#include "intel_workarounds.h"
  19
  20#include "display/intel_fbc_regs.h"
  21
  22/**
  23 * DOC: Hardware workarounds
  24 *
  25 * Hardware workarounds are register programming documented to be executed in
  26 * the driver that fall outside of the normal programming sequences for a
  27 * platform. There are some basic categories of workarounds, depending on
  28 * how/when they are applied:
  29 *
  30 * - Context workarounds: workarounds that touch registers that are
  31 *   saved/restored to/from the HW context image. The list is emitted (via Load
  32 *   Register Immediate commands) once when initializing the device and saved in
  33 *   the default context. That default context is then used on every context
  34 *   creation to have a "primed golden context", i.e. a context image that
  35 *   already contains the changes needed to all the registers.
  36 *
  37 *   Context workarounds should be implemented in the \*_ctx_workarounds_init()
  38 *   variants respective to the targeted platforms.
  39 *
  40 * - Engine workarounds: the list of these WAs is applied whenever the specific
  41 *   engine is reset. It's also possible that a set of engine classes share a
  42 *   common power domain and they are reset together. This happens on some
  43 *   platforms with render and compute engines. In this case (at least) one of
  44 *   them need to keeep the workaround programming: the approach taken in the
  45 *   driver is to tie those workarounds to the first compute/render engine that
  46 *   is registered.  When executing with GuC submission, engine resets are
  47 *   outside of kernel driver control, hence the list of registers involved in
  48 *   written once, on engine initialization, and then passed to GuC, that
  49 *   saves/restores their values before/after the reset takes place. See
  50 *   ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference.
  51 *
  52 *   Workarounds for registers specific to RCS and CCS should be implemented in
  53 *   rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for
  54 *   registers belonging to BCS, VCS or VECS should be implemented in
  55 *   xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
  56 *   engine's MMIO range but that are part of of the common RCS/CCS reset domain
  57 *   should be implemented in general_render_compute_wa_init(). The settings
  58 *   about the CCS load balancing should be added in ccs_engine_wa_mode().
  59 *
  60 * - GT workarounds: the list of these WAs is applied whenever these registers
  61 *   revert to their default values: on GPU reset, suspend/resume [1]_, etc.
  62 *
  63 *   GT workarounds should be implemented in the \*_gt_workarounds_init()
  64 *   variants respective to the targeted platforms.
  65 *
  66 * - Register whitelist: some workarounds need to be implemented in userspace,
  67 *   but need to touch privileged registers. The whitelist in the kernel
  68 *   instructs the hardware to allow the access to happen. From the kernel side,
  69 *   this is just a special case of a MMIO workaround (as we write the list of
  70 *   these to/be-whitelisted registers to some special HW registers).
  71 *
  72 *   Register whitelisting should be done in the \*_whitelist_build() variants
  73 *   respective to the targeted platforms.
  74 *
  75 * - Workaround batchbuffers: buffers that get executed automatically by the
  76 *   hardware on every HW context restore. These buffers are created and
  77 *   programmed in the default context so the hardware always go through those
  78 *   programming sequences when switching contexts. The support for workaround
  79 *   batchbuffers is enabled these hardware mechanisms:
  80 *
  81 *   #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default
  82 *      context, pointing the hardware to jump to that location when that offset
  83 *      is reached in the context restore. Workaround batchbuffer in the driver
  84 *      currently uses this mechanism for all platforms.
  85 *
  86 *   #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context,
  87 *      pointing the hardware to a buffer to continue executing after the
  88 *      engine registers are restored in a context restore sequence. This is
  89 *      currently not used in the driver.
  90 *
  91 * - Other:  There are WAs that, due to their nature, cannot be applied from a
  92 *   central place. Those are peppered around the rest of the code, as needed.
  93 *   Workarounds related to the display IP are the main example.
  94 *
  95 * .. [1] Technically, some registers are powercontext saved & restored, so they
  96 *    survive a suspend/resume. In practice, writing them again is not too
  97 *    costly and simplifies things, so it's the approach taken in the driver.
  98 */
  99
 100static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt,
 101			  const char *name, const char *engine_name)
 102{
 103	wal->gt = gt;
 104	wal->name = name;
 105	wal->engine_name = engine_name;
 106}
 107
 108#define WA_LIST_CHUNK (1 << 4)
 109
 110static void wa_init_finish(struct i915_wa_list *wal)
 111{
 112	/* Trim unused entries. */
 113	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
 114		struct i915_wa *list = kmemdup_array(wal->list, wal->count,
 115						     sizeof(*list), GFP_KERNEL);
 116
 117		if (list) {
 118			kfree(wal->list);
 119			wal->list = list;
 120		}
 121	}
 122
 123	if (!wal->count)
 124		return;
 125
 126	gt_dbg(wal->gt, "Initialized %u %s workarounds on %s\n",
 127	       wal->wa_count, wal->name, wal->engine_name);
 128}
 129
 130static enum forcewake_domains
 131wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
 132{
 133	enum forcewake_domains fw = 0;
 134	struct i915_wa *wa;
 135	unsigned int i;
 136
 137	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
 138		fw |= intel_uncore_forcewake_for_reg(uncore,
 139						     wa->reg,
 140						     FW_REG_READ |
 141						     FW_REG_WRITE);
 142
 143	return fw;
 144}
 145
 146static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
 147{
 148	unsigned int addr = i915_mmio_reg_offset(wa->reg);
 149	struct drm_i915_private *i915 = wal->gt->i915;
 150	unsigned int start = 0, end = wal->count;
 151	const unsigned int grow = WA_LIST_CHUNK;
 152	struct i915_wa *wa_;
 153
 154	GEM_BUG_ON(!is_power_of_2(grow));
 155
 156	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
 157		struct i915_wa *list;
 158
 159		list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
 160				     GFP_KERNEL);
 161		if (!list) {
 162			drm_err(&i915->drm, "No space for workaround init!\n");
 163			return;
 164		}
 165
 166		if (wal->list) {
 167			memcpy(list, wal->list, sizeof(*wa) * wal->count);
 168			kfree(wal->list);
 169		}
 170
 171		wal->list = list;
 172	}
 173
 174	while (start < end) {
 175		unsigned int mid = start + (end - start) / 2;
 176
 177		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
 178			start = mid + 1;
 179		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
 180			end = mid;
 181		} else {
 182			wa_ = &wal->list[mid];
 183
 184			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
 185				drm_err(&i915->drm,
 186					"Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
 187					i915_mmio_reg_offset(wa_->reg),
 188					wa_->clr, wa_->set);
 189
 190				wa_->set &= ~wa->clr;
 191			}
 192
 193			wal->wa_count++;
 194			wa_->set |= wa->set;
 195			wa_->clr |= wa->clr;
 196			wa_->read |= wa->read;
 197			return;
 198		}
 199	}
 200
 201	wal->wa_count++;
 202	wa_ = &wal->list[wal->count++];
 203	*wa_ = *wa;
 204
 205	while (wa_-- > wal->list) {
 206		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
 207			   i915_mmio_reg_offset(wa_[1].reg));
 208		if (i915_mmio_reg_offset(wa_[1].reg) >
 209		    i915_mmio_reg_offset(wa_[0].reg))
 210			break;
 211
 212		swap(wa_[1], wa_[0]);
 213	}
 214}
 215
 216static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
 217		   u32 clear, u32 set, u32 read_mask, bool masked_reg)
 218{
 219	struct i915_wa wa = {
 220		.reg  = reg,
 221		.clr  = clear,
 222		.set  = set,
 223		.read = read_mask,
 224		.masked_reg = masked_reg,
 225	};
 226
 227	_wa_add(wal, &wa);
 228}
 229
 230static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
 231		       u32 clear, u32 set, u32 read_mask, bool masked_reg)
 232{
 233	struct i915_wa wa = {
 234		.mcr_reg = reg,
 235		.clr  = clear,
 236		.set  = set,
 237		.read = read_mask,
 238		.masked_reg = masked_reg,
 239		.is_mcr = 1,
 240	};
 241
 242	_wa_add(wal, &wa);
 243}
 244
 245static void
 246wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
 247{
 248	wa_add(wal, reg, clear, set, clear | set, false);
 249}
 250
 251static void
 252wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
 253{
 254	wa_mcr_add(wal, reg, clear, set, clear | set, false);
 255}
 256
 257static void
 258wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 259{
 260	wa_write_clr_set(wal, reg, ~0, set);
 261}
 262
 263static void
 264wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 265{
 266	wa_write_clr_set(wal, reg, set, set);
 267}
 268
 269static void
 270wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
 271{
 272	wa_mcr_write_clr_set(wal, reg, set, set);
 273}
 274
 275static void
 276wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
 277{
 278	wa_write_clr_set(wal, reg, clr, 0);
 279}
 280
 281static void
 282wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
 283{
 284	wa_mcr_write_clr_set(wal, reg, clr, 0);
 285}
 286
 287/*
 288 * WA operations on "masked register". A masked register has the upper 16 bits
 289 * documented as "masked" in b-spec. Its purpose is to allow writing to just a
 290 * portion of the register without a rmw: you simply write in the upper 16 bits
 291 * the mask of bits you are going to modify.
 292 *
 293 * The wa_masked_* family of functions already does the necessary operations to
 294 * calculate the mask based on the parameters passed, so user only has to
 295 * provide the lower 16 bits of that register.
 296 */
 297
 298static void
 299wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 300{
 301	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
 302}
 303
 304static void
 305wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
 306{
 307	wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
 308}
 309
 310static void
 311wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 312{
 313	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
 314}
 315
 316static void
 317wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
 318{
 319	wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
 320}
 321
 322static void
 323wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
 324		    u32 mask, u32 val)
 325{
 326	wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
 327}
 328
 329static void
 330wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
 331			u32 mask, u32 val)
 332{
 333	wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
 334}
 335
 336static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
 337				      struct i915_wa_list *wal)
 338{
 339	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 340}
 341
 342static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
 343				      struct i915_wa_list *wal)
 344{
 345	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 346}
 347
 348static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
 349				      struct i915_wa_list *wal)
 350{
 351	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 352
 353	/* WaDisableAsyncFlipPerfMode:bdw,chv */
 354	wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
 355
 356	/* WaDisablePartialInstShootdown:bdw,chv */
 357	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 358			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 359
 360	/* Use Force Non-Coherent whenever executing a 3D context. This is a
 361	 * workaround for a possible hang in the unlikely event a TLB
 362	 * invalidation occurs during a PSD flush.
 363	 */
 364	/* WaForceEnableNonCoherent:bdw,chv */
 365	/* WaHdcDisableFetchWhenMasked:bdw,chv */
 366	wa_masked_en(wal, HDC_CHICKEN0,
 367		     HDC_DONOT_FETCH_MEM_WHEN_MASKED |
 368		     HDC_FORCE_NON_COHERENT);
 369
 370	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
 371	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
 372	 *  polygons in the same 8x4 pixel/sample area to be processed without
 373	 *  stalling waiting for the earlier ones to write to Hierarchical Z
 374	 *  buffer."
 375	 *
 376	 * This optimization is off by default for BDW and CHV; turn it on.
 377	 */
 378	wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
 379
 380	/* Wa4x4STCOptimizationDisable:bdw,chv */
 381	wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
 382
 383	/*
 384	 * BSpec recommends 8x4 when MSAA is used,
 385	 * however in practice 16x4 seems fastest.
 386	 *
 387	 * Note that PS/WM thread counts depend on the WIZ hashing
 388	 * disable bit, which we don't touch here, but it's good
 389	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 390	 */
 391	wa_masked_field_set(wal, GEN7_GT_MODE,
 392			    GEN6_WIZ_HASHING_MASK,
 393			    GEN6_WIZ_HASHING_16x4);
 394}
 395
 396static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
 397				     struct i915_wa_list *wal)
 398{
 399	struct drm_i915_private *i915 = engine->i915;
 400
 401	gen8_ctx_workarounds_init(engine, wal);
 402
 403	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
 404	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 405
 406	/* WaDisableDopClockGating:bdw
 407	 *
 408	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
 409	 * to disable EUTC clock gating.
 410	 */
 411	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
 412			 DOP_CLOCK_GATING_DISABLE);
 413
 414	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
 415			 GEN8_SAMPLER_POWER_BYPASS_DIS);
 416
 417	wa_masked_en(wal, HDC_CHICKEN0,
 418		     /* WaForceContextSaveRestoreNonCoherent:bdw */
 419		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 420		     /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
 421		     (INTEL_INFO(i915)->gt == 3 ? HDC_FENCE_DEST_SLM_DISABLE : 0));
 422}
 423
 424static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
 425				     struct i915_wa_list *wal)
 426{
 427	gen8_ctx_workarounds_init(engine, wal);
 428
 429	/* WaDisableThreadStallDopClockGating:chv */
 430	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 431
 432	/* Improve HiZ throughput on CHV. */
 433	wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
 434}
 435
 436static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
 437				      struct i915_wa_list *wal)
 438{
 439	struct drm_i915_private *i915 = engine->i915;
 440
 441	if (HAS_LLC(i915)) {
 442		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
 443		 *
 444		 * Must match Display Engine. See
 445		 * WaCompressedResourceDisplayNewHashMode.
 446		 */
 447		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 448			     GEN9_PBE_COMPRESSED_HASH_SELECTION);
 449		wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
 450				 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
 451	}
 452
 453	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
 454	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
 455	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 456			 FLOW_CONTROL_ENABLE |
 457			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 458
 459	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
 460	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
 461	wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
 462			 GEN9_ENABLE_YV12_BUGFIX |
 463			 GEN9_ENABLE_GPGPU_PREEMPTION);
 464
 465	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
 466	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
 467	wa_masked_en(wal, CACHE_MODE_1,
 468		     GEN8_4x4_STC_OPTIMIZATION_DISABLE |
 469		     GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
 470
 471	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
 472	wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
 473			  GEN9_CCS_TLB_PREFETCH_ENABLE);
 474
 475	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
 476	wa_masked_en(wal, HDC_CHICKEN0,
 477		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 478		     HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
 479
 480	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
 481	 * both tied to WaForceContextSaveRestoreNonCoherent
 482	 * in some hsds for skl. We keep the tie for all gen9. The
 483	 * documentation is a bit hazy and so we want to get common behaviour,
 484	 * even though there is no clear evidence we would need both on kbl/bxt.
 485	 * This area has been source of system hangs so we play it safe
 486	 * and mimic the skl regardless of what bspec says.
 487	 *
 488	 * Use Force Non-Coherent whenever executing a 3D context. This
 489	 * is a workaround for a possible hang in the unlikely event
 490	 * a TLB invalidation occurs during a PSD flush.
 491	 */
 492
 493	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
 494	wa_masked_en(wal, HDC_CHICKEN0,
 495		     HDC_FORCE_NON_COHERENT);
 496
 497	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
 498	if (IS_SKYLAKE(i915) ||
 499	    IS_KABYLAKE(i915) ||
 500	    IS_COFFEELAKE(i915) ||
 501	    IS_COMETLAKE(i915))
 502		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
 503				 GEN8_SAMPLER_POWER_BYPASS_DIS);
 504
 505	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
 506	wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
 507
 508	/*
 509	 * Supporting preemption with fine-granularity requires changes in the
 510	 * batch buffer programming. Since we can't break old userspace, we
 511	 * need to set our default preemption level to safe value. Userspace is
 512	 * still able to use more fine-grained preemption levels, since in
 513	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
 514	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
 515	 * not real HW workarounds, but merely a way to start using preemption
 516	 * while maintaining old contract with userspace.
 517	 */
 518
 519	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
 520	wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
 521
 522	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */
 523	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 524			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 525			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 526
 527	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
 528	if (IS_GEN9_LP(i915))
 529		wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
 530}
 531
 532static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
 533				struct i915_wa_list *wal)
 534{
 535	struct intel_gt *gt = engine->gt;
 536	u8 vals[3] = { 0, 0, 0 };
 537	unsigned int i;
 538
 539	for (i = 0; i < 3; i++) {
 540		u8 ss;
 541
 542		/*
 543		 * Only consider slices where one, and only one, subslice has 7
 544		 * EUs
 545		 */
 546		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
 547			continue;
 548
 549		/*
 550		 * subslice_7eu[i] != 0 (because of the check above) and
 551		 * ss_max == 4 (maximum number of subslices possible per slice)
 552		 *
 553		 * ->    0 <= ss <= 3;
 554		 */
 555		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
 556		vals[i] = 3 - ss;
 557	}
 558
 559	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
 560		return;
 561
 562	/* Tune IZ hashing. See intel_device_info_runtime_init() */
 563	wa_masked_field_set(wal, GEN7_GT_MODE,
 564			    GEN9_IZ_HASHING_MASK(2) |
 565			    GEN9_IZ_HASHING_MASK(1) |
 566			    GEN9_IZ_HASHING_MASK(0),
 567			    GEN9_IZ_HASHING(2, vals[2]) |
 568			    GEN9_IZ_HASHING(1, vals[1]) |
 569			    GEN9_IZ_HASHING(0, vals[0]));
 570}
 571
 572static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
 573				     struct i915_wa_list *wal)
 574{
 575	gen9_ctx_workarounds_init(engine, wal);
 576	skl_tune_iz_hashing(engine, wal);
 577}
 578
 579static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
 580				     struct i915_wa_list *wal)
 581{
 582	gen9_ctx_workarounds_init(engine, wal);
 583
 584	/* WaDisableThreadStallDopClockGating:bxt */
 585	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 586			 STALL_DOP_GATING_DISABLE);
 587
 588	/* WaToEnableHwFixForPushConstHWBug:bxt */
 589	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 590		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 591}
 592
 593static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
 594				     struct i915_wa_list *wal)
 595{
 596	struct drm_i915_private *i915 = engine->i915;
 597
 598	gen9_ctx_workarounds_init(engine, wal);
 599
 600	/* WaToEnableHwFixForPushConstHWBug:kbl */
 601	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
 602		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 603			     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 604
 605	/* WaDisableSbeCacheDispatchPortSharing:kbl */
 606	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
 607			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 608}
 609
 610static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
 611				     struct i915_wa_list *wal)
 612{
 613	gen9_ctx_workarounds_init(engine, wal);
 614
 615	/* WaToEnableHwFixForPushConstHWBug:glk */
 616	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 617		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 618}
 619
 620static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
 621				     struct i915_wa_list *wal)
 622{
 623	gen9_ctx_workarounds_init(engine, wal);
 624
 625	/* WaToEnableHwFixForPushConstHWBug:cfl */
 626	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 627		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 628
 629	/* WaDisableSbeCacheDispatchPortSharing:cfl */
 630	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
 631			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 632}
 633
 634static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
 635				     struct i915_wa_list *wal)
 636{
 637	/* Wa_1406697149 (WaDisableBankHangMode:icl) */
 638	wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);
 639
 640	/* WaForceEnableNonCoherent:icl
 641	 * This is not the same workaround as in early Gen9 platforms, where
 642	 * lacking this could cause system hangs, but coherency performance
 643	 * overhead is high and only a few compute workloads really need it
 644	 * (the register is whitelisted in hardware now, so UMDs can opt in
 645	 * for coherency if they have a good reason).
 646	 */
 647	wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
 648
 649	/* WaEnableFloatBlendOptimization:icl */
 650	wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
 651		   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
 652		   0 /* write-only, so skip validation */,
 653		   true);
 654
 655	/* WaDisableGPGPUMidThreadPreemption:icl */
 656	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 657			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 658			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 659
 660	/* allow headerless messages for preemptible GPGPU context */
 661	wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
 662			 GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
 663
 664	/* Wa_1604278689:icl,ehl */
 665	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
 666	wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
 667			 0,
 668			 0xFFFFFFFF);
 669
 670	/* Wa_1406306137:icl,ehl */
 671	wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
 672}
 673
 674/*
 675 * These settings aren't actually workarounds, but general tuning settings that
 676 * need to be programmed on dg2 platform.
 677 */
 678static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
 679				   struct i915_wa_list *wal)
 680{
 681	wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
 682	wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
 683			     REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
 684	wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
 685			     FF_MODE2_TDS_TIMER_128);
 686}
 687
 688static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
 689				       struct i915_wa_list *wal)
 690{
 691	struct drm_i915_private *i915 = engine->i915;
 692
 693	/*
 694	 * Wa_1409142259:tgl,dg1,adl-p
 695	 * Wa_1409347922:tgl,dg1,adl-p
 696	 * Wa_1409252684:tgl,dg1,adl-p
 697	 * Wa_1409217633:tgl,dg1,adl-p
 698	 * Wa_1409207793:tgl,dg1,adl-p
 699	 * Wa_1409178076:tgl,dg1,adl-p
 700	 * Wa_1408979724:tgl,dg1,adl-p
 701	 * Wa_14010443199:tgl,rkl,dg1,adl-p
 702	 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
 703	 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
 704	 */
 705	wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
 706		     GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
 707
 708	/* WaDisableGPGPUMidThreadPreemption:gen12 */
 709	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 710			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 711			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 712
 713	/*
 714	 * Wa_16011163337 - GS_TIMER
 715	 *
 716	 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we
 717	 * need to program it even on those that don't explicitly list that
 718	 * workaround.
 719	 *
 720	 * Note that the programming of GEN12_FF_MODE2 is further modified
 721	 * according to the FF_MODE2 guidance given by Wa_1608008084.
 722	 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
 723	 * value when read from the CPU.
 724	 *
 725	 * The default value for this register is zero for all fields.
 726	 * So instead of doing a RMW we should just write the desired values
 727	 * for TDS and GS timers. Note that since the readback can't be trusted,
 728	 * the clear mask is just set to ~0 to make sure other bits are not
 729	 * inadvertently set. For the same reason read verification is ignored.
 730	 */
 731	wa_add(wal,
 732	       GEN12_FF_MODE2,
 733	       ~0,
 734	       FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224,
 735	       0, false);
 736
 737	if (!IS_DG1(i915)) {
 738		/* Wa_1806527549 */
 739		wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
 740
 741		/* Wa_1606376872 */
 742		wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC);
 743	}
 744}
 745
 746static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
 747				     struct i915_wa_list *wal)
 748{
 749	gen12_ctx_workarounds_init(engine, wal);
 750
 751	/* Wa_1409044764 */
 752	wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
 753		      DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
 754
 755	/* Wa_22010493298 */
 756	wa_masked_en(wal, HIZ_CHICKEN,
 757		     DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
 758}
 759
 760static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
 761				     struct i915_wa_list *wal)
 762{
 763	dg2_ctx_gt_tuning_init(engine, wal);
 764
 765	/* Wa_16013271637:dg2 */
 766	wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
 767			 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
 768
 769	/* Wa_14014947963:dg2 */
 770	wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
 771
 772	/* Wa_18018764978:dg2 */
 773	wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
 774
 775	/* Wa_18019271663:dg2 */
 776	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
 777
 778	/* Wa_14019877138:dg2 */
 779	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
 780}
 781
 782static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine,
 783				     struct i915_wa_list *wal)
 784{
 785	struct intel_gt *gt = engine->gt;
 786
 787	dg2_ctx_gt_tuning_init(engine, wal);
 788
 789	/*
 790	 * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in
 791	 * gen12_emit_indirect_ctx_rcs() rather than here on some early
 792	 * steppings.
 793	 */
 794	if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
 795	      IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)))
 796		wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false);
 797}
 798
 799static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine,
 800				       struct i915_wa_list *wal)
 801{
 802	struct intel_gt *gt = engine->gt;
 803
 804	xelpg_ctx_gt_tuning_init(engine, wal);
 805
 806	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
 807	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
 808		/* Wa_14014947963 */
 809		wa_masked_field_set(wal, VF_PREEMPTION,
 810				    PREEMPTION_VERTEX_COUNT, 0x4000);
 811
 812		/* Wa_16013271637 */
 813		wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
 814				 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
 815
 816		/* Wa_18019627453 */
 817		wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS);
 818
 819		/* Wa_18018764978 */
 820		wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
 821	}
 822
 823	/* Wa_18019271663 */
 824	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
 825
 826	/* Wa_14019877138 */
 827	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
 828}
 829
 830static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
 831					 struct i915_wa_list *wal)
 832{
 833	/*
 834	 * This is a "fake" workaround defined by software to ensure we
 835	 * maintain reliable, backward-compatible behavior for userspace with
 836	 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
 837	 *
 838	 * The per-context setting of MI_MODE[12] determines whether the bits
 839	 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
 840	 * in the traditional manner or whether they should instead use a new
 841	 * tgl+ meaning that breaks backward compatibility, but allows nesting
 842	 * into 3rd-level batchbuffers.  When this new capability was first
 843	 * added in TGL, it remained off by default unless a context
 844	 * intentionally opted in to the new behavior.  However Xe_HPG now
 845	 * flips this on by default and requires that we explicitly opt out if
 846	 * we don't want the new behavior.
 847	 *
 848	 * From a SW perspective, we want to maintain the backward-compatible
 849	 * behavior for userspace, so we'll apply a fake workaround to set it
 850	 * back to the legacy behavior on platforms where the hardware default
 851	 * is to break compatibility.  At the moment there is no Linux
 852	 * userspace that utilizes third-level batchbuffers, so this will avoid
 853	 * userspace from needing to make any changes.  using the legacy
 854	 * meaning is the correct thing to do.  If/when we have userspace
 855	 * consumers that want to utilize third-level batch nesting, we can
 856	 * provide a context parameter to allow them to opt-in.
 857	 */
 858	wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
 859}
 860
 861static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
 862				   struct i915_wa_list *wal)
 863{
 864	u8 mocs;
 865
 866	/*
 867	 * Some blitter commands do not have a field for MOCS, those
 868	 * commands will use MOCS index pointed by BLIT_CCTL.
 869	 * BLIT_CCTL registers are needed to be programmed to un-cached.
 870	 */
 871	if (engine->class == COPY_ENGINE_CLASS) {
 872		mocs = engine->gt->mocs.uc_index;
 873		wa_write_clr_set(wal,
 874				 BLIT_CCTL(engine->mmio_base),
 875				 BLIT_CCTL_MASK,
 876				 BLIT_CCTL_MOCS(mocs, mocs));
 877	}
 878}
 879
 880/*
 881 * gen12_ctx_gt_fake_wa_init() aren't programmingan official workaround
 882 * defined by the hardware team, but it programming general context registers.
 883 * Adding those context register programming in context workaround
 884 * allow us to use the wa framework for proper application and validation.
 885 */
 886static void
 887gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
 888			  struct i915_wa_list *wal)
 889{
 890	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 891		fakewa_disable_nestedbb_mode(engine, wal);
 892
 893	gen12_ctx_gt_mocs_init(engine, wal);
 894}
 895
 896static void
 897__intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
 898			   struct i915_wa_list *wal,
 899			   const char *name)
 900{
 901	struct drm_i915_private *i915 = engine->i915;
 902
 903	wa_init_start(wal, engine->gt, name, engine->name);
 904
 905	/* Applies to all engines */
 906	/*
 907	 * Fake workarounds are not the actual workaround but
 908	 * programming of context registers using workaround framework.
 909	 */
 910	if (GRAPHICS_VER(i915) >= 12)
 911		gen12_ctx_gt_fake_wa_init(engine, wal);
 912
 913	if (engine->class != RENDER_CLASS)
 914		goto done;
 915
 916	if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
 917		xelpg_ctx_workarounds_init(engine, wal);
 918	else if (IS_DG2(i915))
 919		dg2_ctx_workarounds_init(engine, wal);
 920	else if (IS_DG1(i915))
 921		dg1_ctx_workarounds_init(engine, wal);
 922	else if (GRAPHICS_VER(i915) == 12)
 923		gen12_ctx_workarounds_init(engine, wal);
 924	else if (GRAPHICS_VER(i915) == 11)
 925		icl_ctx_workarounds_init(engine, wal);
 926	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
 927		cfl_ctx_workarounds_init(engine, wal);
 928	else if (IS_GEMINILAKE(i915))
 929		glk_ctx_workarounds_init(engine, wal);
 930	else if (IS_KABYLAKE(i915))
 931		kbl_ctx_workarounds_init(engine, wal);
 932	else if (IS_BROXTON(i915))
 933		bxt_ctx_workarounds_init(engine, wal);
 934	else if (IS_SKYLAKE(i915))
 935		skl_ctx_workarounds_init(engine, wal);
 936	else if (IS_CHERRYVIEW(i915))
 937		chv_ctx_workarounds_init(engine, wal);
 938	else if (IS_BROADWELL(i915))
 939		bdw_ctx_workarounds_init(engine, wal);
 940	else if (GRAPHICS_VER(i915) == 7)
 941		gen7_ctx_workarounds_init(engine, wal);
 942	else if (GRAPHICS_VER(i915) == 6)
 943		gen6_ctx_workarounds_init(engine, wal);
 944	else if (GRAPHICS_VER(i915) < 8)
 945		;
 946	else
 947		MISSING_CASE(GRAPHICS_VER(i915));
 948
 949done:
 950	wa_init_finish(wal);
 951}
 952
 953void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
 954{
 955	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
 956}
 957
 958int intel_engine_emit_ctx_wa(struct i915_request *rq)
 959{
 960	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
 961	struct intel_uncore *uncore = rq->engine->uncore;
 962	enum forcewake_domains fw;
 963	unsigned long flags;
 964	struct i915_wa *wa;
 965	unsigned int i;
 966	u32 *cs;
 967	int ret;
 968
 969	if (wal->count == 0)
 970		return 0;
 971
 972	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
 973	if (ret)
 974		return ret;
 975
 976	if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
 977	     IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS)
 978		cs = intel_ring_begin(rq, (wal->count * 2 + 6));
 979	else
 980		cs = intel_ring_begin(rq, (wal->count * 2 + 2));
 981
 982	if (IS_ERR(cs))
 983		return PTR_ERR(cs);
 984
 985	fw = wal_get_fw_for_rmw(uncore, wal);
 986
 987	intel_gt_mcr_lock(wal->gt, &flags);
 988	spin_lock(&uncore->lock);
 989	intel_uncore_forcewake_get__locked(uncore, fw);
 990
 991	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
 992	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
 993		u32 val;
 994
 995		/* Skip reading the register if it's not really needed */
 996		if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) {
 997			val = wa->set;
 998		} else {
 999			val = wa->is_mcr ?
1000				intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) :
1001				intel_uncore_read_fw(uncore, wa->reg);
1002			val &= ~wa->clr;
1003			val |= wa->set;
1004		}
1005
1006		*cs++ = i915_mmio_reg_offset(wa->reg);
1007		*cs++ = val;
1008	}
1009	*cs++ = MI_NOOP;
1010
1011	/* Wa_14019789679 */
1012	if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
1013	     IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS) {
1014		*cs++ = CMD_3DSTATE_MESH_CONTROL;
1015		*cs++ = 0;
1016		*cs++ = 0;
1017		*cs++ = MI_NOOP;
1018	}
1019
1020	intel_uncore_forcewake_put__locked(uncore, fw);
1021	spin_unlock(&uncore->lock);
1022	intel_gt_mcr_unlock(wal->gt, flags);
1023
1024	intel_ring_advance(rq, cs);
1025
1026	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
1027	if (ret)
1028		return ret;
1029
1030	return 0;
1031}
1032
1033static void
1034gen4_gt_workarounds_init(struct intel_gt *gt,
1035			 struct i915_wa_list *wal)
1036{
1037	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
1038	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
1039}
1040
1041static void
1042g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1043{
1044	gen4_gt_workarounds_init(gt, wal);
1045
1046	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
1047	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
1048}
1049
1050static void
1051ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1052{
1053	g4x_gt_workarounds_init(gt, wal);
1054
1055	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
1056}
1057
1058static void
1059snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1060{
1061}
1062
1063static void
1064ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1065{
1066	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
1067	wa_masked_dis(wal,
1068		      GEN7_COMMON_SLICE_CHICKEN1,
1069		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
1070
1071	/* WaApplyL3ControlAndL3ChickenMode:ivb */
1072	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
1073	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
1074
1075	/* WaForceL3Serialization:ivb */
1076	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1077}
1078
1079static void
1080vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1081{
1082	/* WaForceL3Serialization:vlv */
1083	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1084
1085	/*
1086	 * WaIncreaseL3CreditsForVLVB0:vlv
1087	 * This is the hardware default actually.
1088	 */
1089	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
1090}
1091
1092static void
1093hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1094{
1095	/* L3 caching of data atomics doesn't work -- disable it. */
1096	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
1097
1098	wa_add(wal,
1099	       HSW_ROW_CHICKEN3, 0,
1100	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
1101	       0 /* XXX does this reg exist? */, true);
1102
1103	/* WaVSRefCountFullforceMissDisable:hsw */
1104	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
1105}
1106
1107static void
1108gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1109{
1110	const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
1111	unsigned int slice, subslice;
1112	u32 mcr, mcr_mask;
1113
1114	GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
1115
1116	/*
1117	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1118	 * Before any MMIO read into slice/subslice specific registers, MCR
1119	 * packet control register needs to be programmed to point to any
1120	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
1121	 * This means each subsequent MMIO read will be forwarded to an
1122	 * specific s/ss combination, but this is OK since these registers
1123	 * are consistent across s/ss in almost all cases. In the rare
1124	 * occasions, such as INSTDONE, where this value is dependent
1125	 * on s/ss combo, the read should be done with read_subslice_reg.
1126	 */
1127	slice = ffs(sseu->slice_mask) - 1;
1128	GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1129	subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1130	GEM_BUG_ON(!subslice);
1131	subslice--;
1132
1133	/*
1134	 * We use GEN8_MCR..() macros to calculate the |mcr| value for
1135	 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
1136	 */
1137	mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1138	mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1139
1140	drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1141
1142	wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1143}
1144
1145static void
1146gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1147{
1148	struct drm_i915_private *i915 = gt->i915;
1149
1150	/* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1151	gen9_wa_init_mcr(i915, wal);
1152
1153	/* WaDisableKillLogic:bxt,skl,kbl */
1154	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1155		wa_write_or(wal,
1156			    GAM_ECOCHK,
1157			    ECOCHK_DIS_TLB);
1158
1159	if (HAS_LLC(i915)) {
1160		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1161		 *
1162		 * Must match Display Engine. See
1163		 * WaCompressedResourceDisplayNewHashMode.
1164		 */
1165		wa_write_or(wal,
1166			    MMCD_MISC_CTRL,
1167			    MMCD_PCLA | MMCD_HOTSPOT_EN);
1168	}
1169
1170	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1171	wa_write_or(wal,
1172		    GAM_ECOCHK,
1173		    BDW_DISABLE_HDC_INVALIDATION);
1174}
1175
1176static void
1177skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1178{
1179	gen9_gt_workarounds_init(gt, wal);
1180
1181	/* WaDisableGafsUnitClkGating:skl */
1182	wa_write_or(wal,
1183		    GEN7_UCGCTL4,
1184		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1185
1186	/* WaInPlaceDecompressionHang:skl */
1187	if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1188		wa_write_or(wal,
1189			    GEN9_GAMT_ECO_REG_RW_IA,
1190			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1191}
1192
1193static void
1194kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1195{
1196	gen9_gt_workarounds_init(gt, wal);
1197
1198	/* WaDisableDynamicCreditSharing:kbl */
1199	if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1200		wa_write_or(wal,
1201			    GAMT_CHKN_BIT_REG,
1202			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1203
1204	/* WaDisableGafsUnitClkGating:kbl */
1205	wa_write_or(wal,
1206		    GEN7_UCGCTL4,
1207		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1208
1209	/* WaInPlaceDecompressionHang:kbl */
1210	wa_write_or(wal,
1211		    GEN9_GAMT_ECO_REG_RW_IA,
1212		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1213}
1214
1215static void
1216glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1217{
1218	gen9_gt_workarounds_init(gt, wal);
1219}
1220
1221static void
1222cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1223{
1224	gen9_gt_workarounds_init(gt, wal);
1225
1226	/* WaDisableGafsUnitClkGating:cfl */
1227	wa_write_or(wal,
1228		    GEN7_UCGCTL4,
1229		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1230
1231	/* WaInPlaceDecompressionHang:cfl */
1232	wa_write_or(wal,
1233		    GEN9_GAMT_ECO_REG_RW_IA,
1234		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1235}
1236
1237static void __set_mcr_steering(struct i915_wa_list *wal,
1238			       i915_reg_t steering_reg,
1239			       unsigned int slice, unsigned int subslice)
1240{
1241	u32 mcr, mcr_mask;
1242
1243	mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1244	mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1245
1246	wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1247}
1248
1249static void debug_dump_steering(struct intel_gt *gt)
1250{
1251	struct drm_printer p = drm_dbg_printer(&gt->i915->drm, DRM_UT_DRIVER,
1252					       "MCR Steering:");
1253
1254	if (drm_debug_enabled(DRM_UT_DRIVER))
1255		intel_gt_mcr_report_steering(&p, gt, false);
1256}
1257
1258static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1259			 unsigned int slice, unsigned int subslice)
1260{
1261	__set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1262
1263	gt->default_steering.groupid = slice;
1264	gt->default_steering.instanceid = subslice;
1265
1266	debug_dump_steering(gt);
1267}
1268
1269static void
1270icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1271{
1272	const struct sseu_dev_info *sseu = &gt->info.sseu;
1273	unsigned int subslice;
1274
1275	GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1276	GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1277
1278	/*
1279	 * Although a platform may have subslices, we need to always steer
1280	 * reads to the lowest instance that isn't fused off.  When Render
1281	 * Power Gating is enabled, grabbing forcewake will only power up a
1282	 * single subslice (the "minconfig") if there isn't a real workload
1283	 * that needs to be run; this means that if we steer register reads to
1284	 * one of the higher subslices, we run the risk of reading back 0's or
1285	 * random garbage.
1286	 */
1287	subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1288
1289	/*
1290	 * If the subslice we picked above also steers us to a valid L3 bank,
1291	 * then we can just rely on the default steering and won't need to
1292	 * worry about explicitly re-steering L3BANK reads later.
1293	 */
1294	if (gt->info.l3bank_mask & BIT(subslice))
1295		gt->steering_table[L3BANK] = NULL;
1296
1297	__add_mcr_wa(gt, wal, 0, subslice);
1298}
1299
1300static void
1301xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1302{
1303	const struct sseu_dev_info *sseu = &gt->info.sseu;
1304	unsigned long slice, subslice = 0, slice_mask = 0;
1305	u32 lncf_mask = 0;
1306	int i;
1307
1308	/*
1309	 * On Xe_HP the steering increases in complexity. There are now several
1310	 * more units that require steering and we're not guaranteed to be able
1311	 * to find a common setting for all of them. These are:
1312	 * - GSLICE (fusable)
1313	 * - DSS (sub-unit within gslice; fusable)
1314	 * - L3 Bank (fusable)
1315	 * - MSLICE (fusable)
1316	 * - LNCF (sub-unit within mslice; always present if mslice is present)
1317	 *
1318	 * We'll do our default/implicit steering based on GSLICE (in the
1319	 * sliceid field) and DSS (in the subsliceid field).  If we can
1320	 * find overlap between the valid MSLICE and/or LNCF values with
1321	 * a suitable GSLICE, then we can just re-use the default value and
1322	 * skip and explicit steering at runtime.
1323	 *
1324	 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1325	 * a valid sliceid value.  DSS steering is the only type of steering
1326	 * that utilizes the 'subsliceid' bits.
1327	 *
1328	 * Also note that, even though the steering domain is called "GSlice"
1329	 * and it is encoded in the register using the gslice format, the spec
1330	 * says that the combined (geometry | compute) fuse should be used to
1331	 * select the steering.
1332	 */
1333
1334	/* Find the potential gslice candidates */
1335	slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1336						       GEN_DSS_PER_GSLICE);
1337
1338	/*
1339	 * Find the potential LNCF candidates.  Either LNCF within a valid
1340	 * mslice is fine.
1341	 */
1342	for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
1343		lncf_mask |= (0x3 << (i * 2));
1344
1345	/*
1346	 * Are there any sliceid values that work for both GSLICE and LNCF
1347	 * steering?
1348	 */
1349	if (slice_mask & lncf_mask) {
1350		slice_mask &= lncf_mask;
1351		gt->steering_table[LNCF] = NULL;
1352	}
1353
1354	/* How about sliceid values that also work for MSLICE steering? */
1355	if (slice_mask & gt->info.mslice_mask) {
1356		slice_mask &= gt->info.mslice_mask;
1357		gt->steering_table[MSLICE] = NULL;
1358	}
1359
1360	slice = __ffs(slice_mask);
1361	subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1362		GEN_DSS_PER_GSLICE;
1363
1364	__add_mcr_wa(gt, wal, slice, subslice);
1365
1366	/*
1367	 * SQIDI ranges are special because they use different steering
1368	 * registers than everything else we work with.  On XeHP SDV and
1369	 * DG2-G10, any value in the steering registers will work fine since
1370	 * all instances are present, but DG2-G11 only has SQIDI instances at
1371	 * ID's 2 and 3, so we need to steer to one of those.  For simplicity
1372	 * we'll just steer to a hardcoded "2" since that value will work
1373	 * everywhere.
1374	 */
1375	__set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1376	__set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1377
1378	/*
1379	 * On DG2, GAM registers have a dedicated steering control register
1380	 * and must always be programmed to a hardcoded groupid of "1."
1381	 */
1382	if (IS_DG2(gt->i915))
1383		__set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1384}
1385
1386static void
1387icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1388{
1389	struct drm_i915_private *i915 = gt->i915;
1390
1391	icl_wa_init_mcr(gt, wal);
1392
1393	/* WaModifyGamTlbPartitioning:icl */
1394	wa_write_clr_set(wal,
1395			 GEN11_GACB_PERF_CTRL,
1396			 GEN11_HASH_CTRL_MASK,
1397			 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1398
1399	/* Wa_1405766107:icl
1400	 * Formerly known as WaCL2SFHalfMaxAlloc
1401	 */
1402	wa_write_or(wal,
1403		    GEN11_LSN_UNSLCVC,
1404		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1405		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1406
1407	/* Wa_220166154:icl
1408	 * Formerly known as WaDisCtxReload
1409	 */
1410	wa_write_or(wal,
1411		    GEN8_GAMW_ECO_DEV_RW_IA,
1412		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1413
1414	/* Wa_1406463099:icl
1415	 * Formerly known as WaGamTlbPendError
1416	 */
1417	wa_write_or(wal,
1418		    GAMT_CHKN_BIT_REG,
1419		    GAMT_CHKN_DISABLE_L3_COH_PIPE);
1420
1421	/*
1422	 * Wa_1408615072:icl,ehl  (vsunit)
1423	 * Wa_1407596294:icl,ehl  (hsunit)
1424	 */
1425	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1426		    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1427
1428	/* Wa_1407352427:icl,ehl */
1429	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1430		    PSDUNIT_CLKGATE_DIS);
1431
1432	/* Wa_1406680159:icl,ehl */
1433	wa_mcr_write_or(wal,
1434			GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1435			GWUNIT_CLKGATE_DIS);
1436
1437	/* Wa_1607087056:icl,ehl,jsl */
1438	if (IS_ICELAKE(i915) ||
1439		((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) &&
1440		IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)))
1441		wa_write_or(wal,
1442			    GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1443			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1444
1445	/*
1446	 * This is not a documented workaround, but rather an optimization
1447	 * to reduce sampler power.
1448	 */
1449	wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1450}
1451
1452/*
1453 * Though there are per-engine instances of these registers,
1454 * they retain their value through engine resets and should
1455 * only be provided on the GT workaround list rather than
1456 * the engine-specific workaround list.
1457 */
1458static void
1459wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1460{
1461	struct intel_engine_cs *engine;
1462	int id;
1463
1464	for_each_engine(engine, gt, id) {
1465		if (engine->class != VIDEO_DECODE_CLASS ||
1466		    (engine->instance % 2))
1467			continue;
1468
1469		wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1470			    IECPUNIT_CLKGATE_DIS);
1471	}
1472}
1473
1474static void
1475gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1476{
1477	icl_wa_init_mcr(gt, wal);
1478
1479	/* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1480	wa_14011060649(gt, wal);
1481
1482	/* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1483	wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1484
1485	/*
1486	 * Wa_14015795083
1487	 *
1488	 * Firmware on some gen12 platforms locks the MISCCPCTL register,
1489	 * preventing i915 from modifying it for this workaround.  Skip the
1490	 * readback verification for this workaround on debug builds; if the
1491	 * workaround doesn't stick due to firmware behavior, it's not an error
1492	 * that we want CI to flag.
1493	 */
1494	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1495	       0, 0, false);
1496}
1497
1498static void
1499dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1500{
1501	gen12_gt_workarounds_init(gt, wal);
1502
1503	/* Wa_1409420604:dg1 */
1504	wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2,
1505			CPSSUNIT_CLKGATE_DIS);
1506
1507	/* Wa_1408615072:dg1 */
1508	/* Empirical testing shows this register is unaffected by engine reset. */
1509	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL);
1510}
1511
1512static void
1513dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1514{
1515	xehp_init_mcr(gt, wal);
1516
1517	/* Wa_14011060649:dg2 */
1518	wa_14011060649(gt, wal);
1519
1520	if (IS_DG2_G10(gt->i915)) {
1521		/* Wa_22010523718:dg2 */
1522		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1523			    CG3DDISCFEG_CLKGATE_DIS);
1524
1525		/* Wa_14011006942:dg2 */
1526		wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1527				DSS_ROUTER_CLKGATE_DIS);
1528	}
1529
1530	/* Wa_14014830051:dg2 */
1531	wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1532
1533	/*
1534	 * Wa_14015795083
1535	 * Skip verification for possibly locked register.
1536	 */
1537	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1538	       0, 0, false);
1539
1540	/* Wa_18018781329 */
1541	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1542	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1543	wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1544	wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1545
1546	/* Wa_1509235366:dg2 */
1547	wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1548			INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1549
1550	/* Wa_14010648519:dg2 */
1551	wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1552}
1553
1554static void
1555xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1556{
1557	/* Wa_14018575942 / Wa_18018781329 */
1558	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1559	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1560
1561	/* Wa_22016670082 */
1562	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1563
1564	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1565	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
1566		/* Wa_14014830051 */
1567		wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1568
1569		/* Wa_14015795083 */
1570		wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1571	}
1572
1573	/*
1574	 * Unlike older platforms, we no longer setup implicit steering here;
1575	 * all MCR accesses are explicitly steered.
1576	 */
1577	debug_dump_steering(gt);
1578}
1579
1580static void
1581wa_16021867713(struct intel_gt *gt, struct i915_wa_list *wal)
1582{
1583	struct intel_engine_cs *engine;
1584	int id;
1585
1586	for_each_engine(engine, gt, id)
1587		if (engine->class == VIDEO_DECODE_CLASS)
1588			wa_write_or(wal, VDBOX_CGCTL3F1C(engine->mmio_base),
1589				    MFXPIPE_CLKGATE_DIS);
1590}
1591
1592static void
1593xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1594{
1595	wa_16021867713(gt, wal);
1596
1597	/*
1598	 * Wa_14018778641
1599	 * Wa_18018781329
1600	 *
1601	 * Note that although these registers are MCR on the primary
1602	 * GT, the media GT's versions are regular singleton registers.
1603	 */
1604	wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);
1605
1606	/*
1607	 * Wa_14018575942
1608	 *
1609	 * Issue is seen on media KPI test running on VDBOX engine
1610	 * especially VP9 encoding WLs
1611	 */
1612	wa_write_or(wal, XELPMP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1613
1614	/* Wa_22016670082 */
1615	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1616
1617	debug_dump_steering(gt);
1618}
1619
1620/*
1621 * The bspec performance guide has recommended MMIO tuning settings.  These
1622 * aren't truly "workarounds" but we want to program them through the
1623 * workaround infrastructure to make sure they're (re)applied at the proper
1624 * times.
1625 *
1626 * The programming in this function is for settings that persist through
1627 * engine resets and also are not part of any engine's register state context.
1628 * I.e., settings that only need to be re-applied in the event of a full GT
1629 * reset.
1630 */
1631static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal)
1632{
1633	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) {
1634		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1635		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1636	}
1637
1638	if (IS_DG2(gt->i915)) {
1639		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1640		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1641	}
1642}
1643
1644static void
1645gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1646{
1647	struct drm_i915_private *i915 = gt->i915;
1648
1649	gt_tuning_settings(gt, wal);
1650
1651	if (gt->type == GT_MEDIA) {
1652		if (MEDIA_VER_FULL(i915) == IP_VER(13, 0))
1653			xelpmp_gt_workarounds_init(gt, wal);
1654		else
1655			MISSING_CASE(MEDIA_VER_FULL(i915));
1656
1657		return;
1658	}
1659
1660	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)))
1661		xelpg_gt_workarounds_init(gt, wal);
1662	else if (IS_DG2(i915))
1663		dg2_gt_workarounds_init(gt, wal);
1664	else if (IS_DG1(i915))
1665		dg1_gt_workarounds_init(gt, wal);
1666	else if (GRAPHICS_VER(i915) == 12)
1667		gen12_gt_workarounds_init(gt, wal);
1668	else if (GRAPHICS_VER(i915) == 11)
1669		icl_gt_workarounds_init(gt, wal);
1670	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1671		cfl_gt_workarounds_init(gt, wal);
1672	else if (IS_GEMINILAKE(i915))
1673		glk_gt_workarounds_init(gt, wal);
1674	else if (IS_KABYLAKE(i915))
1675		kbl_gt_workarounds_init(gt, wal);
1676	else if (IS_BROXTON(i915))
1677		gen9_gt_workarounds_init(gt, wal);
1678	else if (IS_SKYLAKE(i915))
1679		skl_gt_workarounds_init(gt, wal);
1680	else if (IS_HASWELL(i915))
1681		hsw_gt_workarounds_init(gt, wal);
1682	else if (IS_VALLEYVIEW(i915))
1683		vlv_gt_workarounds_init(gt, wal);
1684	else if (IS_IVYBRIDGE(i915))
1685		ivb_gt_workarounds_init(gt, wal);
1686	else if (GRAPHICS_VER(i915) == 6)
1687		snb_gt_workarounds_init(gt, wal);
1688	else if (GRAPHICS_VER(i915) == 5)
1689		ilk_gt_workarounds_init(gt, wal);
1690	else if (IS_G4X(i915))
1691		g4x_gt_workarounds_init(gt, wal);
1692	else if (GRAPHICS_VER(i915) == 4)
1693		gen4_gt_workarounds_init(gt, wal);
1694	else if (GRAPHICS_VER(i915) <= 8)
1695		;
1696	else
1697		MISSING_CASE(GRAPHICS_VER(i915));
1698}
1699
1700void intel_gt_init_workarounds(struct intel_gt *gt)
1701{
1702	struct i915_wa_list *wal = &gt->wa_list;
1703
1704	wa_init_start(wal, gt, "GT", "global");
1705	gt_init_workarounds(gt, wal);
1706	wa_init_finish(wal);
1707}
1708
1709static bool
1710wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur,
1711	  const char *name, const char *from)
1712{
1713	if ((cur ^ wa->set) & wa->read) {
1714		gt_err(gt,
1715		       "%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1716		       name, from, i915_mmio_reg_offset(wa->reg),
1717		       cur, cur & wa->read, wa->set & wa->read);
1718
1719		return false;
1720	}
1721
1722	return true;
1723}
1724
1725static void wa_list_apply(const struct i915_wa_list *wal)
1726{
1727	struct intel_gt *gt = wal->gt;
1728	struct intel_uncore *uncore = gt->uncore;
1729	enum forcewake_domains fw;
1730	unsigned long flags;
1731	struct i915_wa *wa;
1732	unsigned int i;
1733
1734	if (!wal->count)
1735		return;
1736
1737	fw = wal_get_fw_for_rmw(uncore, wal);
1738
1739	intel_gt_mcr_lock(gt, &flags);
1740	spin_lock(&uncore->lock);
1741	intel_uncore_forcewake_get__locked(uncore, fw);
1742
1743	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1744		u32 val, old = 0;
1745
1746		/* open-coded rmw due to steering */
1747		if (wa->clr)
1748			old = wa->is_mcr ?
1749				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1750				intel_uncore_read_fw(uncore, wa->reg);
1751		val = (old & ~wa->clr) | wa->set;
1752		if (val != old || !wa->clr) {
1753			if (wa->is_mcr)
1754				intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1755			else
1756				intel_uncore_write_fw(uncore, wa->reg, val);
1757		}
1758
1759		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1760			u32 val = wa->is_mcr ?
1761				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1762				intel_uncore_read_fw(uncore, wa->reg);
1763
1764			wa_verify(gt, wa, val, wal->name, "application");
1765		}
1766	}
1767
1768	intel_uncore_forcewake_put__locked(uncore, fw);
1769	spin_unlock(&uncore->lock);
1770	intel_gt_mcr_unlock(gt, flags);
1771}
1772
1773void intel_gt_apply_workarounds(struct intel_gt *gt)
1774{
1775	wa_list_apply(&gt->wa_list);
1776}
1777
1778static bool wa_list_verify(struct intel_gt *gt,
1779			   const struct i915_wa_list *wal,
1780			   const char *from)
1781{
1782	struct intel_uncore *uncore = gt->uncore;
1783	struct i915_wa *wa;
1784	enum forcewake_domains fw;
1785	unsigned long flags;
1786	unsigned int i;
1787	bool ok = true;
1788
1789	fw = wal_get_fw_for_rmw(uncore, wal);
1790
1791	intel_gt_mcr_lock(gt, &flags);
1792	spin_lock(&uncore->lock);
1793	intel_uncore_forcewake_get__locked(uncore, fw);
1794
1795	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1796		ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
1797				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1798				intel_uncore_read_fw(uncore, wa->reg),
1799				wal->name, from);
1800
1801	intel_uncore_forcewake_put__locked(uncore, fw);
1802	spin_unlock(&uncore->lock);
1803	intel_gt_mcr_unlock(gt, flags);
1804
1805	return ok;
1806}
1807
1808bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1809{
1810	return wa_list_verify(gt, &gt->wa_list, from);
1811}
1812
1813__maybe_unused
1814static bool is_nonpriv_flags_valid(u32 flags)
1815{
1816	/* Check only valid flag bits are set */
1817	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1818		return false;
1819
1820	/* NB: Only 3 out of 4 enum values are valid for access field */
1821	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1822	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1823		return false;
1824
1825	return true;
1826}
1827
1828static void
1829whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1830{
1831	struct i915_wa wa = {
1832		.reg = reg
1833	};
1834
1835	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1836		return;
1837
1838	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1839		return;
1840
1841	wa.reg.reg |= flags;
1842	_wa_add(wal, &wa);
1843}
1844
1845static void
1846whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1847{
1848	struct i915_wa wa = {
1849		.mcr_reg = reg,
1850		.is_mcr = 1,
1851	};
1852
1853	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1854		return;
1855
1856	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1857		return;
1858
1859	wa.mcr_reg.reg |= flags;
1860	_wa_add(wal, &wa);
1861}
1862
1863static void
1864whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1865{
1866	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1867}
1868
1869static void
1870whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1871{
1872	whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1873}
1874
1875static void gen9_whitelist_build(struct i915_wa_list *w)
1876{
1877	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1878	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1879
1880	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1881	whitelist_reg(w, GEN8_CS_CHICKEN1);
1882
1883	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1884	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1885
1886	/* WaSendPushConstantsFromMMIO:skl,bxt */
1887	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1888}
1889
1890static void skl_whitelist_build(struct intel_engine_cs *engine)
1891{
1892	struct i915_wa_list *w = &engine->whitelist;
1893
1894	if (engine->class != RENDER_CLASS)
1895		return;
1896
1897	gen9_whitelist_build(w);
1898
1899	/* WaDisableLSQCROPERFforOCL:skl */
1900	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1901}
1902
1903static void bxt_whitelist_build(struct intel_engine_cs *engine)
1904{
1905	if (engine->class != RENDER_CLASS)
1906		return;
1907
1908	gen9_whitelist_build(&engine->whitelist);
1909}
1910
1911static void kbl_whitelist_build(struct intel_engine_cs *engine)
1912{
1913	struct i915_wa_list *w = &engine->whitelist;
1914
1915	if (engine->class != RENDER_CLASS)
1916		return;
1917
1918	gen9_whitelist_build(w);
1919
1920	/* WaDisableLSQCROPERFforOCL:kbl */
1921	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1922}
1923
1924static void glk_whitelist_build(struct intel_engine_cs *engine)
1925{
1926	struct i915_wa_list *w = &engine->whitelist;
1927
1928	if (engine->class != RENDER_CLASS)
1929		return;
1930
1931	gen9_whitelist_build(w);
1932
1933	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1934	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1935}
1936
1937static void cfl_whitelist_build(struct intel_engine_cs *engine)
1938{
1939	struct i915_wa_list *w = &engine->whitelist;
1940
1941	if (engine->class != RENDER_CLASS)
1942		return;
1943
1944	gen9_whitelist_build(w);
1945
1946	/*
1947	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1948	 *
1949	 * This covers 4 register which are next to one another :
1950	 *   - PS_INVOCATION_COUNT
1951	 *   - PS_INVOCATION_COUNT_UDW
1952	 *   - PS_DEPTH_COUNT
1953	 *   - PS_DEPTH_COUNT_UDW
1954	 */
1955	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1956			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1957			  RING_FORCE_TO_NONPRIV_RANGE_4);
1958}
1959
1960static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
1961{
1962	struct i915_wa_list *w = &engine->whitelist;
1963
1964	if (engine->class != RENDER_CLASS)
1965		whitelist_reg_ext(w,
1966				  RING_CTX_TIMESTAMP(engine->mmio_base),
1967				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1968}
1969
1970static void cml_whitelist_build(struct intel_engine_cs *engine)
1971{
1972	allow_read_ctx_timestamp(engine);
1973
1974	cfl_whitelist_build(engine);
1975}
1976
1977static void icl_whitelist_build(struct intel_engine_cs *engine)
1978{
1979	struct i915_wa_list *w = &engine->whitelist;
1980
1981	allow_read_ctx_timestamp(engine);
1982
1983	switch (engine->class) {
1984	case RENDER_CLASS:
1985		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
1986		whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
1987
1988		/* WaAllowUMDToModifySamplerMode:icl */
1989		whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
1990
1991		/* WaEnableStateCacheRedirectToCS:icl */
1992		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1993
1994		/*
1995		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
1996		 *
1997		 * This covers 4 register which are next to one another :
1998		 *   - PS_INVOCATION_COUNT
1999		 *   - PS_INVOCATION_COUNT_UDW
2000		 *   - PS_DEPTH_COUNT
2001		 *   - PS_DEPTH_COUNT_UDW
2002		 */
2003		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2004				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2005				  RING_FORCE_TO_NONPRIV_RANGE_4);
2006		break;
2007
2008	case VIDEO_DECODE_CLASS:
2009		/* hucStatusRegOffset */
2010		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
2011				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2012		/* hucUKernelHdrInfoRegOffset */
2013		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2014				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2015		/* hucStatus2RegOffset */
2016		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2017				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2018		break;
2019
2020	default:
2021		break;
2022	}
2023}
2024
2025static void tgl_whitelist_build(struct intel_engine_cs *engine)
2026{
2027	struct i915_wa_list *w = &engine->whitelist;
2028
2029	allow_read_ctx_timestamp(engine);
2030
2031	switch (engine->class) {
2032	case RENDER_CLASS:
2033		/*
2034		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2035		 * Wa_1408556865:tgl
2036		 *
2037		 * This covers 4 registers which are next to one another :
2038		 *   - PS_INVOCATION_COUNT
2039		 *   - PS_INVOCATION_COUNT_UDW
2040		 *   - PS_DEPTH_COUNT
2041		 *   - PS_DEPTH_COUNT_UDW
2042		 */
2043		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2044				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2045				  RING_FORCE_TO_NONPRIV_RANGE_4);
2046
2047		/*
2048		 * Wa_1808121037:tgl
2049		 * Wa_14012131227:dg1
2050		 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2051		 */
2052		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2053
2054		/* Wa_1806527549:tgl */
2055		whitelist_reg(w, HIZ_CHICKEN);
2056
2057		/* Required by recommended tuning setting (not a workaround) */
2058		whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3);
2059
2060		break;
2061	default:
2062		break;
2063	}
2064}
2065
2066static void dg2_whitelist_build(struct intel_engine_cs *engine)
2067{
2068	struct i915_wa_list *w = &engine->whitelist;
2069
2070	switch (engine->class) {
2071	case RENDER_CLASS:
2072		/* Required by recommended tuning setting (not a workaround) */
2073		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2074		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2075		break;
2076	default:
2077		break;
2078	}
2079}
2080
2081static void xelpg_whitelist_build(struct intel_engine_cs *engine)
2082{
2083	struct i915_wa_list *w = &engine->whitelist;
2084
2085	switch (engine->class) {
2086	case RENDER_CLASS:
2087		/* Required by recommended tuning setting (not a workaround) */
2088		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2089		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2090		break;
2091	default:
2092		break;
2093	}
2094}
2095
2096void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2097{
2098	struct drm_i915_private *i915 = engine->i915;
2099	struct i915_wa_list *w = &engine->whitelist;
2100
2101	wa_init_start(w, engine->gt, "whitelist", engine->name);
2102
2103	if (engine->gt->type == GT_MEDIA)
2104		; /* none yet */
2105	else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
2106		xelpg_whitelist_build(engine);
2107	else if (IS_DG2(i915))
2108		dg2_whitelist_build(engine);
2109	else if (GRAPHICS_VER(i915) == 12)
2110		tgl_whitelist_build(engine);
2111	else if (GRAPHICS_VER(i915) == 11)
2112		icl_whitelist_build(engine);
2113	else if (IS_COMETLAKE(i915))
2114		cml_whitelist_build(engine);
2115	else if (IS_COFFEELAKE(i915))
2116		cfl_whitelist_build(engine);
2117	else if (IS_GEMINILAKE(i915))
2118		glk_whitelist_build(engine);
2119	else if (IS_KABYLAKE(i915))
2120		kbl_whitelist_build(engine);
2121	else if (IS_BROXTON(i915))
2122		bxt_whitelist_build(engine);
2123	else if (IS_SKYLAKE(i915))
2124		skl_whitelist_build(engine);
2125	else if (GRAPHICS_VER(i915) <= 8)
2126		;
2127	else
2128		MISSING_CASE(GRAPHICS_VER(i915));
2129
2130	wa_init_finish(w);
2131}
2132
2133void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2134{
2135	const struct i915_wa_list *wal = &engine->whitelist;
2136	struct intel_uncore *uncore = engine->uncore;
2137	const u32 base = engine->mmio_base;
2138	struct i915_wa *wa;
2139	unsigned int i;
2140
2141	if (!wal->count)
2142		return;
2143
2144	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2145		intel_uncore_write(uncore,
2146				   RING_FORCE_TO_NONPRIV(base, i),
2147				   i915_mmio_reg_offset(wa->reg));
2148
2149	/* And clear the rest just in case of garbage */
2150	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2151		intel_uncore_write(uncore,
2152				   RING_FORCE_TO_NONPRIV(base, i),
2153				   i915_mmio_reg_offset(RING_NOPID(base)));
2154}
2155
2156/*
2157 * engine_fake_wa_init(), a place holder to program the registers
2158 * which are not part of an official workaround defined by the
2159 * hardware team.
2160 * Adding programming of those register inside workaround will
2161 * allow utilizing wa framework to proper application and verification.
2162 */
2163static void
2164engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2165{
2166	u8 mocs_w, mocs_r;
2167
2168	/*
2169	 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2170	 * by the command streamer when executing commands that don't have
2171	 * a way to explicitly specify a MOCS setting.  The default should
2172	 * usually reference whichever MOCS entry corresponds to uncached
2173	 * behavior, although use of a WB cached entry is recommended by the
2174	 * spec in certain circumstances on specific platforms.
2175	 */
2176	if (GRAPHICS_VER(engine->i915) >= 12) {
2177		mocs_r = engine->gt->mocs.uc_index;
2178		mocs_w = engine->gt->mocs.uc_index;
2179
2180		if (HAS_L3_CCS_READ(engine->i915) &&
2181		    engine->class == COMPUTE_CLASS) {
2182			mocs_r = engine->gt->mocs.wb_index;
2183
2184			/*
2185			 * Even on the few platforms where MOCS 0 is a
2186			 * legitimate table entry, it's never the correct
2187			 * setting to use here; we can assume the MOCS init
2188			 * just forgot to initialize wb_index.
2189			 */
2190			drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2191		}
2192
2193		wa_masked_field_set(wal,
2194				    RING_CMD_CCTL(engine->mmio_base),
2195				    CMD_CCTL_MOCS_MASK,
2196				    CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2197	}
2198}
2199
2200static void
2201rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2202{
2203	struct drm_i915_private *i915 = engine->i915;
2204	struct intel_gt *gt = engine->gt;
2205
2206	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2207	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
2208		/* Wa_22014600077 */
2209		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2210				 ENABLE_EU_COUNT_FOR_TDL_FLUSH);
2211	}
2212
2213	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2214	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2215	    IS_DG2(i915)) {
2216		/* Wa_1509727124 */
2217		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2218				 SC_DISABLE_POWER_OPTIMIZATION_EBB);
2219	}
2220
2221	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2222	    IS_DG2(i915)) {
2223		/* Wa_22012856258 */
2224		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2225				 GEN12_DISABLE_READ_SUPPRESSION);
2226	}
2227
2228	if (IS_DG2(i915)) {
2229		/*
2230		 * Wa_22010960976:dg2
2231		 * Wa_14013347512:dg2
2232		 */
2233		wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
2234				  LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2235	}
2236
2237	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) ||
2238	    IS_DG2(i915)) {
2239		/* Wa_14015150844 */
2240		wa_mcr_add(wal, XEHP_HDC_CHICKEN0, 0,
2241			   _MASKED_BIT_ENABLE(DIS_ATOMIC_CHAINING_TYPED_WRITES),
2242			   0, true);
2243	}
2244
2245	if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2246	    IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2247		/*
2248		 * Wa_1606700617:tgl,dg1,adl-p
2249		 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2250		 * Wa_14010826681:tgl,dg1,rkl,adl-p
2251		 * Wa_18019627453:dg2
2252		 */
2253		wa_masked_en(wal,
2254			     GEN9_CS_DEBUG_MODE1,
2255			     FF_DOP_CLOCK_GATE_DISABLE);
2256	}
2257
2258	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2259	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2260		/* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2261		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2262
2263		/*
2264		 * Wa_1407928979:tgl A*
2265		 * Wa_18011464164:tgl[B0+],dg1[B0+]
2266		 * Wa_22010931296:tgl[B0+],dg1[B0+]
2267		 * Wa_14010919138:rkl,dg1,adl-s,adl-p
2268		 */
2269		wa_write_or(wal, GEN7_FF_THREAD_MODE,
2270			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2271
2272		/* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2273		wa_mcr_masked_en(wal,
2274				 GEN10_SAMPLER_MODE,
2275				 ENABLE_SMALLPL);
2276	}
2277
2278	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2279	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2280		/* Wa_1409804808 */
2281		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2282				 GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2283
2284		/* Wa_14010229206 */
2285		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2286	}
2287
2288	if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
2289		/*
2290		 * Wa_1607297627
2291		 *
2292		 * On TGL and RKL there are multiple entries for this WA in the
2293		 * BSpec; some indicate this is an A0-only WA, others indicate
2294		 * it applies to all steppings so we trust the "all steppings."
2295		 */
2296		wa_masked_en(wal,
2297			     RING_PSMI_CTL(RENDER_RING_BASE),
2298			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2299			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2300	}
2301
2302	if (IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) {
2303		/*
2304		 * "Disable Repacking for Compression (masked R/W access)
2305		 *  before rendering compressed surfaces for display."
2306		 */
2307		wa_masked_en(wal, CACHE_MODE_0_GEN7,
2308			     DISABLE_REPACKING_FOR_COMPRESSION);
2309	}
2310
2311	if (GRAPHICS_VER(i915) == 11) {
2312		/* This is not an Wa. Enable for better image quality */
2313		wa_masked_en(wal,
2314			     _3D_CHICKEN3,
2315			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2316
2317		/*
2318		 * Wa_1405543622:icl
2319		 * Formerly known as WaGAPZPriorityScheme
2320		 */
2321		wa_write_or(wal,
2322			    GEN8_GARBCNTL,
2323			    GEN11_ARBITRATION_PRIO_ORDER_MASK);
2324
2325		/*
2326		 * Wa_1604223664:icl
2327		 * Formerly known as WaL3BankAddressHashing
2328		 */
2329		wa_write_clr_set(wal,
2330				 GEN8_GARBCNTL,
2331				 GEN11_HASH_CTRL_EXCL_MASK,
2332				 GEN11_HASH_CTRL_EXCL_BIT0);
2333		wa_write_clr_set(wal,
2334				 GEN11_GLBLINVL,
2335				 GEN11_BANK_HASH_ADDR_EXCL_MASK,
2336				 GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2337
2338		/*
2339		 * Wa_1405733216:icl
2340		 * Formerly known as WaDisableCleanEvicts
2341		 */
2342		wa_mcr_write_or(wal,
2343				GEN8_L3SQCREG4,
2344				GEN11_LQSC_CLEAN_EVICT_DISABLE);
2345
2346		/* Wa_1606682166:icl */
2347		wa_write_or(wal,
2348			    GEN7_SARCHKMD,
2349			    GEN7_DISABLE_SAMPLER_PREFETCH);
2350
2351		/* Wa_1409178092:icl */
2352		wa_mcr_write_clr_set(wal,
2353				     GEN11_SCRATCH2,
2354				     GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2355				     0);
2356
2357		/* WaEnable32PlaneMode:icl */
2358		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2359			     GEN11_ENABLE_32_PLANE_MODE);
2360
2361		/*
2362		 * Wa_1408767742:icl[a2..forever],ehl[all]
2363		 * Wa_1605460711:icl[a0..c0]
2364		 */
2365		wa_write_or(wal,
2366			    GEN7_FF_THREAD_MODE,
2367			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2368
2369		/* Wa_22010271021 */
2370		wa_masked_en(wal,
2371			     GEN9_CS_DEBUG_MODE1,
2372			     FF_DOP_CLOCK_GATE_DISABLE);
2373	}
2374
2375	/*
2376	 * Intel platforms that support fine-grained preemption (i.e., gen9 and
2377	 * beyond) allow the kernel-mode driver to choose between two different
2378	 * options for controlling preemption granularity and behavior.
2379	 *
2380	 * Option 1 (hardware default):
2381	 *   Preemption settings are controlled in a global manner via
2382	 *   kernel-only register CS_DEBUG_MODE1 (0x20EC).  Any granularity
2383	 *   and settings chosen by the kernel-mode driver will apply to all
2384	 *   userspace clients.
2385	 *
2386	 * Option 2:
2387	 *   Preemption settings are controlled on a per-context basis via
2388	 *   register CS_CHICKEN1 (0x2580).  CS_CHICKEN1 is saved/restored on
2389	 *   context switch and is writable by userspace (e.g., via
2390	 *   MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
2391	 *   which allows different userspace drivers/clients to select
2392	 *   different settings, or to change those settings on the fly in
2393	 *   response to runtime needs.  This option was known by name
2394	 *   "FtrPerCtxtPreemptionGranularityControl" at one time, although
2395	 *   that name is somewhat misleading as other non-granularity
2396	 *   preemption settings are also impacted by this decision.
2397	 *
2398	 * On Linux, our policy has always been to let userspace drivers
2399	 * control preemption granularity/settings (Option 2).  This was
2400	 * originally mandatory on gen9 to prevent ABI breakage (old gen9
2401	 * userspace developed before object-level preemption was enabled would
2402	 * not behave well if i915 were to go with Option 1 and enable that
2403	 * preemption in a global manner).  On gen9 each context would have
2404	 * object-level preemption disabled by default (see
2405	 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
2406	 * userspace drivers could opt-in to object-level preemption as they
2407	 * saw fit.  For post-gen9 platforms, we continue to utilize Option 2;
2408	 * even though it is no longer necessary for ABI compatibility when
2409	 * enabling a new platform, it does ensure that userspace will be able
2410	 * to implement any workarounds that show up requiring temporary
2411	 * adjustments to preemption behavior at runtime.
2412	 *
2413	 * Notes/Workarounds:
2414	 *  - Wa_14015141709:  On DG2 and early steppings of MTL,
2415	 *      CS_CHICKEN1[0] does not disable object-level preemption as
2416	 *      it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
2417	 *      using Option 1).  Effectively this means userspace is unable
2418	 *      to disable object-level preemption on these platforms/steppings
2419	 *      despite the setting here.
2420	 *
2421	 *  - Wa_16013994831:  May require that userspace program
2422	 *      CS_CHICKEN1[10] when certain runtime conditions are true.
2423	 *      Userspace requires Option 2 to be in effect for their update of
2424	 *      CS_CHICKEN1[10] to be effective.
2425	 *
2426	 * Other workarounds may appear in the future that will also require
2427	 * Option 2 behavior to allow proper userspace implementation.
2428	 */
2429	if (GRAPHICS_VER(i915) >= 9)
2430		wa_masked_en(wal,
2431			     GEN7_FF_SLICE_CS_CHICKEN1,
2432			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);
2433
2434	if (IS_SKYLAKE(i915) ||
2435	    IS_KABYLAKE(i915) ||
2436	    IS_COFFEELAKE(i915) ||
2437	    IS_COMETLAKE(i915)) {
2438		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2439		wa_write_or(wal,
2440			    GEN8_GARBCNTL,
2441			    GEN9_GAPS_TSV_CREDIT_DISABLE);
2442	}
2443
2444	if (IS_BROXTON(i915)) {
2445		/* WaDisablePooledEuLoadBalancingFix:bxt */
2446		wa_masked_en(wal,
2447			     FF_SLICE_CS_CHICKEN2,
2448			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2449	}
2450
2451	if (GRAPHICS_VER(i915) == 9) {
2452		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2453		wa_masked_en(wal,
2454			     GEN9_CSFE_CHICKEN1_RCS,
2455			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2456
2457		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2458		wa_mcr_write_or(wal,
2459				BDW_SCRATCH1,
2460				GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2461
2462		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2463		if (IS_GEN9_LP(i915))
2464			wa_mcr_write_clr_set(wal,
2465					     GEN8_L3SQCREG1,
2466					     L3_PRIO_CREDITS_MASK,
2467					     L3_GENERAL_PRIO_CREDITS(62) |
2468					     L3_HIGH_PRIO_CREDITS(2));
2469
2470		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2471		wa_mcr_write_or(wal,
2472				GEN8_L3SQCREG4,
2473				GEN8_LQSC_FLUSH_COHERENT_LINES);
2474
2475		/* Disable atomics in L3 to prevent unrecoverable hangs */
2476		wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2477				 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2478		wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
2479				     GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2480		wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
2481				     EVICTION_PERF_FIX_ENABLE, 0);
2482	}
2483
2484	if (IS_HASWELL(i915)) {
2485		/* WaSampleCChickenBitEnable:hsw */
2486		wa_masked_en(wal,
2487			     HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2488
2489		wa_masked_dis(wal,
2490			      CACHE_MODE_0_GEN7,
2491			      /* enable HiZ Raw Stall Optimization */
2492			      HIZ_RAW_STALL_OPT_DISABLE);
2493	}
2494
2495	if (IS_VALLEYVIEW(i915)) {
2496		/* WaDisableEarlyCull:vlv */
2497		wa_masked_en(wal,
2498			     _3D_CHICKEN3,
2499			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2500
2501		/*
2502		 * WaVSThreadDispatchOverride:ivb,vlv
2503		 *
2504		 * This actually overrides the dispatch
2505		 * mode for all thread types.
2506		 */
2507		wa_write_clr_set(wal,
2508				 GEN7_FF_THREAD_MODE,
2509				 GEN7_FF_SCHED_MASK,
2510				 GEN7_FF_TS_SCHED_HW |
2511				 GEN7_FF_VS_SCHED_HW |
2512				 GEN7_FF_DS_SCHED_HW);
2513
2514		/* WaPsdDispatchEnable:vlv */
2515		/* WaDisablePSDDualDispatchEnable:vlv */
2516		wa_masked_en(wal,
2517			     GEN7_HALF_SLICE_CHICKEN1,
2518			     GEN7_MAX_PS_THREAD_DEP |
2519			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2520	}
2521
2522	if (IS_IVYBRIDGE(i915)) {
2523		/* WaDisableEarlyCull:ivb */
2524		wa_masked_en(wal,
2525			     _3D_CHICKEN3,
2526			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2527
2528		if (0) { /* causes HiZ corruption on ivb:gt1 */
2529			/* enable HiZ Raw Stall Optimization */
2530			wa_masked_dis(wal,
2531				      CACHE_MODE_0_GEN7,
2532				      HIZ_RAW_STALL_OPT_DISABLE);
2533		}
2534
2535		/*
2536		 * WaVSThreadDispatchOverride:ivb,vlv
2537		 *
2538		 * This actually overrides the dispatch
2539		 * mode for all thread types.
2540		 */
2541		wa_write_clr_set(wal,
2542				 GEN7_FF_THREAD_MODE,
2543				 GEN7_FF_SCHED_MASK,
2544				 GEN7_FF_TS_SCHED_HW |
2545				 GEN7_FF_VS_SCHED_HW |
2546				 GEN7_FF_DS_SCHED_HW);
2547
2548		/* WaDisablePSDDualDispatchEnable:ivb */
2549		if (INTEL_INFO(i915)->gt == 1)
2550			wa_masked_en(wal,
2551				     GEN7_HALF_SLICE_CHICKEN1,
2552				     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2553	}
2554
2555	if (GRAPHICS_VER(i915) == 7) {
2556		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2557		wa_masked_en(wal,
2558			     RING_MODE_GEN7(RENDER_RING_BASE),
2559			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2560
2561		/* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
2562		wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
2563
2564		/*
2565		 * BSpec says this must be set, even though
2566		 * WaDisable4x2SubspanOptimization:ivb,hsw
2567		 * WaDisable4x2SubspanOptimization isn't listed for VLV.
2568		 */
2569		wa_masked_en(wal,
2570			     CACHE_MODE_1,
2571			     PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
2572
2573		/*
2574		 * BSpec recommends 8x4 when MSAA is used,
2575		 * however in practice 16x4 seems fastest.
2576		 *
2577		 * Note that PS/WM thread counts depend on the WIZ hashing
2578		 * disable bit, which we don't touch here, but it's good
2579		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2580		 */
2581		wa_masked_field_set(wal,
2582				    GEN7_GT_MODE,
2583				    GEN6_WIZ_HASHING_MASK,
2584				    GEN6_WIZ_HASHING_16x4);
2585	}
2586
2587	if (IS_GRAPHICS_VER(i915, 6, 7))
2588		/*
2589		 * We need to disable the AsyncFlip performance optimisations in
2590		 * order to use MI_WAIT_FOR_EVENT within the CS. It should
2591		 * already be programmed to '1' on all products.
2592		 *
2593		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2594		 */
2595		wa_masked_en(wal,
2596			     RING_MI_MODE(RENDER_RING_BASE),
2597			     ASYNC_FLIP_PERF_DISABLE);
2598
2599	if (GRAPHICS_VER(i915) == 6) {
2600		/*
2601		 * Required for the hardware to program scanline values for
2602		 * waiting
2603		 * WaEnableFlushTlbInvalidationMode:snb
2604		 */
2605		wa_masked_en(wal,
2606			     GFX_MODE,
2607			     GFX_TLB_INVALIDATE_EXPLICIT);
2608
2609		/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2610		wa_masked_en(wal,
2611			     _3D_CHICKEN,
2612			     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2613
2614		wa_masked_en(wal,
2615			     _3D_CHICKEN3,
2616			     /* WaStripsFansDisableFastClipPerformanceFix:snb */
2617			     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2618			     /*
2619			      * Bspec says:
2620			      * "This bit must be set if 3DSTATE_CLIP clip mode is set
2621			      * to normal and 3DSTATE_SF number of SF output attributes
2622			      * is more than 16."
2623			      */
2624			     _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2625
2626		/*
2627		 * BSpec recommends 8x4 when MSAA is used,
2628		 * however in practice 16x4 seems fastest.
2629		 *
2630		 * Note that PS/WM thread counts depend on the WIZ hashing
2631		 * disable bit, which we don't touch here, but it's good
2632		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2633		 */
2634		wa_masked_field_set(wal,
2635				    GEN6_GT_MODE,
2636				    GEN6_WIZ_HASHING_MASK,
2637				    GEN6_WIZ_HASHING_16x4);
2638
2639		/* WaDisable_RenderCache_OperationalFlush:snb */
2640		wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
2641
2642		/*
2643		 * From the Sandybridge PRM, volume 1 part 3, page 24:
2644		 * "If this bit is set, STCunit will have LRA as replacement
2645		 *  policy. [...] This bit must be reset. LRA replacement
2646		 *  policy is not supported."
2647		 */
2648		wa_masked_dis(wal,
2649			      CACHE_MODE_0,
2650			      CM0_STC_EVICT_DISABLE_LRA_SNB);
2651	}
2652
2653	if (IS_GRAPHICS_VER(i915, 4, 6))
2654		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2655		wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2656		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2657		       /* XXX bit doesn't stick on Broadwater */
2658		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2659
2660	if (GRAPHICS_VER(i915) == 4)
2661		/*
2662		 * Disable CONSTANT_BUFFER before it is loaded from the context
2663		 * image. For as it is loaded, it is executed and the stored
2664		 * address may no longer be valid, leading to a GPU hang.
2665		 *
2666		 * This imposes the requirement that userspace reload their
2667		 * CONSTANT_BUFFER on every batch, fortunately a requirement
2668		 * they are already accustomed to from before contexts were
2669		 * enabled.
2670		 */
2671		wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2672		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2673		       0 /* XXX bit doesn't stick on Broadwater */,
2674		       true);
2675}
2676
2677static void
2678xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2679{
2680	struct drm_i915_private *i915 = engine->i915;
2681
2682	/* WaKBLVECSSemaphoreWaitPoll:kbl */
2683	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2684		wa_write(wal,
2685			 RING_SEMA_WAIT_POLL(engine->mmio_base),
2686			 1);
2687	}
2688	/* Wa_16018031267, Wa_16018063123 */
2689	if (NEEDS_FASTCOLOR_BLT_WABB(engine))
2690		wa_masked_field_set(wal, ECOSKPD(engine->mmio_base),
2691				    XEHP_BLITTER_SCHEDULING_MODE_MASK,
2692				    XEHP_BLITTER_ROUND_ROBIN_MODE);
2693}
2694
2695static void
2696ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2697{
2698	/* boilerplate for any CCS engine workaround */
2699}
2700
2701/*
2702 * The bspec performance guide has recommended MMIO tuning settings.  These
2703 * aren't truly "workarounds" but we want to program them with the same
2704 * workaround infrastructure to ensure that they're automatically added to
2705 * the GuC save/restore lists, re-applied at the right times, and checked for
2706 * any conflicting programming requested by real workarounds.
2707 *
2708 * Programming settings should be added here only if their registers are not
2709 * part of an engine's register state context.  If a register is part of a
2710 * context, then any tuning settings should be programmed in an appropriate
2711 * function invoked by __intel_engine_init_ctx_wa().
2712 */
2713static void
2714add_render_compute_tuning_settings(struct intel_gt *gt,
2715				   struct i915_wa_list *wal)
2716{
2717	struct drm_i915_private *i915 = gt->i915;
2718
2719	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
2720		wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2721
2722	/*
2723	 * This tuning setting proves beneficial only on ATS-M designs; the
2724	 * default "age based" setting is optimal on regular DG2 and other
2725	 * platforms.
2726	 */
2727	if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2728		wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2729					THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2730
2731	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 55))
2732		wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
2733}
2734
2735static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2736{
2737	struct intel_gt *gt = engine->gt;
2738	u32 mode;
2739
2740	if (!IS_DG2(gt->i915))
2741		return;
2742
2743	/*
2744	 * Wa_14019159160: This workaround, along with others, leads to
2745	 * significant challenges in utilizing load balancing among the
2746	 * CCS slices. Consequently, an architectural decision has been
2747	 * made to completely disable automatic CCS load balancing.
2748	 */
2749	wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE);
2750
2751	/*
2752	 * After having disabled automatic load balancing we need to
2753	 * assign all slices to a single CCS. We will call it CCS mode 1
2754	 */
2755	mode = intel_gt_apply_ccs_mode(gt);
2756	wa_masked_en(wal, XEHP_CCS_MODE, mode);
2757}
2758
2759/*
2760 * The workarounds in this function apply to shared registers in
2761 * the general render reset domain that aren't tied to a
2762 * specific engine.  Since all render+compute engines get reset
2763 * together, and the contents of these registers are lost during
2764 * the shared render domain reset, we'll define such workarounds
2765 * here and then add them to just a single RCS or CCS engine's
2766 * workaround list (whichever engine has the XXXX flag).
2767 */
2768static void
2769general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2770{
2771	struct drm_i915_private *i915 = engine->i915;
2772	struct intel_gt *gt = engine->gt;
2773
2774	add_render_compute_tuning_settings(gt, wal);
2775
2776	if (GRAPHICS_VER(i915) >= 11) {
2777		/* This is not a Wa (although referred to as
2778		 * WaSetInidrectStateOverride in places), this allows
2779		 * applications that reference sampler states through
2780		 * the BindlessSamplerStateBaseAddress to have their
2781		 * border color relative to DynamicStateBaseAddress
2782		 * rather than BindlessSamplerStateBaseAddress.
2783		 *
2784		 * Otherwise SAMPLER_STATE border colors have to be
2785		 * copied in multiple heaps (DynamicStateBaseAddress &
2786		 * BindlessSamplerStateBaseAddress)
2787		 *
2788		 * BSpec: 46052
2789		 */
2790		wa_mcr_masked_en(wal,
2791				 GEN10_SAMPLER_MODE,
2792				 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
2793	}
2794
2795	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
2796	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) ||
2797	    IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74))) {
2798		/* Wa_14017856879 */
2799		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH);
2800
2801		/* Wa_14020495402 */
2802		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, XELPG_DISABLE_TDL_SVHS_GATING);
2803	}
2804
2805	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2806	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2807		/*
2808		 * Wa_14017066071
2809		 * Wa_14017654203
2810		 */
2811		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2812				 MTL_DISABLE_SAMPLER_SC_OOO);
2813
2814	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2815		/* Wa_22015279794 */
2816		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2817				 DISABLE_PREFETCH_INTO_IC);
2818
2819	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2820	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2821	    IS_DG2(i915)) {
2822		/* Wa_22013037850 */
2823		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2824				DISABLE_128B_EVICTION_COMMAND_UDW);
2825
2826		/* Wa_18017747507 */
2827		wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2828	}
2829
2830	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2831	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2832	    IS_DG2(i915)) {
2833		/* Wa_22014226127 */
2834		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2835	}
2836
2837	if (IS_DG2(i915)) {
2838		/* Wa_14015227452:dg2,pvc */
2839		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2840
2841		/*
2842		 * Wa_16011620976:dg2_g11
2843		 * Wa_22015475538:dg2
2844		 */
2845		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2846
2847		/* Wa_18028616096 */
2848		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, UGM_FRAGMENT_THRESHOLD_TO_3);
2849	}
2850
2851	if (IS_DG2_G11(i915)) {
2852		/*
2853		 * Wa_22012826095:dg2
2854		 * Wa_22013059131:dg2
2855		 */
2856		wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2857				     MAXREQS_PER_BANK,
2858				     REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2859
2860		/* Wa_22013059131:dg2 */
2861		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2862				FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2863
2864		/*
2865		 * Wa_22012654132
2866		 *
2867		 * Note that register 0xE420 is write-only and cannot be read
2868		 * back for verification on DG2 (due to Wa_14012342262), so
2869		 * we need to explicitly skip the readback.
2870		 */
2871		wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2872			   _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2873			   0 /* write-only, so skip validation */,
2874			   true);
2875	}
2876}
2877
2878static void
2879engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2880{
2881	if (GRAPHICS_VER(engine->i915) < 4)
2882		return;
2883
2884	engine_fake_wa_init(engine, wal);
2885
2886	/*
2887	 * These are common workarounds that just need to applied
2888	 * to a single RCS/CCS engine's workaround list since
2889	 * they're reset as part of the general render domain reset.
2890	 */
2891	if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) {
2892		general_render_compute_wa_init(engine, wal);
2893		ccs_engine_wa_mode(engine, wal);
2894	}
2895
2896	if (engine->class == COMPUTE_CLASS)
2897		ccs_engine_wa_init(engine, wal);
2898	else if (engine->class == RENDER_CLASS)
2899		rcs_engine_wa_init(engine, wal);
2900	else
2901		xcs_engine_wa_init(engine, wal);
2902}
2903
2904void intel_engine_init_workarounds(struct intel_engine_cs *engine)
2905{
2906	struct i915_wa_list *wal = &engine->wa_list;
2907
2908	wa_init_start(wal, engine->gt, "engine", engine->name);
2909	engine_init_workarounds(engine, wal);
2910	wa_init_finish(wal);
2911}
2912
2913void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
2914{
2915	wa_list_apply(&engine->wa_list);
2916}
2917
2918static const struct i915_range mcr_ranges_gen8[] = {
2919	{ .start = 0x5500, .end = 0x55ff },
2920	{ .start = 0x7000, .end = 0x7fff },
2921	{ .start = 0x9400, .end = 0x97ff },
2922	{ .start = 0xb000, .end = 0xb3ff },
2923	{ .start = 0xe000, .end = 0xe7ff },
2924	{},
2925};
2926
2927static const struct i915_range mcr_ranges_gen12[] = {
2928	{ .start =  0x8150, .end =  0x815f },
2929	{ .start =  0x9520, .end =  0x955f },
2930	{ .start =  0xb100, .end =  0xb3ff },
2931	{ .start =  0xde80, .end =  0xe8ff },
2932	{ .start = 0x24a00, .end = 0x24a7f },
2933	{},
2934};
2935
2936static const struct i915_range mcr_ranges_xehp[] = {
2937	{ .start =  0x4000, .end =  0x4aff },
2938	{ .start =  0x5200, .end =  0x52ff },
2939	{ .start =  0x5400, .end =  0x7fff },
2940	{ .start =  0x8140, .end =  0x815f },
2941	{ .start =  0x8c80, .end =  0x8dff },
2942	{ .start =  0x94d0, .end =  0x955f },
2943	{ .start =  0x9680, .end =  0x96ff },
2944	{ .start =  0xb000, .end =  0xb3ff },
2945	{ .start =  0xc800, .end =  0xcfff },
2946	{ .start =  0xd800, .end =  0xd8ff },
2947	{ .start =  0xdc00, .end =  0xffff },
2948	{ .start = 0x17000, .end = 0x17fff },
2949	{ .start = 0x24a00, .end = 0x24a7f },
2950	{},
2951};
2952
2953static bool mcr_range(struct drm_i915_private *i915, u32 offset)
2954{
2955	const struct i915_range *mcr_ranges;
2956	int i;
2957
2958	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55))
2959		mcr_ranges = mcr_ranges_xehp;
2960	else if (GRAPHICS_VER(i915) >= 12)
2961		mcr_ranges = mcr_ranges_gen12;
2962	else if (GRAPHICS_VER(i915) >= 8)
2963		mcr_ranges = mcr_ranges_gen8;
2964	else
2965		return false;
2966
2967	/*
2968	 * Registers in these ranges are affected by the MCR selector
2969	 * which only controls CPU initiated MMIO. Routing does not
2970	 * work for CS access so we cannot verify them on this path.
2971	 */
2972	for (i = 0; mcr_ranges[i].start; i++)
2973		if (offset >= mcr_ranges[i].start &&
2974		    offset <= mcr_ranges[i].end)
2975			return true;
2976
2977	return false;
2978}
2979
2980static int
2981wa_list_srm(struct i915_request *rq,
2982	    const struct i915_wa_list *wal,
2983	    struct i915_vma *vma)
2984{
2985	struct drm_i915_private *i915 = rq->i915;
2986	unsigned int i, count = 0;
2987	const struct i915_wa *wa;
2988	u32 srm, *cs;
2989
2990	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
2991	if (GRAPHICS_VER(i915) >= 8)
2992		srm++;
2993
2994	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2995		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
2996			count++;
2997	}
2998
2999	cs = intel_ring_begin(rq, 4 * count);
3000	if (IS_ERR(cs))
3001		return PTR_ERR(cs);
3002
3003	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3004		u32 offset = i915_mmio_reg_offset(wa->reg);
3005
3006		if (mcr_range(i915, offset))
3007			continue;
3008
3009		*cs++ = srm;
3010		*cs++ = offset;
3011		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3012		*cs++ = 0;
3013	}
3014	intel_ring_advance(rq, cs);
3015
3016	return 0;
3017}
3018
3019static int engine_wa_list_verify(struct intel_context *ce,
3020				 const struct i915_wa_list * const wal,
3021				 const char *from)
3022{
3023	const struct i915_wa *wa;
3024	struct i915_request *rq;
3025	struct i915_vma *vma;
3026	struct i915_gem_ww_ctx ww;
3027	unsigned int i;
3028	u32 *results;
3029	int err;
3030
3031	if (!wal->count)
3032		return 0;
3033
3034	vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3035					   wal->count * sizeof(u32));
3036	if (IS_ERR(vma))
3037		return PTR_ERR(vma);
3038
3039	intel_engine_pm_get(ce->engine);
3040	i915_gem_ww_ctx_init(&ww, false);
3041retry:
3042	err = i915_gem_object_lock(vma->obj, &ww);
3043	if (err == 0)
3044		err = intel_context_pin_ww(ce, &ww);
3045	if (err)
3046		goto err_pm;
3047
3048	err = i915_vma_pin_ww(vma, &ww, 0, 0,
3049			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3050	if (err)
3051		goto err_unpin;
3052
3053	rq = i915_request_create(ce);
3054	if (IS_ERR(rq)) {
3055		err = PTR_ERR(rq);
3056		goto err_vma;
3057	}
3058
3059	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
3060	if (err == 0)
3061		err = wa_list_srm(rq, wal, vma);
3062
3063	i915_request_get(rq);
3064	if (err)
3065		i915_request_set_error_once(rq, err);
3066	i915_request_add(rq);
3067
3068	if (err)
3069		goto err_rq;
3070
3071	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3072		err = -ETIME;
3073		goto err_rq;
3074	}
3075
3076	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3077	if (IS_ERR(results)) {
3078		err = PTR_ERR(results);
3079		goto err_rq;
3080	}
3081
3082	err = 0;
3083	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3084		if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
3085			continue;
3086
3087		if (!wa_verify(wal->gt, wa, results[i], wal->name, from))
3088			err = -ENXIO;
3089	}
3090
3091	i915_gem_object_unpin_map(vma->obj);
3092
3093err_rq:
3094	i915_request_put(rq);
3095err_vma:
3096	i915_vma_unpin(vma);
3097err_unpin:
3098	intel_context_unpin(ce);
3099err_pm:
3100	if (err == -EDEADLK) {
3101		err = i915_gem_ww_ctx_backoff(&ww);
3102		if (!err)
3103			goto retry;
3104	}
3105	i915_gem_ww_ctx_fini(&ww);
3106	intel_engine_pm_put(ce->engine);
3107	i915_vma_put(vma);
3108	return err;
3109}
3110
3111int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3112				    const char *from)
3113{
3114	return engine_wa_list_verify(engine->kernel_context,
3115				     &engine->wa_list,
3116				     from);
3117}
3118
3119#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3120#include "selftest_workarounds.c"
3121#endif