   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2014-2018 Intel Corporation
   4 */
   5
   6#include "i915_drv.h"
   7#include "i915_reg.h"
   8#include "intel_context.h"
   9#include "intel_engine_pm.h"
  10#include "intel_engine_regs.h"
  11#include "intel_gpu_commands.h"
  12#include "intel_gt.h"
  13#include "intel_gt_ccs_mode.h"
  14#include "intel_gt_mcr.h"
  15#include "intel_gt_print.h"
  16#include "intel_gt_regs.h"
  17#include "intel_ring.h"
  18#include "intel_workarounds.h"
  19
  20#include "display/intel_fbc_regs.h"
  21
  22/**
  23 * DOC: Hardware workarounds
  24 *
  25 * Hardware workarounds are register programming documented to be executed in
  26 * the driver that fall outside of the normal programming sequences for a
  27 * platform. There are some basic categories of workarounds, depending on
  28 * how/when they are applied:
  29 *
  30 * - Context workarounds: workarounds that touch registers that are
  31 *   saved/restored to/from the HW context image. The list is emitted (via Load
  32 *   Register Immediate commands) once when initializing the device and saved in
  33 *   the default context. That default context is then used on every context
  34 *   creation to have a "primed golden context", i.e. a context image that
  35 *   already contains the changes needed to all the registers.
  36 *
  37 *   Context workarounds should be implemented in the \*_ctx_workarounds_init()
  38 *   variants respective to the targeted platforms.
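      *
      *   As an illustrative sketch only (the function, register and bit names
      *   below are placeholders rather than actual definitions), a platform's
      *   hook typically looks like::
      *
      *     static void xyz_ctx_workarounds_init(struct intel_engine_cs *engine,
      *                                          struct i915_wa_list *wal)
      *     {
      *             wa_masked_en(wal, SOME_CHICKEN_REG, SOME_CHICKEN_BIT);
      *     }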
  39 *
  40 * - Engine workarounds: the list of these WAs is applied whenever the specific
  41 *   engine is reset. It's also possible that a set of engine classes share a
  42 *   common power domain and they are reset together. This happens on some
  43 *   platforms with render and compute engines. In this case (at least) one of
   44 *   them needs to keep the workaround programming: the approach taken in the
  45 *   driver is to tie those workarounds to the first compute/render engine that
  46 *   is registered.  When executing with GuC submission, engine resets are
   47 *   outside of kernel driver control, hence the list of registers involved is
   48 *   written once, on engine initialization, and then passed to GuC, which
  49 *   saves/restores their values before/after the reset takes place. See
  50 *   ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference.
  51 *
  52 *   Workarounds for registers specific to RCS and CCS should be implemented in
  53 *   rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for
  54 *   registers belonging to BCS, VCS or VECS should be implemented in
  55 *   xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
   56 *   engine's MMIO range but that are part of the common RCS/CCS reset domain
   57 *   should be implemented in general_render_compute_wa_init(). Settings
   58 *   related to CCS load balancing should be added in ccs_engine_wa_mode().
  59 *
  60 * - GT workarounds: the list of these WAs is applied whenever these registers
  61 *   revert to their default values: on GPU reset, suspend/resume [1]_, etc.
  62 *
  63 *   GT workarounds should be implemented in the \*_gt_workarounds_init()
  64 *   variants respective to the targeted platforms.
  65 *
  66 * - Register whitelist: some workarounds need to be implemented in userspace,
  67 *   but need to touch privileged registers. The whitelist in the kernel
  68 *   instructs the hardware to allow the access to happen. From the kernel side,
   69 * this is just a special case of an MMIO workaround (as we write the list of
   70 * these to-be-whitelisted registers to some special HW registers).
  71 *
  72 *   Register whitelisting should be done in the \*_whitelist_build() variants
  73 *   respective to the targeted platforms.
  74 *
  75 * - Workaround batchbuffers: buffers that get executed automatically by the
  76 *   hardware on every HW context restore. These buffers are created and
   77 *   programmed in the default context so the hardware always goes through those
   78 *   programming sequences when switching contexts. Support for workaround
   79 *   batchbuffers is enabled via these hardware mechanisms:
  80 *
  81 *   #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default
  82 *      context, pointing the hardware to jump to that location when that offset
   83 *      is reached in the context restore. The workaround batchbuffer in the
   84 *      driver currently uses this mechanism on all platforms.
  85 *
  86 *   #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context,
  87 *      pointing the hardware to a buffer to continue executing after the
  88 *      engine registers are restored in a context restore sequence. This is
  89 *      currently not used in the driver.
  90 *
  91 * - Other:  There are WAs that, due to their nature, cannot be applied from a
  92 *   central place. Those are peppered around the rest of the code, as needed.
  93 *   Workarounds related to the display IP are the main example.
  94 *
   95 * .. [1] Technically, some registers are saved & restored in the power context,
   96 *    so they survive a suspend/resume. In practice, writing them again is not
   97 *    too costly and simplifies things, so it's the approach taken in the driver.
  98 */
  99
 100static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt,
 101			  const char *name, const char *engine_name)
 102{
 103	wal->gt = gt;
 104	wal->name = name;
 105	wal->engine_name = engine_name;
 106}
 107
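     /*
      * Granularity of workaround list allocations: _wa_add() grows the list in
      * chunks of this many entries and wa_init_finish() trims the unused tail
      * once the list is complete.
      */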
 108#define WA_LIST_CHUNK (1 << 4)
 109
 110static void wa_init_finish(struct i915_wa_list *wal)
 111{
 112	/* Trim unused entries. */
 113	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
 114		struct i915_wa *list = kmemdup_array(wal->list, wal->count,
 115						     sizeof(*list), GFP_KERNEL);
 116
 117		if (list) {
 118			kfree(wal->list);
 119			wal->list = list;
 120		}
 121	}
 122
 123	if (!wal->count)
 124		return;
 125
 126	gt_dbg(wal->gt, "Initialized %u %s workarounds on %s\n",
 127	       wal->wa_count, wal->name, wal->engine_name);
 128}
 129
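     /*
      * Compute the union of forcewake domains needed to read and write every
      * register in the list, so the whole read-modify-write pass can run under
      * a single forcewake get/put.
      */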
 130static enum forcewake_domains
 131wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
 132{
 133	enum forcewake_domains fw = 0;
 134	struct i915_wa *wa;
 135	unsigned int i;
 136
 137	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
 138		fw |= intel_uncore_forcewake_for_reg(uncore,
 139						     wa->reg,
 140						     FW_REG_READ |
 141						     FW_REG_WRITE);
 142
 143	return fw;
 144}
 145
 146static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
 147{
 148	unsigned int addr = i915_mmio_reg_offset(wa->reg);
 149	struct drm_i915_private *i915 = wal->gt->i915;
 150	unsigned int start = 0, end = wal->count;
 151	const unsigned int grow = WA_LIST_CHUNK;
 152	struct i915_wa *wa_;
 153
 154	GEM_BUG_ON(!is_power_of_2(grow));
 155
 156	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
 157		struct i915_wa *list;
 158
 159		list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
 160				     GFP_KERNEL);
 161		if (!list) {
 162			drm_err(&i915->drm, "No space for workaround init!\n");
 163			return;
 164		}
 165
 166		if (wal->list) {
 167			memcpy(list, wal->list, sizeof(*wa) * wal->count);
 168			kfree(wal->list);
 169		}
 170
 171		wal->list = list;
 172	}
 173
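     	/*
     	 * Binary search for an existing entry for this register offset; if one
     	 * is found, merge the new clr/set/read bits into it instead of adding
     	 * a duplicate.
     	 */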
 174	while (start < end) {
 175		unsigned int mid = start + (end - start) / 2;
 176
 177		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
 178			start = mid + 1;
 179		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
 180			end = mid;
 181		} else {
 182			wa_ = &wal->list[mid];
 183
 184			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
 185				drm_err(&i915->drm,
 186					"Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
 187					i915_mmio_reg_offset(wa_->reg),
 188					wa_->clr, wa_->set);
 189
 190				wa_->set &= ~wa->clr;
 191			}
 192
 193			wal->wa_count++;
 194			wa_->set |= wa->set;
 195			wa_->clr |= wa->clr;
 196			wa_->read |= wa->read;
 197			return;
 198		}
 199	}
 200
 201	wal->wa_count++;
 202	wa_ = &wal->list[wal->count++];
 203	*wa_ = *wa;
 204
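     	/*
     	 * The new entry was appended at the end; bubble it down so the list
     	 * stays sorted by register offset, as required by the binary search
     	 * above.
     	 */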
 205	while (wa_-- > wal->list) {
 206		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
 207			   i915_mmio_reg_offset(wa_[1].reg));
 208		if (i915_mmio_reg_offset(wa_[1].reg) >
 209		    i915_mmio_reg_offset(wa_[0].reg))
 210			break;
 211
 212		swap(wa_[1], wa_[0]);
 213	}
 214}
 215
 216static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
 217		   u32 clear, u32 set, u32 read_mask, bool masked_reg)
 218{
 219	struct i915_wa wa = {
 220		.reg  = reg,
 221		.clr  = clear,
 222		.set  = set,
 223		.read = read_mask,
 224		.masked_reg = masked_reg,
 225	};
 226
 227	_wa_add(wal, &wa);
 228}
 229
 230static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
 231		       u32 clear, u32 set, u32 read_mask, bool masked_reg)
 232{
 233	struct i915_wa wa = {
 234		.mcr_reg = reg,
 235		.clr  = clear,
 236		.set  = set,
 237		.read = read_mask,
 238		.masked_reg = masked_reg,
 239		.is_mcr = 1,
 240	};
 241
 242	_wa_add(wal, &wa);
 243}
 244
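     /*
      * Helpers for regular (non-masked) registers: bits in "clear" are removed
      * from the current value, bits in "set" are ORed in, and clear | set is
      * used as the mask for readback verification.
      */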
 245static void
 246wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
 247{
 248	wa_add(wal, reg, clear, set, clear | set, false);
 249}
 250
 251static void
 252wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
 253{
 254	wa_mcr_add(wal, reg, clear, set, clear | set, false);
 255}
 256
 257static void
 258wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 259{
 260	wa_write_clr_set(wal, reg, ~0, set);
 261}
 262
 263static void
 264wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 265{
 266	wa_write_clr_set(wal, reg, set, set);
 267}
 268
 269static void
 270wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
 271{
 272	wa_mcr_write_clr_set(wal, reg, set, set);
 273}
 274
 275static void
 276wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
 277{
 278	wa_write_clr_set(wal, reg, clr, 0);
 279}
 280
 281static void
 282wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
 283{
 284	wa_mcr_write_clr_set(wal, reg, clr, 0);
 285}
 286
 287/*
 288 * WA operations on "masked register". A masked register has the upper 16 bits
 289 * documented as "masked" in b-spec. Its purpose is to allow writing to just a
 290 * portion of the register without a rmw: you simply write in the upper 16 bits
 291 * the mask of bits you are going to modify.
 292 *
 293 * The wa_masked_* family of functions already does the necessary operations to
 294 * calculate the mask based on the parameters passed, so user only has to
 295 * provide the lower 16 bits of that register.
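      *
      * For example, _MASKED_BIT_ENABLE(BIT(0)) expands to BIT(16) | BIT(0):
      * the upper half tells the hardware which bit is being written and the
      * lower half carries the new value of that bit.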
 296 */
 297
 298static void
 299wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 300{
 301	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
 302}
 303
 304static void
 305wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
 306{
 307	wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
 308}
 309
 310static void
 311wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 312{
 313	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
 314}
 315
 316static void
 317wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
 318{
 319	wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
 320}
 321
 322static void
 323wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
 324		    u32 mask, u32 val)
 325{
 326	wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
 327}
 328
 329static void
 330wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
 331			u32 mask, u32 val)
 332{
 333	wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
 334}
 335
 336static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
 337				      struct i915_wa_list *wal)
 338{
 339	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 340}
 341
 342static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
 343				      struct i915_wa_list *wal)
 344{
 345	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 346}
 347
 348static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
 349				      struct i915_wa_list *wal)
 350{
 351	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 352
 353	/* WaDisableAsyncFlipPerfMode:bdw,chv */
 354	wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
 355
 356	/* WaDisablePartialInstShootdown:bdw,chv */
 357	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 358			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 359
 360	/* Use Force Non-Coherent whenever executing a 3D context. This is a
 361	 * workaround for a possible hang in the unlikely event a TLB
 362	 * invalidation occurs during a PSD flush.
 363	 */
 364	/* WaForceEnableNonCoherent:bdw,chv */
 365	/* WaHdcDisableFetchWhenMasked:bdw,chv */
 366	wa_masked_en(wal, HDC_CHICKEN0,
 367		     HDC_DONOT_FETCH_MEM_WHEN_MASKED |
 368		     HDC_FORCE_NON_COHERENT);
 369
 370	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
 371	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
 372	 *  polygons in the same 8x4 pixel/sample area to be processed without
 373	 *  stalling waiting for the earlier ones to write to Hierarchical Z
 374	 *  buffer."
 375	 *
 376	 * This optimization is off by default for BDW and CHV; turn it on.
 377	 */
 378	wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
 379
 380	/* Wa4x4STCOptimizationDisable:bdw,chv */
 381	wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
 382
 383	/*
 384	 * BSpec recommends 8x4 when MSAA is used,
 385	 * however in practice 16x4 seems fastest.
 386	 *
 387	 * Note that PS/WM thread counts depend on the WIZ hashing
 388	 * disable bit, which we don't touch here, but it's good
 389	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 390	 */
 391	wa_masked_field_set(wal, GEN7_GT_MODE,
 392			    GEN6_WIZ_HASHING_MASK,
 393			    GEN6_WIZ_HASHING_16x4);
 394}
 395
 396static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
 397				     struct i915_wa_list *wal)
 398{
 399	struct drm_i915_private *i915 = engine->i915;
 400
 401	gen8_ctx_workarounds_init(engine, wal);
 402
 403	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
 404	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 405
 406	/* WaDisableDopClockGating:bdw
 407	 *
 408	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
 409	 * to disable EUTC clock gating.
 410	 */
 411	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
 412			 DOP_CLOCK_GATING_DISABLE);
 413
 414	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
 415			 GEN8_SAMPLER_POWER_BYPASS_DIS);
 416
 417	wa_masked_en(wal, HDC_CHICKEN0,
 418		     /* WaForceContextSaveRestoreNonCoherent:bdw */
 419		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 420		     /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
 421		     (INTEL_INFO(i915)->gt == 3 ? HDC_FENCE_DEST_SLM_DISABLE : 0));
 422}
 423
 424static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
 425				     struct i915_wa_list *wal)
 426{
 427	gen8_ctx_workarounds_init(engine, wal);
 428
 429	/* WaDisableThreadStallDopClockGating:chv */
 430	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 431
 432	/* Improve HiZ throughput on CHV. */
 433	wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
 434}
 435
 436static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
 437				      struct i915_wa_list *wal)
 438{
 439	struct drm_i915_private *i915 = engine->i915;
 440
 441	if (HAS_LLC(i915)) {
 442		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
 443		 *
 444		 * Must match Display Engine. See
 445		 * WaCompressedResourceDisplayNewHashMode.
 446		 */
 447		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 448			     GEN9_PBE_COMPRESSED_HASH_SELECTION);
 449		wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
 450				 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
 451	}
 452
 453	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
 454	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
 455	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 456			 FLOW_CONTROL_ENABLE |
 457			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 458
 459	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
 460	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
 461	wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
 462			 GEN9_ENABLE_YV12_BUGFIX |
 463			 GEN9_ENABLE_GPGPU_PREEMPTION);
 464
 465	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
 466	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
 467	wa_masked_en(wal, CACHE_MODE_1,
 468		     GEN8_4x4_STC_OPTIMIZATION_DISABLE |
 469		     GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
 470
 471	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
 472	wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
 473			  GEN9_CCS_TLB_PREFETCH_ENABLE);
 474
 475	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
 476	wa_masked_en(wal, HDC_CHICKEN0,
 477		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 478		     HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
 479
 480	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
 481	 * both tied to WaForceContextSaveRestoreNonCoherent
 482	 * in some hsds for skl. We keep the tie for all gen9. The
 483	 * documentation is a bit hazy and so we want to get common behaviour,
 484	 * even though there is no clear evidence we would need both on kbl/bxt.
 485	 * This area has been source of system hangs so we play it safe
 486	 * and mimic the skl regardless of what bspec says.
 487	 *
 488	 * Use Force Non-Coherent whenever executing a 3D context. This
 489	 * is a workaround for a possible hang in the unlikely event
 490	 * a TLB invalidation occurs during a PSD flush.
 491	 */
 492
 493	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
 494	wa_masked_en(wal, HDC_CHICKEN0,
 495		     HDC_FORCE_NON_COHERENT);
 496
 497	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
 498	if (IS_SKYLAKE(i915) ||
 499	    IS_KABYLAKE(i915) ||
 500	    IS_COFFEELAKE(i915) ||
 501	    IS_COMETLAKE(i915))
 502		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
 503				 GEN8_SAMPLER_POWER_BYPASS_DIS);
 504
 505	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
 506	wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
 507
 508	/*
 509	 * Supporting preemption with fine-granularity requires changes in the
 510	 * batch buffer programming. Since we can't break old userspace, we
 511	 * need to set our default preemption level to safe value. Userspace is
 512	 * still able to use more fine-grained preemption levels, since in
 513	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
 514	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
 515	 * not real HW workarounds, but merely a way to start using preemption
 516	 * while maintaining old contract with userspace.
 517	 */
 518
 519	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
 520	wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
 521
 522	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */
 523	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 524			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 525			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 526
 527	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
 528	if (IS_GEN9_LP(i915))
 529		wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
 530}
 531
 532static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
 533				struct i915_wa_list *wal)
 534{
 535	struct intel_gt *gt = engine->gt;
 536	u8 vals[3] = { 0, 0, 0 };
 537	unsigned int i;
 538
 539	for (i = 0; i < 3; i++) {
 540		u8 ss;
 541
 542		/*
 543		 * Only consider slices where one, and only one, subslice has 7
 544		 * EUs
 545		 */
 546		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
 547			continue;
 548
 549		/*
 550		 * subslice_7eu[i] != 0 (because of the check above) and
 551		 * ss_max == 4 (maximum number of subslices possible per slice)
 552		 *
 553		 * ->    0 <= ss <= 3;
 554		 */
 555		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
 556		vals[i] = 3 - ss;
 557	}
 558
 559	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
 560		return;
 561
 562	/* Tune IZ hashing. See intel_device_info_runtime_init() */
 563	wa_masked_field_set(wal, GEN7_GT_MODE,
 564			    GEN9_IZ_HASHING_MASK(2) |
 565			    GEN9_IZ_HASHING_MASK(1) |
 566			    GEN9_IZ_HASHING_MASK(0),
 567			    GEN9_IZ_HASHING(2, vals[2]) |
 568			    GEN9_IZ_HASHING(1, vals[1]) |
 569			    GEN9_IZ_HASHING(0, vals[0]));
 570}
 571
 572static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
 573				     struct i915_wa_list *wal)
 574{
 575	gen9_ctx_workarounds_init(engine, wal);
 576	skl_tune_iz_hashing(engine, wal);
 577}
 578
 579static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
 580				     struct i915_wa_list *wal)
 581{
 582	gen9_ctx_workarounds_init(engine, wal);
 583
 584	/* WaDisableThreadStallDopClockGating:bxt */
 585	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 586			 STALL_DOP_GATING_DISABLE);
 587
 588	/* WaToEnableHwFixForPushConstHWBug:bxt */
 589	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 590		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 591}
 592
 593static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
 594				     struct i915_wa_list *wal)
 595{
 596	struct drm_i915_private *i915 = engine->i915;
 597
 598	gen9_ctx_workarounds_init(engine, wal);
 599
 600	/* WaToEnableHwFixForPushConstHWBug:kbl */
 601	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
 602		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 603			     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 604
 605	/* WaDisableSbeCacheDispatchPortSharing:kbl */
 606	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
 607			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 608}
 609
 610static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
 611				     struct i915_wa_list *wal)
 612{
 613	gen9_ctx_workarounds_init(engine, wal);
 614
 615	/* WaToEnableHwFixForPushConstHWBug:glk */
 616	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 617		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 618}
 619
 620static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
 621				     struct i915_wa_list *wal)
 622{
 623	gen9_ctx_workarounds_init(engine, wal);
 624
 625	/* WaToEnableHwFixForPushConstHWBug:cfl */
 626	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 627		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 628
 629	/* WaDisableSbeCacheDispatchPortSharing:cfl */
 630	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
 631			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 632}
 633
 634static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
 635				     struct i915_wa_list *wal)
 636{
 637	/* Wa_1406697149 (WaDisableBankHangMode:icl) */
 638	wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);
 639
 640	/* WaForceEnableNonCoherent:icl
 641	 * This is not the same workaround as in early Gen9 platforms, where
 642	 * lacking this could cause system hangs, but coherency performance
 643	 * overhead is high and only a few compute workloads really need it
 644	 * (the register is whitelisted in hardware now, so UMDs can opt in
 645	 * for coherency if they have a good reason).
 646	 */
 647	wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
 648
 649	/* WaEnableFloatBlendOptimization:icl */
 650	wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
 651		   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
 652		   0 /* write-only, so skip validation */,
 653		   true);
 654
 655	/* WaDisableGPGPUMidThreadPreemption:icl */
 656	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 657			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 658			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 659
 660	/* allow headerless messages for preemptible GPGPU context */
 661	wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
 662			 GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
 663
 664	/* Wa_1604278689:icl,ehl */
 665	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
 666	wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
 667			 0,
 668			 0xFFFFFFFF);
 669
 670	/* Wa_1406306137:icl,ehl */
 671	wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
 672}
 673
 674/*
 675 * These settings aren't actually workarounds, but general tuning settings that
 676 * need to be programmed on dg2 platform.
 677 */
 678static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
 679				   struct i915_wa_list *wal)
 680{
 681	wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
 682	wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
 683			     REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
 684	wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
 685			     FF_MODE2_TDS_TIMER_128);
 686}
 687
 688static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
 689				       struct i915_wa_list *wal)
 690{
 691	struct drm_i915_private *i915 = engine->i915;
 692
 693	/*
 694	 * Wa_1409142259:tgl,dg1,adl-p
 695	 * Wa_1409347922:tgl,dg1,adl-p
 696	 * Wa_1409252684:tgl,dg1,adl-p
 697	 * Wa_1409217633:tgl,dg1,adl-p
 698	 * Wa_1409207793:tgl,dg1,adl-p
 699	 * Wa_1409178076:tgl,dg1,adl-p
 700	 * Wa_1408979724:tgl,dg1,adl-p
 701	 * Wa_14010443199:tgl,rkl,dg1,adl-p
 702	 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
 703	 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
 704	 */
 705	wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
 706		     GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
 707
 708	/* WaDisableGPGPUMidThreadPreemption:gen12 */
 709	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 710			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 711			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 712
 713	/*
 714	 * Wa_16011163337 - GS_TIMER
 715	 *
 716	 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we
 717	 * need to program it even on those that don't explicitly list that
 718	 * workaround.
 719	 *
 720	 * Note that the programming of GEN12_FF_MODE2 is further modified
 721	 * according to the FF_MODE2 guidance given by Wa_1608008084.
 722	 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
 723	 * value when read from the CPU.
 724	 *
 725	 * The default value for this register is zero for all fields.
 726	 * So instead of doing a RMW we should just write the desired values
 727	 * for TDS and GS timers. Note that since the readback can't be trusted,
 728	 * the clear mask is just set to ~0 to make sure other bits are not
 729	 * inadvertently set. For the same reason read verification is ignored.
 730	 */
 731	wa_add(wal,
 732	       GEN12_FF_MODE2,
 733	       ~0,
 734	       FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224,
 735	       0, false);
 736
 737	if (!IS_DG1(i915)) {
 738		/* Wa_1806527549 */
 739		wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
 740
 741		/* Wa_1606376872 */
 742		wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC);
 743	}
 744}
 745
 746static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
 747				     struct i915_wa_list *wal)
 748{
 749	gen12_ctx_workarounds_init(engine, wal);
 750
 751	/* Wa_1409044764 */
 752	wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
 753		      DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
 754
 755	/* Wa_22010493298 */
 756	wa_masked_en(wal, HIZ_CHICKEN,
 757		     DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
 758}
 759
 760static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
 761				     struct i915_wa_list *wal)
 762{
 763	dg2_ctx_gt_tuning_init(engine, wal);
 764
 765	/* Wa_16013271637:dg2 */
 766	wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
 767			 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
 768
 769	/* Wa_14014947963:dg2 */
 770	wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
 771
 772	/* Wa_18018764978:dg2 */
 773	wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
 774
 775	/* Wa_18019271663:dg2 */
 776	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
 777
 778	/* Wa_14019877138:dg2 */
 779	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
 780}
 781
 782static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine,
 783				     struct i915_wa_list *wal)
 784{
 785	struct intel_gt *gt = engine->gt;
 786
 787	dg2_ctx_gt_tuning_init(engine, wal);
 788
 789	/*
 790	 * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in
 791	 * gen12_emit_indirect_ctx_rcs() rather than here on some early
 792	 * steppings.
 793	 */
 794	if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
 795	      IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)))
 796		wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false);
 797}
 798
 799static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine,
 800				       struct i915_wa_list *wal)
 801{
 802	struct intel_gt *gt = engine->gt;
 803
 804	xelpg_ctx_gt_tuning_init(engine, wal);
 805
 806	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
 807	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
 808		/* Wa_14014947963 */
 809		wa_masked_field_set(wal, VF_PREEMPTION,
 810				    PREEMPTION_VERTEX_COUNT, 0x4000);
 811
 812		/* Wa_16013271637 */
 813		wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
 814				 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
 815
 816		/* Wa_18019627453 */
 817		wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS);
 818
 819		/* Wa_18018764978 */
 820		wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
 821	}
 822
 823	/* Wa_18019271663 */
 824	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
 825
 826	/* Wa_14019877138 */
 827	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
 828}
 829
 830static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
 831					 struct i915_wa_list *wal)
 832{
 833	/*
 834	 * This is a "fake" workaround defined by software to ensure we
 835	 * maintain reliable, backward-compatible behavior for userspace with
 836	 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
 837	 *
 838	 * The per-context setting of MI_MODE[12] determines whether the bits
 839	 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
 840	 * in the traditional manner or whether they should instead use a new
 841	 * tgl+ meaning that breaks backward compatibility, but allows nesting
 842	 * into 3rd-level batchbuffers.  When this new capability was first
 843	 * added in TGL, it remained off by default unless a context
 844	 * intentionally opted in to the new behavior.  However Xe_HPG now
 845	 * flips this on by default and requires that we explicitly opt out if
 846	 * we don't want the new behavior.
 847	 *
 848	 * From a SW perspective, we want to maintain the backward-compatible
 849	 * behavior for userspace, so we'll apply a fake workaround to set it
 850	 * back to the legacy behavior on platforms where the hardware default
 851	 * is to break compatibility.  At the moment there is no Linux
  852 *   userspace that utilizes third-level batchbuffers, so this will avoid
  853 *   userspace needing to make any changes; using the legacy
  854 *   meaning is the correct thing to do.  If/when we have userspace
 855	 * consumers that want to utilize third-level batch nesting, we can
 856	 * provide a context parameter to allow them to opt-in.
 857	 */
 858	wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
 859}
 860
 861static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
 862				   struct i915_wa_list *wal)
 863{
 864	u8 mocs;
 865
 866	/*
  867	 * Some blitter commands do not have a field for MOCS; those
  868	 * commands use the MOCS index pointed to by BLIT_CCTL.
  869	 * BLIT_CCTL registers need to be programmed to un-cached.
 870	 */
 871	if (engine->class == COPY_ENGINE_CLASS) {
 872		mocs = engine->gt->mocs.uc_index;
 873		wa_write_clr_set(wal,
 874				 BLIT_CCTL(engine->mmio_base),
 875				 BLIT_CCTL_MASK,
 876				 BLIT_CCTL_MOCS(mocs, mocs));
 877	}
 878}
 879
 880/*
  881 * gen12_ctx_gt_fake_wa_init() doesn't program an official workaround
  882 * defined by the hardware team, but rather general context registers.
  883 * Adding that context register programming to the context workaround list
  884 * allows us to use the wa framework for proper application and validation.
 885 */
 886static void
 887gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
 888			  struct i915_wa_list *wal)
 889{
 890	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 891		fakewa_disable_nestedbb_mode(engine, wal);
 892
 893	gen12_ctx_gt_mocs_init(engine, wal);
 894}
 895
 896static void
 897__intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
 898			   struct i915_wa_list *wal,
 899			   const char *name)
 900{
 901	struct drm_i915_private *i915 = engine->i915;
 902
 903	wa_init_start(wal, engine->gt, name, engine->name);
 904
 905	/* Applies to all engines */
 906	/*
  907	 * Fake workarounds are not actual workarounds but programming of
  908	 * context registers using the workaround framework.
 909	 */
 910	if (GRAPHICS_VER(i915) >= 12)
 911		gen12_ctx_gt_fake_wa_init(engine, wal);
 912
 913	if (engine->class != RENDER_CLASS)
 914		goto done;
 915
 916	if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
 917		xelpg_ctx_workarounds_init(engine, wal);
 918	else if (IS_DG2(i915))
 919		dg2_ctx_workarounds_init(engine, wal);
 920	else if (IS_DG1(i915))
 921		dg1_ctx_workarounds_init(engine, wal);
 922	else if (GRAPHICS_VER(i915) == 12)
 923		gen12_ctx_workarounds_init(engine, wal);
 924	else if (GRAPHICS_VER(i915) == 11)
 925		icl_ctx_workarounds_init(engine, wal);
 926	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
 927		cfl_ctx_workarounds_init(engine, wal);
 928	else if (IS_GEMINILAKE(i915))
 929		glk_ctx_workarounds_init(engine, wal);
 930	else if (IS_KABYLAKE(i915))
 931		kbl_ctx_workarounds_init(engine, wal);
 932	else if (IS_BROXTON(i915))
 933		bxt_ctx_workarounds_init(engine, wal);
 934	else if (IS_SKYLAKE(i915))
 935		skl_ctx_workarounds_init(engine, wal);
 936	else if (IS_CHERRYVIEW(i915))
 937		chv_ctx_workarounds_init(engine, wal);
 938	else if (IS_BROADWELL(i915))
 939		bdw_ctx_workarounds_init(engine, wal);
 940	else if (GRAPHICS_VER(i915) == 7)
 941		gen7_ctx_workarounds_init(engine, wal);
 942	else if (GRAPHICS_VER(i915) == 6)
 943		gen6_ctx_workarounds_init(engine, wal);
 944	else if (GRAPHICS_VER(i915) < 8)
 945		;
 946	else
 947		MISSING_CASE(GRAPHICS_VER(i915));
 948
 949done:
 950	wa_init_finish(wal);
 951}
 952
 953void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
 954{
 955	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
 956}
 957
 958int intel_engine_emit_ctx_wa(struct i915_request *rq)
 959{
 960	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
 961	struct intel_uncore *uncore = rq->engine->uncore;
 962	enum forcewake_domains fw;
 963	unsigned long flags;
 964	struct i915_wa *wa;
 965	unsigned int i;
 966	u32 *cs;
 967	int ret;
 968
 969	if (wal->count == 0)
 970		return 0;
 971
 972	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
 973	if (ret)
 974		return ret;
 975
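     	/*
     	 * Each workaround takes one (register, value) dword pair in the LRI;
     	 * the extra 2 dwords cover the LRI header and the trailing MI_NOOP,
     	 * and the additional 4 dwords are for the Wa_14019789679
     	 * 3DSTATE_MESH_CONTROL emission below.
     	 */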
 976	if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
 977	     IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS)
 978		cs = intel_ring_begin(rq, (wal->count * 2 + 6));
 979	else
 980		cs = intel_ring_begin(rq, (wal->count * 2 + 2));
 981
 982	if (IS_ERR(cs))
 983		return PTR_ERR(cs);
 984
 985	fw = wal_get_fw_for_rmw(uncore, wal);
 986
 987	intel_gt_mcr_lock(wal->gt, &flags);
 988	spin_lock(&uncore->lock);
 989	intel_uncore_forcewake_get__locked(uncore, fw);
 990
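     	/*
     	 * Emit the whole list as a single MI_LOAD_REGISTER_IMM: one
     	 * (register, value) pair per workaround.
     	 */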
 991	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
 992	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
 993		u32 val;
 994
 995		/* Skip reading the register if it's not really needed */
 996		if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) {
 997			val = wa->set;
 998		} else {
 999			val = wa->is_mcr ?
1000				intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) :
1001				intel_uncore_read_fw(uncore, wa->reg);
1002			val &= ~wa->clr;
1003			val |= wa->set;
1004		}
1005
1006		*cs++ = i915_mmio_reg_offset(wa->reg);
1007		*cs++ = val;
1008	}
1009	*cs++ = MI_NOOP;
1010
1011	/* Wa_14019789679 */
1012	if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
1013	     IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS) {
1014		*cs++ = CMD_3DSTATE_MESH_CONTROL;
1015		*cs++ = 0;
1016		*cs++ = 0;
1017		*cs++ = MI_NOOP;
1018	}
1019
1020	intel_uncore_forcewake_put__locked(uncore, fw);
1021	spin_unlock(&uncore->lock);
1022	intel_gt_mcr_unlock(wal->gt, flags);
1023
1024	intel_ring_advance(rq, cs);
1025
1026	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
1027	if (ret)
1028		return ret;
1029
1030	return 0;
1031}
1032
1033static void
1034gen4_gt_workarounds_init(struct intel_gt *gt,
1035			 struct i915_wa_list *wal)
1036{
1037	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
1038	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
1039}
1040
1041static void
1042g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1043{
1044	gen4_gt_workarounds_init(gt, wal);
1045
1046	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
1047	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
1048}
1049
1050static void
1051ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1052{
1053	g4x_gt_workarounds_init(gt, wal);
1054
1055	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
1056}
1057
1058static void
1059snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1060{
1061}
1062
1063static void
1064ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1065{
1066	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
1067	wa_masked_dis(wal,
1068		      GEN7_COMMON_SLICE_CHICKEN1,
1069		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
1070
1071	/* WaApplyL3ControlAndL3ChickenMode:ivb */
1072	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
1073	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
1074
1075	/* WaForceL3Serialization:ivb */
1076	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1077}
1078
1079static void
1080vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1081{
1082	/* WaForceL3Serialization:vlv */
1083	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1084
1085	/*
1086	 * WaIncreaseL3CreditsForVLVB0:vlv
1087	 * This is the hardware default actually.
1088	 */
1089	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
1090}
1091
1092static void
1093hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1094{
1095	/* L3 caching of data atomics doesn't work -- disable it. */
1096	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
1097
1098	wa_add(wal,
1099	       HSW_ROW_CHICKEN3, 0,
1100	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
1101	       0 /* XXX does this reg exist? */, true);
1102
1103	/* WaVSRefCountFullforceMissDisable:hsw */
1104	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
1105}
1106
1107static void
1108gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1109{
1110	const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
1111	unsigned int slice, subslice;
1112	u32 mcr, mcr_mask;
1113
1114	GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
1115
1116	/*
1117	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1118	 * Before any MMIO read into slice/subslice specific registers, MCR
1119	 * packet control register needs to be programmed to point to any
1120	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
 1121	 * This means each subsequent MMIO read will be forwarded to a
 1122	 * specific s/ss combination, but this is OK since these registers
 1123	 * are consistent across s/ss in almost all cases. On the rare
 1124	 * occasions, such as INSTDONE, where this value is dependent
1125	 * on s/ss combo, the read should be done with read_subslice_reg.
1126	 */
1127	slice = ffs(sseu->slice_mask) - 1;
1128	GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1129	subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1130	GEM_BUG_ON(!subslice);
1131	subslice--;
1132
1133	/*
1134	 * We use GEN8_MCR..() macros to calculate the |mcr| value for
1135	 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
1136	 */
1137	mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1138	mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1139
1140	drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1141
1142	wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1143}
1144
1145static void
1146gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1147{
1148	struct drm_i915_private *i915 = gt->i915;
1149
1150	/* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1151	gen9_wa_init_mcr(i915, wal);
1152
1153	/* WaDisableKillLogic:bxt,skl,kbl */
1154	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1155		wa_write_or(wal,
1156			    GAM_ECOCHK,
1157			    ECOCHK_DIS_TLB);
1158
1159	if (HAS_LLC(i915)) {
1160		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1161		 *
1162		 * Must match Display Engine. See
1163		 * WaCompressedResourceDisplayNewHashMode.
1164		 */
1165		wa_write_or(wal,
1166			    MMCD_MISC_CTRL,
1167			    MMCD_PCLA | MMCD_HOTSPOT_EN);
1168	}
1169
1170	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1171	wa_write_or(wal,
1172		    GAM_ECOCHK,
1173		    BDW_DISABLE_HDC_INVALIDATION);
1174}
1175
1176static void
1177skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1178{
1179	gen9_gt_workarounds_init(gt, wal);
1180
1181	/* WaDisableGafsUnitClkGating:skl */
1182	wa_write_or(wal,
1183		    GEN7_UCGCTL4,
1184		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1185
1186	/* WaInPlaceDecompressionHang:skl */
1187	if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1188		wa_write_or(wal,
1189			    GEN9_GAMT_ECO_REG_RW_IA,
1190			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1191}
1192
1193static void
1194kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1195{
1196	gen9_gt_workarounds_init(gt, wal);
1197
1198	/* WaDisableDynamicCreditSharing:kbl */
1199	if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1200		wa_write_or(wal,
1201			    GAMT_CHKN_BIT_REG,
1202			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1203
1204	/* WaDisableGafsUnitClkGating:kbl */
1205	wa_write_or(wal,
1206		    GEN7_UCGCTL4,
1207		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1208
1209	/* WaInPlaceDecompressionHang:kbl */
1210	wa_write_or(wal,
1211		    GEN9_GAMT_ECO_REG_RW_IA,
1212		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1213}
1214
1215static void
1216glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1217{
1218	gen9_gt_workarounds_init(gt, wal);
1219}
1220
1221static void
1222cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1223{
1224	gen9_gt_workarounds_init(gt, wal);
1225
1226	/* WaDisableGafsUnitClkGating:cfl */
1227	wa_write_or(wal,
1228		    GEN7_UCGCTL4,
1229		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1230
1231	/* WaInPlaceDecompressionHang:cfl */
1232	wa_write_or(wal,
1233		    GEN9_GAMT_ECO_REG_RW_IA,
1234		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1235}
1236
1237static void __set_mcr_steering(struct i915_wa_list *wal,
1238			       i915_reg_t steering_reg,
1239			       unsigned int slice, unsigned int subslice)
1240{
1241	u32 mcr, mcr_mask;
1242
1243	mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1244	mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1245
1246	wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1247}
1248
1249static void debug_dump_steering(struct intel_gt *gt)
1250{
1251	struct drm_printer p = drm_dbg_printer(&gt->i915->drm, DRM_UT_DRIVER,
1252					       "MCR Steering:");
1253
1254	if (drm_debug_enabled(DRM_UT_DRIVER))
1255		intel_gt_mcr_report_steering(&p, gt, false);
1256}
1257
1258static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1259			 unsigned int slice, unsigned int subslice)
1260{
1261	__set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1262
1263	gt->default_steering.groupid = slice;
1264	gt->default_steering.instanceid = subslice;
1265
1266	debug_dump_steering(gt);
1267}
1268
1269static void
1270icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1271{
1272	const struct sseu_dev_info *sseu = &gt->info.sseu;
1273	unsigned int subslice;
1274
1275	GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1276	GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1277
1278	/*
1279	 * Although a platform may have subslices, we need to always steer
1280	 * reads to the lowest instance that isn't fused off.  When Render
1281	 * Power Gating is enabled, grabbing forcewake will only power up a
1282	 * single subslice (the "minconfig") if there isn't a real workload
1283	 * that needs to be run; this means that if we steer register reads to
1284	 * one of the higher subslices, we run the risk of reading back 0's or
1285	 * random garbage.
1286	 */
1287	subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1288
1289	/*
1290	 * If the subslice we picked above also steers us to a valid L3 bank,
1291	 * then we can just rely on the default steering and won't need to
1292	 * worry about explicitly re-steering L3BANK reads later.
1293	 */
1294	if (gt->info.l3bank_mask & BIT(subslice))
1295		gt->steering_table[L3BANK] = NULL;
1296
1297	__add_mcr_wa(gt, wal, 0, subslice);
1298}
1299
1300static void
1301xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1302{
1303	const struct sseu_dev_info *sseu = &gt->info.sseu;
1304	unsigned long slice, subslice = 0, slice_mask = 0;
1305	u32 lncf_mask = 0;
1306	int i;
1307
1308	/*
1309	 * On Xe_HP the steering increases in complexity. There are now several
1310	 * more units that require steering and we're not guaranteed to be able
1311	 * to find a common setting for all of them. These are:
1312	 * - GSLICE (fusable)
1313	 * - DSS (sub-unit within gslice; fusable)
1314	 * - L3 Bank (fusable)
1315	 * - MSLICE (fusable)
1316	 * - LNCF (sub-unit within mslice; always present if mslice is present)
1317	 *
1318	 * We'll do our default/implicit steering based on GSLICE (in the
1319	 * sliceid field) and DSS (in the subsliceid field).  If we can
1320	 * find overlap between the valid MSLICE and/or LNCF values with
1321	 * a suitable GSLICE, then we can just re-use the default value and
 1322	 *   skip any explicit steering at runtime.
1323	 *
1324	 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1325	 * a valid sliceid value.  DSS steering is the only type of steering
1326	 * that utilizes the 'subsliceid' bits.
1327	 *
1328	 * Also note that, even though the steering domain is called "GSlice"
1329	 * and it is encoded in the register using the gslice format, the spec
1330	 * says that the combined (geometry | compute) fuse should be used to
1331	 * select the steering.
1332	 */
1333
1334	/* Find the potential gslice candidates */
1335	slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1336						       GEN_DSS_PER_GSLICE);
1337
1338	/*
1339	 * Find the potential LNCF candidates.  Either LNCF within a valid
1340	 * mslice is fine.
1341	 */
1342	for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
1343		lncf_mask |= (0x3 << (i * 2));
1344
1345	/*
1346	 * Are there any sliceid values that work for both GSLICE and LNCF
1347	 * steering?
1348	 */
1349	if (slice_mask & lncf_mask) {
1350		slice_mask &= lncf_mask;
1351		gt->steering_table[LNCF] = NULL;
1352	}
1353
1354	/* How about sliceid values that also work for MSLICE steering? */
1355	if (slice_mask & gt->info.mslice_mask) {
1356		slice_mask &= gt->info.mslice_mask;
1357		gt->steering_table[MSLICE] = NULL;
1358	}
1359
1360	slice = __ffs(slice_mask);
1361	subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1362		GEN_DSS_PER_GSLICE;
1363
1364	__add_mcr_wa(gt, wal, slice, subslice);
1365
1366	/*
1367	 * SQIDI ranges are special because they use different steering
1368	 * registers than everything else we work with.  On XeHP SDV and
1369	 * DG2-G10, any value in the steering registers will work fine since
1370	 * all instances are present, but DG2-G11 only has SQIDI instances at
1371	 * ID's 2 and 3, so we need to steer to one of those.  For simplicity
1372	 * we'll just steer to a hardcoded "2" since that value will work
1373	 * everywhere.
1374	 */
1375	__set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1376	__set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1377
1378	/*
1379	 * On DG2, GAM registers have a dedicated steering control register
1380	 * and must always be programmed to a hardcoded groupid of "1."
1381	 */
1382	if (IS_DG2(gt->i915))
1383		__set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1384}
1385
1386static void
1387icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1388{
1389	struct drm_i915_private *i915 = gt->i915;
1390
1391	icl_wa_init_mcr(gt, wal);
1392
1393	/* WaModifyGamTlbPartitioning:icl */
1394	wa_write_clr_set(wal,
1395			 GEN11_GACB_PERF_CTRL,
1396			 GEN11_HASH_CTRL_MASK,
1397			 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1398
1399	/* Wa_1405766107:icl
1400	 * Formerly known as WaCL2SFHalfMaxAlloc
1401	 */
1402	wa_write_or(wal,
1403		    GEN11_LSN_UNSLCVC,
1404		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1405		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1406
1407	/* Wa_220166154:icl
1408	 * Formerly known as WaDisCtxReload
1409	 */
1410	wa_write_or(wal,
1411		    GEN8_GAMW_ECO_DEV_RW_IA,
1412		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1413
1414	/* Wa_1406463099:icl
1415	 * Formerly known as WaGamTlbPendError
1416	 */
1417	wa_write_or(wal,
1418		    GAMT_CHKN_BIT_REG,
1419		    GAMT_CHKN_DISABLE_L3_COH_PIPE);
1420
1421	/*
1422	 * Wa_1408615072:icl,ehl  (vsunit)
1423	 * Wa_1407596294:icl,ehl  (hsunit)
1424	 */
1425	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1426		    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1427
1428	/* Wa_1407352427:icl,ehl */
1429	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1430		    PSDUNIT_CLKGATE_DIS);
1431
1432	/* Wa_1406680159:icl,ehl */
1433	wa_mcr_write_or(wal,
1434			GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1435			GWUNIT_CLKGATE_DIS);
1436
1437	/* Wa_1607087056:icl,ehl,jsl */
1438	if (IS_ICELAKE(i915) ||
1439		((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) &&
1440		IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)))
1441		wa_write_or(wal,
1442			    GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1443			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1444
1445	/*
1446	 * This is not a documented workaround, but rather an optimization
1447	 * to reduce sampler power.
1448	 */
1449	wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1450}
1451
1452/*
1453 * Though there are per-engine instances of these registers,
1454 * they retain their value through engine resets and should
1455 * only be provided on the GT workaround list rather than
1456 * the engine-specific workaround list.
1457 */
1458static void
1459wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1460{
1461	struct intel_engine_cs *engine;
1462	int id;
1463
1464	for_each_engine(engine, gt, id) {
1465		if (engine->class != VIDEO_DECODE_CLASS ||
1466		    (engine->instance % 2))
1467			continue;
1468
1469		wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1470			    IECPUNIT_CLKGATE_DIS);
1471	}
1472}
1473
1474static void
1475gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1476{
1477	icl_wa_init_mcr(gt, wal);
1478
1479	/* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1480	wa_14011060649(gt, wal);
1481
1482	/* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1483	wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1484
1485	/*
1486	 * Wa_14015795083
1487	 *
1488	 * Firmware on some gen12 platforms locks the MISCCPCTL register,
1489	 * preventing i915 from modifying it for this workaround.  Skip the
1490	 * readback verification for this workaround on debug builds; if the
1491	 * workaround doesn't stick due to firmware behavior, it's not an error
1492	 * that we want CI to flag.
1493	 */
1494	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1495	       0, 0, false);
1496}
1497
1498static void
1499dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1500{
1501	gen12_gt_workarounds_init(gt, wal);
1502
1503	/* Wa_1409420604:dg1 */
1504	wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2,
1505			CPSSUNIT_CLKGATE_DIS);
1506
1507	/* Wa_1408615072:dg1 */
1508	/* Empirical testing shows this register is unaffected by engine reset. */
1509	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL);
1510}
1511
1512static void
1513dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1514{
1515	xehp_init_mcr(gt, wal);
1516
1517	/* Wa_14011060649:dg2 */
1518	wa_14011060649(gt, wal);
1519
1520	if (IS_DG2_G10(gt->i915)) {
1521		/* Wa_22010523718:dg2 */
1522		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1523			    CG3DDISCFEG_CLKGATE_DIS);
1524
1525		/* Wa_14011006942:dg2 */
1526		wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1527				DSS_ROUTER_CLKGATE_DIS);
1528	}
1529
1530	/* Wa_14014830051:dg2 */
1531	wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1532
1533	/*
1534	 * Wa_14015795083
1535	 * Skip verification for possibly locked register.
1536	 */
1537	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1538	       0, 0, false);
1539
1540	/* Wa_18018781329 */
1541	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1542	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1543	wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1544	wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1545
1546	/* Wa_1509235366:dg2 */
1547	wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1548			INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1549
1550	/* Wa_14010648519:dg2 */
1551	wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1552}
1553
1554static void
1555xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1556{
1557	/* Wa_14018575942 / Wa_18018781329 */
1558	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1559	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1560
1561	/* Wa_22016670082 */
1562	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1563
1564	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1565	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
1566		/* Wa_14014830051 */
1567		wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1568
1569		/* Wa_14015795083 */
1570		wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1571	}
1572
1573	/*
1574	 * Unlike older platforms, we no longer setup implicit steering here;
1575	 * all MCR accesses are explicitly steered.
1576	 */
1577	debug_dump_steering(gt);
1578}
1579
1580static void
1581wa_16021867713(struct intel_gt *gt, struct i915_wa_list *wal)
1582{
1583	struct intel_engine_cs *engine;
1584	int id;
1585
1586	for_each_engine(engine, gt, id)
1587		if (engine->class == VIDEO_DECODE_CLASS)
1588			wa_write_or(wal, VDBOX_CGCTL3F1C(engine->mmio_base),
1589				    MFXPIPE_CLKGATE_DIS);
1590}
1591
1592static void
1593xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1594{
1595	wa_16021867713(gt, wal);
1596
1597	/*
1598	 * Wa_14018778641
1599	 * Wa_18018781329
1600	 *
1601	 * Note that although these registers are MCR on the primary
1602	 * GT, the media GT's versions are regular singleton registers.
1603	 */
1604	wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);
1605
1606	/*
1607	 * Wa_14018575942
1608	 *
1609	 * Issue is seen on media KPI test running on VDBOX engine
1610	 * especially VP9 encoding WLs
1611	 */
1612	wa_write_or(wal, XELPMP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1613
1614	/* Wa_22016670082 */
1615	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1616
1617	debug_dump_steering(gt);
1618}
1619
1620/*
1621 * The bspec performance guide has recommended MMIO tuning settings.  These
1622 * aren't truly "workarounds" but we want to program them through the
1623 * workaround infrastructure to make sure they're (re)applied at the proper
1624 * times.
1625 *
1626 * The programming in this function is for settings that persist through
1627 * engine resets and also are not part of any engine's register state context.
1628 * I.e., settings that only need to be re-applied in the event of a full GT
1629 * reset.
1630 */
1631static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal)
1632{
1633	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) {
1634		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1635		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1636	}
1637
1638	if (IS_DG2(gt->i915)) {
1639		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1640		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1641	}
1642}
1643
1644static void
1645gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1646{
1647	struct drm_i915_private *i915 = gt->i915;
1648
1649	gt_tuning_settings(gt, wal);
1650
1651	if (gt->type == GT_MEDIA) {
1652		if (MEDIA_VER_FULL(i915) == IP_VER(13, 0))
1653			xelpmp_gt_workarounds_init(gt, wal);
1654		else
1655			MISSING_CASE(MEDIA_VER_FULL(i915));
1656
1657		return;
1658	}
1659
1660	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)))
1661		xelpg_gt_workarounds_init(gt, wal);
1662	else if (IS_DG2(i915))
1663		dg2_gt_workarounds_init(gt, wal);
1664	else if (IS_DG1(i915))
1665		dg1_gt_workarounds_init(gt, wal);
1666	else if (GRAPHICS_VER(i915) == 12)
1667		gen12_gt_workarounds_init(gt, wal);
1668	else if (GRAPHICS_VER(i915) == 11)
1669		icl_gt_workarounds_init(gt, wal);
1670	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1671		cfl_gt_workarounds_init(gt, wal);
1672	else if (IS_GEMINILAKE(i915))
1673		glk_gt_workarounds_init(gt, wal);
1674	else if (IS_KABYLAKE(i915))
1675		kbl_gt_workarounds_init(gt, wal);
1676	else if (IS_BROXTON(i915))
1677		gen9_gt_workarounds_init(gt, wal);
1678	else if (IS_SKYLAKE(i915))
1679		skl_gt_workarounds_init(gt, wal);
1680	else if (IS_HASWELL(i915))
1681		hsw_gt_workarounds_init(gt, wal);
1682	else if (IS_VALLEYVIEW(i915))
1683		vlv_gt_workarounds_init(gt, wal);
1684	else if (IS_IVYBRIDGE(i915))
1685		ivb_gt_workarounds_init(gt, wal);
1686	else if (GRAPHICS_VER(i915) == 6)
1687		snb_gt_workarounds_init(gt, wal);
1688	else if (GRAPHICS_VER(i915) == 5)
1689		ilk_gt_workarounds_init(gt, wal);
1690	else if (IS_G4X(i915))
1691		g4x_gt_workarounds_init(gt, wal);
1692	else if (GRAPHICS_VER(i915) == 4)
1693		gen4_gt_workarounds_init(gt, wal);
1694	else if (GRAPHICS_VER(i915) <= 8)
1695		;
1696	else
1697		MISSING_CASE(GRAPHICS_VER(i915));
1698}
1699
1700void intel_gt_init_workarounds(struct intel_gt *gt)
1701{
1702	struct i915_wa_list *wal = &gt->wa_list;
1703
1704	wa_init_start(wal, gt, "GT", "global");
1705	gt_init_workarounds(gt, wal);
1706	wa_init_finish(wal);
1707}
1708
1709static bool
1710wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur,
1711	  const char *name, const char *from)
1712{
1713	if ((cur ^ wa->set) & wa->read) {
1714		gt_err(gt,
1715		       "%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1716		       name, from, i915_mmio_reg_offset(wa->reg),
1717		       cur, cur & wa->read, wa->set & wa->read);
1718
1719		return false;
1720	}
1721
1722	return true;
1723}
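
/*
 * Illustrative sketch, not part of the driver: wa_verify() above only
 * compares the bits named in wa->read, so bits the hardware may change
 * underneath us are ignored.  The values below are hypothetical.
 */
static inline bool example_wa_still_applied(u32 cur)
{
	const u32 set  = 0x00000004;	/* value we programmed */
	const u32 read = 0x0000000c;	/* only these bits are checked */

	/* the workaround counts as "lost" only if a relevant bit flipped */
	return !((cur ^ set) & read);
}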
1724
1725static void wa_list_apply(const struct i915_wa_list *wal)
1726{
1727	struct intel_gt *gt = wal->gt;
1728	struct intel_uncore *uncore = gt->uncore;
1729	enum forcewake_domains fw;
1730	unsigned long flags;
1731	struct i915_wa *wa;
1732	unsigned int i;
1733
1734	if (!wal->count)
1735		return;
1736
1737	fw = wal_get_fw_for_rmw(uncore, wal);
1738
1739	intel_gt_mcr_lock(gt, &flags);
1740	spin_lock(&uncore->lock);
1741	intel_uncore_forcewake_get__locked(uncore, fw);
1742
1743	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1744		u32 val, old = 0;
1745
1746		/* open-coded rmw due to steering */
1747		if (wa->clr)
1748			old = wa->is_mcr ?
1749				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1750				intel_uncore_read_fw(uncore, wa->reg);
1751		val = (old & ~wa->clr) | wa->set;
1752		if (val != old || !wa->clr) {
1753			if (wa->is_mcr)
1754				intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1755			else
1756				intel_uncore_write_fw(uncore, wa->reg, val);
1757		}
1758
1759		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1760			u32 val = wa->is_mcr ?
1761				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1762				intel_uncore_read_fw(uncore, wa->reg);
1763
1764			wa_verify(gt, wa, val, wal->name, "application");
1765		}
1766	}
1767
1768	intel_uncore_forcewake_put__locked(uncore, fw);
1769	spin_unlock(&uncore->lock);
1770	intel_gt_mcr_unlock(gt, flags);
1771}
1772
1773void intel_gt_apply_workarounds(struct intel_gt *gt)
1774{
1775	wa_list_apply(&gt->wa_list);
1776}
1777
1778static bool wa_list_verify(struct intel_gt *gt,
1779			   const struct i915_wa_list *wal,
1780			   const char *from)
1781{
1782	struct intel_uncore *uncore = gt->uncore;
1783	struct i915_wa *wa;
1784	enum forcewake_domains fw;
1785	unsigned long flags;
1786	unsigned int i;
1787	bool ok = true;
1788
1789	fw = wal_get_fw_for_rmw(uncore, wal);
1790
1791	intel_gt_mcr_lock(gt, &flags);
1792	spin_lock(&uncore->lock);
1793	intel_uncore_forcewake_get__locked(uncore, fw);
1794
1795	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1796		ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
1797				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1798				intel_uncore_read_fw(uncore, wa->reg),
1799				wal->name, from);
1800
1801	intel_uncore_forcewake_put__locked(uncore, fw);
1802	spin_unlock(&uncore->lock);
1803	intel_gt_mcr_unlock(gt, flags);
1804
1805	return ok;
1806}
1807
1808bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1809{
1810	return wa_list_verify(gt, &gt->wa_list, from);
1811}
1812
1813__maybe_unused
1814static bool is_nonpriv_flags_valid(u32 flags)
1815{
1816	/* Check only valid flag bits are set */
1817	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1818		return false;
1819
1820	/* NB: Only 3 out of 4 enum values are valid for access field */
1821	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1822	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1823		return false;
1824
1825	return true;
1826}
1827
1828static void
1829whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1830{
1831	struct i915_wa wa = {
1832		.reg = reg
1833	};
1834
1835	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1836		return;
1837
1838	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1839		return;
1840
1841	wa.reg.reg |= flags;
1842	_wa_add(wal, &wa);
1843}
1844
1845static void
1846whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1847{
1848	struct i915_wa wa = {
1849		.mcr_reg = reg,
1850		.is_mcr = 1,
1851	};
1852
1853	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1854		return;
1855
1856	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1857		return;
1858
1859	wa.mcr_reg.reg |= flags;
1860	_wa_add(wal, &wa);
1861}
1862
1863static void
1864whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1865{
1866	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1867}
1868
1869static void
1870whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1871{
1872	whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1873}
1874
1875static void gen9_whitelist_build(struct i915_wa_list *w)
1876{
1877	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1878	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1879
1880	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1881	whitelist_reg(w, GEN8_CS_CHICKEN1);
1882
1883	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1884	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1885
1886	/* WaSendPushConstantsFromMMIO:skl,bxt */
1887	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1888}
1889
1890static void skl_whitelist_build(struct intel_engine_cs *engine)
1891{
1892	struct i915_wa_list *w = &engine->whitelist;
1893
1894	if (engine->class != RENDER_CLASS)
1895		return;
1896
1897	gen9_whitelist_build(w);
1898
1899	/* WaDisableLSQCROPERFforOCL:skl */
1900	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1901}
1902
1903static void bxt_whitelist_build(struct intel_engine_cs *engine)
1904{
1905	if (engine->class != RENDER_CLASS)
1906		return;
1907
1908	gen9_whitelist_build(&engine->whitelist);
1909}
1910
1911static void kbl_whitelist_build(struct intel_engine_cs *engine)
1912{
1913	struct i915_wa_list *w = &engine->whitelist;
1914
1915	if (engine->class != RENDER_CLASS)
1916		return;
1917
1918	gen9_whitelist_build(w);
1919
1920	/* WaDisableLSQCROPERFforOCL:kbl */
1921	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1922}
1923
1924static void glk_whitelist_build(struct intel_engine_cs *engine)
1925{
1926	struct i915_wa_list *w = &engine->whitelist;
1927
1928	if (engine->class != RENDER_CLASS)
1929		return;
1930
1931	gen9_whitelist_build(w);
1932
1933	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1934	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1935}
1936
1937static void cfl_whitelist_build(struct intel_engine_cs *engine)
1938{
1939	struct i915_wa_list *w = &engine->whitelist;
1940
1941	if (engine->class != RENDER_CLASS)
1942		return;
1943
1944	gen9_whitelist_build(w);
1945
1946	/*
1947	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1948	 *
1949	 * This covers 4 registers which are next to one another:
1950	 *   - PS_INVOCATION_COUNT
1951	 *   - PS_INVOCATION_COUNT_UDW
1952	 *   - PS_DEPTH_COUNT
1953	 *   - PS_DEPTH_COUNT_UDW
1954	 */
1955	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1956			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1957			  RING_FORCE_TO_NONPRIV_RANGE_4);
1958}
1959
1960static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
1961{
1962	struct i915_wa_list *w = &engine->whitelist;
1963
1964	if (engine->class != RENDER_CLASS)
1965		whitelist_reg_ext(w,
1966				  RING_CTX_TIMESTAMP(engine->mmio_base),
1967				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1968}
1969
1970static void cml_whitelist_build(struct intel_engine_cs *engine)
1971{
1972	allow_read_ctx_timestamp(engine);
1973
1974	cfl_whitelist_build(engine);
1975}
1976
1977static void icl_whitelist_build(struct intel_engine_cs *engine)
1978{
1979	struct i915_wa_list *w = &engine->whitelist;
1980
1981	allow_read_ctx_timestamp(engine);
1982
1983	switch (engine->class) {
1984	case RENDER_CLASS:
1985		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
1986		whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
1987
1988		/* WaAllowUMDToModifySamplerMode:icl */
1989		whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
1990
1991		/* WaEnableStateCacheRedirectToCS:icl */
1992		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1993
1994		/*
1995		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
1996		 *
1997		 * This covers 4 registers which are next to one another:
1998		 *   - PS_INVOCATION_COUNT
1999		 *   - PS_INVOCATION_COUNT_UDW
2000		 *   - PS_DEPTH_COUNT
2001		 *   - PS_DEPTH_COUNT_UDW
2002		 */
2003		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2004				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2005				  RING_FORCE_TO_NONPRIV_RANGE_4);
2006		break;
2007
2008	case VIDEO_DECODE_CLASS:
2009		/* hucStatusRegOffset */
2010		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
2011				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2012		/* hucUKernelHdrInfoRegOffset */
2013		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2014				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2015		/* hucStatus2RegOffset */
2016		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2017				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2018		break;
2019
2020	default:
2021		break;
2022	}
2023}
2024
2025static void tgl_whitelist_build(struct intel_engine_cs *engine)
2026{
2027	struct i915_wa_list *w = &engine->whitelist;
2028
2029	allow_read_ctx_timestamp(engine);
2030
2031	switch (engine->class) {
2032	case RENDER_CLASS:
2033		/*
2034		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2035		 * Wa_1408556865:tgl
2036		 *
2037		 * This covers 4 registers which are next to one another:
2038		 *   - PS_INVOCATION_COUNT
2039		 *   - PS_INVOCATION_COUNT_UDW
2040		 *   - PS_DEPTH_COUNT
2041		 *   - PS_DEPTH_COUNT_UDW
2042		 */
2043		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2044				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2045				  RING_FORCE_TO_NONPRIV_RANGE_4);
2046
2047		/*
2048		 * Wa_1808121037:tgl
2049		 * Wa_14012131227:dg1
2050		 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2051		 */
2052		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2053
2054		/* Wa_1806527549:tgl */
2055		whitelist_reg(w, HIZ_CHICKEN);
2056
2057		/* Required by recommended tuning setting (not a workaround) */
2058		whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3);
2059
2060		break;
2061	default:
2062		break;
2063	}
2064}
2065
2066static void dg2_whitelist_build(struct intel_engine_cs *engine)
2067{
2068	struct i915_wa_list *w = &engine->whitelist;
2069
2070	switch (engine->class) {
2071	case RENDER_CLASS:
2072		/* Required by recommended tuning setting (not a workaround) */
2073		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2074		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2075		break;
2076	default:
2077		break;
2078	}
2079}
2080
2081static void xelpg_whitelist_build(struct intel_engine_cs *engine)
2082{
2083	struct i915_wa_list *w = &engine->whitelist;
2084
2085	switch (engine->class) {
2086	case RENDER_CLASS:
2087		/* Required by recommended tuning setting (not a workaround) */
2088		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2089		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2090		break;
2091	default:
2092		break;
2093	}
2094}
2095
2096void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2097{
2098	struct drm_i915_private *i915 = engine->i915;
2099	struct i915_wa_list *w = &engine->whitelist;
2100
2101	wa_init_start(w, engine->gt, "whitelist", engine->name);
2102
2103	if (engine->gt->type == GT_MEDIA)
2104		; /* none yet */
2105	else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
2106		xelpg_whitelist_build(engine);
2107	else if (IS_DG2(i915))
2108		dg2_whitelist_build(engine);
2109	else if (GRAPHICS_VER(i915) == 12)
2110		tgl_whitelist_build(engine);
2111	else if (GRAPHICS_VER(i915) == 11)
2112		icl_whitelist_build(engine);
2113	else if (IS_COMETLAKE(i915))
2114		cml_whitelist_build(engine);
2115	else if (IS_COFFEELAKE(i915))
2116		cfl_whitelist_build(engine);
2117	else if (IS_GEMINILAKE(i915))
2118		glk_whitelist_build(engine);
2119	else if (IS_KABYLAKE(i915))
2120		kbl_whitelist_build(engine);
2121	else if (IS_BROXTON(i915))
2122		bxt_whitelist_build(engine);
2123	else if (IS_SKYLAKE(i915))
2124		skl_whitelist_build(engine);
2125	else if (GRAPHICS_VER(i915) <= 8)
2126		;
2127	else
2128		MISSING_CASE(GRAPHICS_VER(i915));
2129
2130	wa_init_finish(w);
2131}
2132
2133void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2134{
2135	const struct i915_wa_list *wal = &engine->whitelist;
2136	struct intel_uncore *uncore = engine->uncore;
2137	const u32 base = engine->mmio_base;
2138	struct i915_wa *wa;
2139	unsigned int i;
2140
2141	if (!wal->count)
2142		return;
2143
2144	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2145		intel_uncore_write(uncore,
2146				   RING_FORCE_TO_NONPRIV(base, i),
2147				   i915_mmio_reg_offset(wa->reg));
2148
2149	/* And clear the rest just in case of garbage */
2150	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2151		intel_uncore_write(uncore,
2152				   RING_FORCE_TO_NONPRIV(base, i),
2153				   i915_mmio_reg_offset(RING_NOPID(base)));
2154}
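
/*
 * Illustrative sketch, not part of the driver: whitelist_reg_ext() above
 * folds the access/range flags into bits of the offset value that a
 * dword-aligned MMIO offset does not use, so the value programmed into a
 * RING_FORCE_TO_NONPRIV slot is simply "offset | flags".  The offset
 * below is hypothetical.
 */
static inline u32 example_nonpriv_slot_value(void)
{
	const u32 offset = 0x20d8;	/* hypothetical privileged register */

	return offset |
	       RING_FORCE_TO_NONPRIV_ACCESS_RD |
	       RING_FORCE_TO_NONPRIV_RANGE_4;
}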
2155
2156/*
2157 * engine_fake_wa_init(), a placeholder to program registers which
2158 * are not part of an official workaround defined by the
2159 * hardware team.
2160 * Adding the programming of those registers to a workaround list
2161 * lets us use the wa framework for proper application and verification.
2162 */
2163static void
2164engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2165{
2166	u8 mocs_w, mocs_r;
2167
2168	/*
2169	 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2170	 * by the command streamer when executing commands that don't have
2171	 * a way to explicitly specify a MOCS setting.  The default should
2172	 * usually reference whichever MOCS entry corresponds to uncached
2173	 * behavior, although use of a WB cached entry is recommended by the
2174	 * spec in certain circumstances on specific platforms.
2175	 */
2176	if (GRAPHICS_VER(engine->i915) >= 12) {
2177		mocs_r = engine->gt->mocs.uc_index;
2178		mocs_w = engine->gt->mocs.uc_index;
2179
2180		if (HAS_L3_CCS_READ(engine->i915) &&
2181		    engine->class == COMPUTE_CLASS) {
2182			mocs_r = engine->gt->mocs.wb_index;
2183
2184			/*
2185			 * Even on the few platforms where MOCS 0 is a
2186			 * legitimate table entry, it's never the correct
2187			 * setting to use here; we can assume the MOCS init
2188			 * just forgot to initialize wb_index.
2189			 */
2190			drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2191		}
2192
2193		wa_masked_field_set(wal,
2194				    RING_CMD_CCTL(engine->mmio_base),
2195				    CMD_CCTL_MOCS_MASK,
2196				    CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2197	}
2198}
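
/*
 * Illustrative sketch, not part of the driver: on a masked register such
 * as RING_CMD_CCTL, wa_masked_field_set() ends up writing the field mask
 * in the upper 16 bits and the new value in the lower 16, so a single
 * MMIO write updates just the MOCS field.  The indices are hypothetical.
 */
static inline u32 example_cmd_cctl_mocs_value(void)
{
	const u8 mocs_w = 2, mocs_r = 2;	/* hypothetical uncached entries */

	return _MASKED_FIELD(CMD_CCTL_MOCS_MASK,
			     CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
}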
2199
2200static void
2201rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2202{
2203	struct drm_i915_private *i915 = engine->i915;
2204	struct intel_gt *gt = engine->gt;
2205
2206	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2207	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
2208		/* Wa_22014600077 */
2209		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2210				 ENABLE_EU_COUNT_FOR_TDL_FLUSH);
2211	}
2212
2213	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2214	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2215	    IS_DG2(i915)) {
2216		/* Wa_1509727124 */
2217		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2218				 SC_DISABLE_POWER_OPTIMIZATION_EBB);
2219	}
2220
2221	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2222	    IS_DG2(i915)) {
2223		/* Wa_22012856258 */
2224		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2225				 GEN12_DISABLE_READ_SUPPRESSION);
2226	}
2227
2228	if (IS_DG2(i915)) {
2229		/*
2230		 * Wa_22010960976:dg2
2231		 * Wa_14013347512:dg2
2232		 */
2233		wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
2234				  LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2235	}
2236
2237	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) ||
2238	    IS_DG2(i915)) {
2239		/* Wa_14015150844 */
2240		wa_mcr_add(wal, XEHP_HDC_CHICKEN0, 0,
2241			   _MASKED_BIT_ENABLE(DIS_ATOMIC_CHAINING_TYPED_WRITES),
2242			   0, true);
2243	}
2244
2245	if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2246	    IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2247		/*
2248		 * Wa_1606700617:tgl,dg1,adl-p
2249		 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2250		 * Wa_14010826681:tgl,dg1,rkl,adl-p
2251		 * Wa_18019627453:dg2
2252		 */
2253		wa_masked_en(wal,
2254			     GEN9_CS_DEBUG_MODE1,
2255			     FF_DOP_CLOCK_GATE_DISABLE);
2256	}
2257
2258	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2259	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2260		/* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2261		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2262
2263		/*
2264		 * Wa_1407928979:tgl A*
2265		 * Wa_18011464164:tgl[B0+],dg1[B0+]
2266		 * Wa_22010931296:tgl[B0+],dg1[B0+]
2267		 * Wa_14010919138:rkl,dg1,adl-s,adl-p
2268		 */
2269		wa_write_or(wal, GEN7_FF_THREAD_MODE,
2270			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2271
2272		/* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2273		wa_mcr_masked_en(wal,
2274				 GEN10_SAMPLER_MODE,
2275				 ENABLE_SMALLPL);
2276	}
2277
2278	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2279	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2280		/* Wa_1409804808 */
2281		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2282				 GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2283
2284		/* Wa_14010229206 */
2285		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2286	}
2287
2288	if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
2289		/*
2290		 * Wa_1607297627
2291		 *
2292		 * On TGL and RKL there are multiple entries for this WA in the
2293		 * BSpec; some indicate this is an A0-only WA, others indicate
2294		 * it applies to all steppings so we trust the "all steppings."
2295		 */
2296		wa_masked_en(wal,
2297			     RING_PSMI_CTL(RENDER_RING_BASE),
2298			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2299			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2300	}
2301
2302	if (IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) {
2303		/*
2304		 * "Disable Repacking for Compression (masked R/W access)
2305		 *  before rendering compressed surfaces for display."
2306		 */
2307		wa_masked_en(wal, CACHE_MODE_0_GEN7,
2308			     DISABLE_REPACKING_FOR_COMPRESSION);
2309	}
2310
2311	if (GRAPHICS_VER(i915) == 11) {
2312		/* This is not a Wa. Enable for better image quality */
2313		wa_masked_en(wal,
2314			     _3D_CHICKEN3,
2315			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2316
2317		/*
2318		 * Wa_1405543622:icl
2319		 * Formerly known as WaGAPZPriorityScheme
2320		 */
2321		wa_write_or(wal,
2322			    GEN8_GARBCNTL,
2323			    GEN11_ARBITRATION_PRIO_ORDER_MASK);
2324
2325		/*
2326		 * Wa_1604223664:icl
2327		 * Formerly known as WaL3BankAddressHashing
2328		 */
2329		wa_write_clr_set(wal,
2330				 GEN8_GARBCNTL,
2331				 GEN11_HASH_CTRL_EXCL_MASK,
2332				 GEN11_HASH_CTRL_EXCL_BIT0);
2333		wa_write_clr_set(wal,
2334				 GEN11_GLBLINVL,
2335				 GEN11_BANK_HASH_ADDR_EXCL_MASK,
2336				 GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2337
2338		/*
2339		 * Wa_1405733216:icl
2340		 * Formerly known as WaDisableCleanEvicts
2341		 */
2342		wa_mcr_write_or(wal,
2343				GEN8_L3SQCREG4,
2344				GEN11_LQSC_CLEAN_EVICT_DISABLE);
2345
2346		/* Wa_1606682166:icl */
2347		wa_write_or(wal,
2348			    GEN7_SARCHKMD,
2349			    GEN7_DISABLE_SAMPLER_PREFETCH);
2350
2351		/* Wa_1409178092:icl */
2352		wa_mcr_write_clr_set(wal,
2353				     GEN11_SCRATCH2,
2354				     GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2355				     0);
2356
2357		/* WaEnable32PlaneMode:icl */
2358		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2359			     GEN11_ENABLE_32_PLANE_MODE);
2360
2361		/*
2362		 * Wa_1408767742:icl[a2..forever],ehl[all]
2363		 * Wa_1605460711:icl[a0..c0]
2364		 */
2365		wa_write_or(wal,
2366			    GEN7_FF_THREAD_MODE,
2367			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2368
2369		/* Wa_22010271021 */
2370		wa_masked_en(wal,
2371			     GEN9_CS_DEBUG_MODE1,
2372			     FF_DOP_CLOCK_GATE_DISABLE);
2373	}
2374
2375	/*
2376	 * Intel platforms that support fine-grained preemption (i.e., gen9 and
2377	 * beyond) allow the kernel-mode driver to choose between two different
2378	 * options for controlling preemption granularity and behavior.
2379	 *
2380	 * Option 1 (hardware default):
2381	 *   Preemption settings are controlled in a global manner via
2382	 *   kernel-only register CS_DEBUG_MODE1 (0x20EC).  Any granularity
2383	 *   and settings chosen by the kernel-mode driver will apply to all
2384	 *   userspace clients.
2385	 *
2386	 * Option 2:
2387	 *   Preemption settings are controlled on a per-context basis via
2388	 *   register CS_CHICKEN1 (0x2580).  CS_CHICKEN1 is saved/restored on
2389	 *   context switch and is writable by userspace (e.g., via
2390	 *   MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
2391	 *   which allows different userspace drivers/clients to select
2392	 *   different settings, or to change those settings on the fly in
2393	 *   response to runtime needs.  This option was known by name
2394	 *   "FtrPerCtxtPreemptionGranularityControl" at one time, although
2395	 *   that name is somewhat misleading as other non-granularity
2396	 *   preemption settings are also impacted by this decision.
2397	 *
2398	 * On Linux, our policy has always been to let userspace drivers
2399	 * control preemption granularity/settings (Option 2).  This was
2400	 * originally mandatory on gen9 to prevent ABI breakage (old gen9
2401	 * userspace developed before object-level preemption was enabled would
2402	 * not behave well if i915 were to go with Option 1 and enable that
2403	 * preemption in a global manner).  On gen9 each context would have
2404	 * object-level preemption disabled by default (see
2405	 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
2406	 * userspace drivers could opt-in to object-level preemption as they
2407	 * saw fit.  For post-gen9 platforms, we continue to utilize Option 2;
2408	 * even though it is no longer necessary for ABI compatibility when
2409	 * enabling a new platform, it does ensure that userspace will be able
2410	 * to implement any workarounds that show up requiring temporary
2411	 * adjustments to preemption behavior at runtime.
2412	 *
2413	 * Notes/Workarounds:
2414	 *  - Wa_14015141709:  On DG2 and early steppings of MTL,
2415	 *      CS_CHICKEN1[0] does not disable object-level preemption as
2416	 *      it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
2417	 *      using Option 1).  Effectively this means userspace is unable
2418	 *      to disable object-level preemption on these platforms/steppings
2419	 *      despite the setting here.
2420	 *
2421	 *  - Wa_16013994831:  May require that userspace program
2422	 *      CS_CHICKEN1[10] when certain runtime conditions are true.
2423	 *      Userspace requires Option 2 to be in effect for their update of
2424	 *      CS_CHICKEN1[10] to be effective.
2425	 *
2426	 * Other workarounds may appear in the future that will also require
2427	 * Option 2 behavior to allow proper userspace implementation.
2428	 */
2429	if (GRAPHICS_VER(i915) >= 9)
2430		wa_masked_en(wal,
2431			     GEN7_FF_SLICE_CS_CHICKEN1,
2432			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);
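	/*
	 * Illustrative sketch, not part of the driver: with Option 2 in
	 * effect, a userspace driver can adjust its own per-context
	 * preemption settings by emitting a register write to CS_CHICKEN1
	 * (0x2580) from a batch buffer, e.g. (hypothetical dwords, using
	 * the masked-register convention of mask-in-upper-16-bits):
	 *
	 *	MI_LOAD_REGISTER_IMM(1)
	 *	0x2580				; CS_CHICKEN1
	 *	(0x1 << 16) | 0x1		; write bit 0 with its mask
	 */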
2433
2434	if (IS_SKYLAKE(i915) ||
2435	    IS_KABYLAKE(i915) ||
2436	    IS_COFFEELAKE(i915) ||
2437	    IS_COMETLAKE(i915)) {
2438		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2439		wa_write_or(wal,
2440			    GEN8_GARBCNTL,
2441			    GEN9_GAPS_TSV_CREDIT_DISABLE);
2442	}
2443
2444	if (IS_BROXTON(i915)) {
2445		/* WaDisablePooledEuLoadBalancingFix:bxt */
2446		wa_masked_en(wal,
2447			     FF_SLICE_CS_CHICKEN2,
2448			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2449	}
2450
2451	if (GRAPHICS_VER(i915) == 9) {
2452		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2453		wa_masked_en(wal,
2454			     GEN9_CSFE_CHICKEN1_RCS,
2455			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2456
2457		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2458		wa_mcr_write_or(wal,
2459				BDW_SCRATCH1,
2460				GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2461
2462		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2463		if (IS_GEN9_LP(i915))
2464			wa_mcr_write_clr_set(wal,
2465					     GEN8_L3SQCREG1,
2466					     L3_PRIO_CREDITS_MASK,
2467					     L3_GENERAL_PRIO_CREDITS(62) |
2468					     L3_HIGH_PRIO_CREDITS(2));
2469
2470		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2471		wa_mcr_write_or(wal,
2472				GEN8_L3SQCREG4,
2473				GEN8_LQSC_FLUSH_COHERENT_LINES);
2474
2475		/* Disable atomics in L3 to prevent unrecoverable hangs */
2476		wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2477				 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2478		wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
2479				     GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2480		wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
2481				     EVICTION_PERF_FIX_ENABLE, 0);
2482	}
2483
2484	if (IS_HASWELL(i915)) {
2485		/* WaSampleCChickenBitEnable:hsw */
2486		wa_masked_en(wal,
2487			     HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2488
2489		wa_masked_dis(wal,
2490			      CACHE_MODE_0_GEN7,
2491			      /* enable HiZ Raw Stall Optimization */
2492			      HIZ_RAW_STALL_OPT_DISABLE);
2493	}
2494
2495	if (IS_VALLEYVIEW(i915)) {
2496		/* WaDisableEarlyCull:vlv */
2497		wa_masked_en(wal,
2498			     _3D_CHICKEN3,
2499			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2500
2501		/*
2502		 * WaVSThreadDispatchOverride:ivb,vlv
2503		 *
2504		 * This actually overrides the dispatch
2505		 * mode for all thread types.
2506		 */
2507		wa_write_clr_set(wal,
2508				 GEN7_FF_THREAD_MODE,
2509				 GEN7_FF_SCHED_MASK,
2510				 GEN7_FF_TS_SCHED_HW |
2511				 GEN7_FF_VS_SCHED_HW |
2512				 GEN7_FF_DS_SCHED_HW);
2513
2514		/* WaPsdDispatchEnable:vlv */
2515		/* WaDisablePSDDualDispatchEnable:vlv */
2516		wa_masked_en(wal,
2517			     GEN7_HALF_SLICE_CHICKEN1,
2518			     GEN7_MAX_PS_THREAD_DEP |
2519			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2520	}
2521
2522	if (IS_IVYBRIDGE(i915)) {
2523		/* WaDisableEarlyCull:ivb */
2524		wa_masked_en(wal,
2525			     _3D_CHICKEN3,
2526			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2527
2528		if (0) { /* causes HiZ corruption on ivb:gt1 */
2529			/* enable HiZ Raw Stall Optimization */
2530			wa_masked_dis(wal,
2531				      CACHE_MODE_0_GEN7,
2532				      HIZ_RAW_STALL_OPT_DISABLE);
2533		}
2534
2535		/*
2536		 * WaVSThreadDispatchOverride:ivb,vlv
2537		 *
2538		 * This actually overrides the dispatch
2539		 * mode for all thread types.
2540		 */
2541		wa_write_clr_set(wal,
2542				 GEN7_FF_THREAD_MODE,
2543				 GEN7_FF_SCHED_MASK,
2544				 GEN7_FF_TS_SCHED_HW |
2545				 GEN7_FF_VS_SCHED_HW |
2546				 GEN7_FF_DS_SCHED_HW);
2547
2548		/* WaDisablePSDDualDispatchEnable:ivb */
2549		if (INTEL_INFO(i915)->gt == 1)
2550			wa_masked_en(wal,
2551				     GEN7_HALF_SLICE_CHICKEN1,
2552				     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2553	}
2554
2555	if (GRAPHICS_VER(i915) == 7) {
2556		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2557		wa_masked_en(wal,
2558			     RING_MODE_GEN7(RENDER_RING_BASE),
2559			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2560
2561		/* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
2562		wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
2563
2564		/*
2565		 * BSpec says this must be set, even though
2566		 * WaDisable4x2SubspanOptimization:ivb,hsw
2567		 * WaDisable4x2SubspanOptimization isn't listed for VLV.
2568		 */
2569		wa_masked_en(wal,
2570			     CACHE_MODE_1,
2571			     PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
2572
2573		/*
2574		 * BSpec recommends 8x4 when MSAA is used,
2575		 * however in practice 16x4 seems fastest.
2576		 *
2577		 * Note that PS/WM thread counts depend on the WIZ hashing
2578		 * disable bit, which we don't touch here, but it's good
2579		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2580		 */
2581		wa_masked_field_set(wal,
2582				    GEN7_GT_MODE,
2583				    GEN6_WIZ_HASHING_MASK,
2584				    GEN6_WIZ_HASHING_16x4);
2585	}
2586
2587	if (IS_GRAPHICS_VER(i915, 6, 7))
2588		/*
2589		 * We need to disable the AsyncFlip performance optimisations in
2590		 * order to use MI_WAIT_FOR_EVENT within the CS. It should
2591		 * already be programmed to '1' on all products.
2592		 *
2593		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2594		 */
2595		wa_masked_en(wal,
2596			     RING_MI_MODE(RENDER_RING_BASE),
2597			     ASYNC_FLIP_PERF_DISABLE);
2598
2599	if (GRAPHICS_VER(i915) == 6) {
2600		/*
2601		 * Required for the hardware to program scanline values for
2602		 * waiting
2603		 * WaEnableFlushTlbInvalidationMode:snb
2604		 */
2605		wa_masked_en(wal,
2606			     GFX_MODE,
2607			     GFX_TLB_INVALIDATE_EXPLICIT);
2608
2609		/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2610		wa_masked_en(wal,
2611			     _3D_CHICKEN,
2612			     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2613
2614		wa_masked_en(wal,
2615			     _3D_CHICKEN3,
2616			     /* WaStripsFansDisableFastClipPerformanceFix:snb */
2617			     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2618			     /*
2619			      * Bspec says:
2620			      * "This bit must be set if 3DSTATE_CLIP clip mode is set
2621			      * to normal and 3DSTATE_SF number of SF output attributes
2622			      * is more than 16."
2623			      */
2624			     _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2625
2626		/*
2627		 * BSpec recommends 8x4 when MSAA is used,
2628		 * however in practice 16x4 seems fastest.
2629		 *
2630		 * Note that PS/WM thread counts depend on the WIZ hashing
2631		 * disable bit, which we don't touch here, but it's good
2632		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2633		 */
2634		wa_masked_field_set(wal,
2635				    GEN6_GT_MODE,
2636				    GEN6_WIZ_HASHING_MASK,
2637				    GEN6_WIZ_HASHING_16x4);
2638
2639		/* WaDisable_RenderCache_OperationalFlush:snb */
2640		wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
2641
2642		/*
2643		 * From the Sandybridge PRM, volume 1 part 3, page 24:
2644		 * "If this bit is set, STCunit will have LRA as replacement
2645		 *  policy. [...] This bit must be reset. LRA replacement
2646		 *  policy is not supported."
2647		 */
2648		wa_masked_dis(wal,
2649			      CACHE_MODE_0,
2650			      CM0_STC_EVICT_DISABLE_LRA_SNB);
2651	}
2652
2653	if (IS_GRAPHICS_VER(i915, 4, 6))
2654		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2655		wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2656		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2657		       /* XXX bit doesn't stick on Broadwater */
2658		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2659
2660	if (GRAPHICS_VER(i915) == 4)
2661		/*
2662		 * Disable CONSTANT_BUFFER before it is loaded from the context
2663		 * image. As soon as it is loaded, it is executed and the stored
2664		 * address may no longer be valid, leading to a GPU hang.
2665		 *
2666		 * This imposes the requirement that userspace reload their
2667		 * CONSTANT_BUFFER on every batch, fortunately a requirement
2668		 * they are already accustomed to from before contexts were
2669		 * enabled.
2670		 */
2671		wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2672		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2673		       0 /* XXX bit doesn't stick on Broadwater */,
2674		       true);
2675}
2676
2677static void
2678xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2679{
2680	struct drm_i915_private *i915 = engine->i915;
2681
2682	/* WaKBLVECSSemaphoreWaitPoll:kbl */
2683	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2684		wa_write(wal,
2685			 RING_SEMA_WAIT_POLL(engine->mmio_base),
2686			 1);
2687	}
2688	/* Wa_16018031267, Wa_16018063123 */
2689	if (NEEDS_FASTCOLOR_BLT_WABB(engine))
2690		wa_masked_field_set(wal, ECOSKPD(engine->mmio_base),
2691				    XEHP_BLITTER_SCHEDULING_MODE_MASK,
2692				    XEHP_BLITTER_ROUND_ROBIN_MODE);
2693}
2694
2695static void
2696ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2697{
2698	/* boilerplate for any CCS engine workaround */
2699}
2700
2701/*
2702 * The bspec performance guide has recommended MMIO tuning settings.  These
2703 * aren't truly "workarounds" but we want to program them with the same
2704 * workaround infrastructure to ensure that they're automatically added to
2705 * the GuC save/restore lists, re-applied at the right times, and checked for
2706 * any conflicting programming requested by real workarounds.
2707 *
2708 * Programming settings should be added here only if their registers are not
2709 * part of an engine's register state context.  If a register is part of a
2710 * context, then any tuning settings should be programmed in an appropriate
2711 * function invoked by __intel_engine_init_ctx_wa().
2712 */
2713static void
2714add_render_compute_tuning_settings(struct intel_gt *gt,
2715				   struct i915_wa_list *wal)
2716{
2717	struct drm_i915_private *i915 = gt->i915;
2718
2719	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
2720		wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2721
2722	/*
2723	 * This tuning setting proves beneficial only on ATS-M designs; the
2724	 * default "age based" setting is optimal on regular DG2 and other
2725	 * platforms.
2726	 */
2727	if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2728		wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2729					THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2730
2731	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 55))
2732		wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
2733}
2734
2735static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2736{
2737	struct intel_gt *gt = engine->gt;
2738	u32 mode;
2739
2740	if (!IS_DG2(gt->i915))
2741		return;
2742
2743	/*
2744	 * Wa_14019159160: This workaround, along with others, leads to
2745	 * significant challenges in utilizing load balancing among the
2746	 * CCS slices. Consequently, an architectural decision has been
2747	 * made to completely disable automatic CCS load balancing.
2748	 */
2749	wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE);
2750
2751	/*
2752	 * After having disabled automatic load balancing we need to
2753	 * assign all slices to a single CCS. We will call it CCS mode 1.
2754	 */
2755	mode = intel_gt_apply_ccs_mode(gt);
2756	wa_masked_en(wal, XEHP_CCS_MODE, mode);
2757}
2758
2759/*
2760 * The workarounds in this function apply to shared registers in
2761 * the general render reset domain that aren't tied to a
2762 * specific engine.  Since all render+compute engines get reset
2763 * together, and the contents of these registers are lost during
2764 * the shared render domain reset, we'll define such workarounds
2765 * here and then add them to just a single RCS or CCS engine's
2766 * workaround list (whichever engine has the I915_ENGINE_FIRST_RENDER_COMPUTE flag).
2767 */
2768static void
2769general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2770{
2771	struct drm_i915_private *i915 = engine->i915;
2772	struct intel_gt *gt = engine->gt;
2773
2774	add_render_compute_tuning_settings(gt, wal);
2775
2776	if (GRAPHICS_VER(i915) >= 11) {
2777		/* This is not a Wa (although referred to as
2778		 * WaSetInidrectStateOverride in places); it allows
2779		 * applications that reference sampler states through
2780		 * the BindlessSamplerStateBaseAddress to have their
2781		 * border color relative to DynamicStateBaseAddress
2782		 * rather than BindlessSamplerStateBaseAddress.
2783		 *
2784		 * Otherwise SAMPLER_STATE border colors have to be
2785		 * copied in multiple heaps (DynamicStateBaseAddress &
2786		 * BindlessSamplerStateBaseAddress)
2787		 *
2788		 * BSpec: 46052
2789		 */
2790		wa_mcr_masked_en(wal,
2791				 GEN10_SAMPLER_MODE,
2792				 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
2793	}
2794
2795	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
2796	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) ||
2797	    IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74))) {
2798		/* Wa_14017856879 */
2799		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH);
2800
2801		/* Wa_14020495402 */
2802		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, XELPG_DISABLE_TDL_SVHS_GATING);
2803	}
2804
2805	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2806	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2807		/*
2808		 * Wa_14017066071
2809		 * Wa_14017654203
2810		 */
2811		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2812				 MTL_DISABLE_SAMPLER_SC_OOO);
2813
2814	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2815		/* Wa_22015279794 */
2816		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2817				 DISABLE_PREFETCH_INTO_IC);
2818
2819	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2820	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2821	    IS_DG2(i915)) {
2822		/* Wa_22013037850 */
2823		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2824				DISABLE_128B_EVICTION_COMMAND_UDW);
2825
2826		/* Wa_18017747507 */
2827		wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2828	}
2829
2830	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2831	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2832	    IS_DG2(i915)) {
2833		/* Wa_22014226127 */
2834		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2835	}
2836
2837	if (IS_DG2(i915)) {
2838		/* Wa_14015227452:dg2,pvc */
2839		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2840
2841		/*
2842		 * Wa_16011620976:dg2_g11
2843		 * Wa_22015475538:dg2
2844		 */
2845		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2846
2847		/* Wa_18028616096 */
2848		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, UGM_FRAGMENT_THRESHOLD_TO_3);
2849	}
2850
2851	if (IS_DG2_G11(i915)) {
2852		/*
2853		 * Wa_22012826095:dg2
2854		 * Wa_22013059131:dg2
2855		 */
2856		wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2857				     MAXREQS_PER_BANK,
2858				     REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2859
2860		/* Wa_22013059131:dg2 */
2861		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2862				FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2863
2864		/*
2865		 * Wa_22012654132
2866		 *
2867		 * Note that register 0xE420 is write-only and cannot be read
2868		 * back for verification on DG2 (due to Wa_14012342262), so
2869		 * we need to explicitly skip the readback.
2870		 */
2871		wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2872			   _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2873			   0 /* write-only, so skip validation */,
2874			   true);
2875	}
2876}
2877
2878static void
2879engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2880{
2881	if (GRAPHICS_VER(engine->i915) < 4)
2882		return;
2883
2884	engine_fake_wa_init(engine, wal);
2885
2886	/*
2887	 * These are common workarounds that just need to be applied
2888	 * to a single RCS/CCS engine's workaround list since
2889	 * they're reset as part of the general render domain reset.
2890	 */
2891	if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) {
2892		general_render_compute_wa_init(engine, wal);
2893		ccs_engine_wa_mode(engine, wal);
2894	}
2895
2896	if (engine->class == COMPUTE_CLASS)
2897		ccs_engine_wa_init(engine, wal);
2898	else if (engine->class == RENDER_CLASS)
2899		rcs_engine_wa_init(engine, wal);
2900	else
2901		xcs_engine_wa_init(engine, wal);
2902}
2903
2904void intel_engine_init_workarounds(struct intel_engine_cs *engine)
2905{
2906	struct i915_wa_list *wal = &engine->wa_list;
2907
2908	wa_init_start(wal, engine->gt, "engine", engine->name);
2909	engine_init_workarounds(engine, wal);
2910	wa_init_finish(wal);
2911}
2912
2913void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
2914{
2915	wa_list_apply(&engine->wa_list);
2916}
2917
2918static const struct i915_range mcr_ranges_gen8[] = {
2919	{ .start = 0x5500, .end = 0x55ff },
2920	{ .start = 0x7000, .end = 0x7fff },
2921	{ .start = 0x9400, .end = 0x97ff },
2922	{ .start = 0xb000, .end = 0xb3ff },
2923	{ .start = 0xe000, .end = 0xe7ff },
2924	{},
2925};
2926
2927static const struct i915_range mcr_ranges_gen12[] = {
2928	{ .start =  0x8150, .end =  0x815f },
2929	{ .start =  0x9520, .end =  0x955f },
2930	{ .start =  0xb100, .end =  0xb3ff },
2931	{ .start =  0xde80, .end =  0xe8ff },
2932	{ .start = 0x24a00, .end = 0x24a7f },
2933	{},
2934};
2935
2936static const struct i915_range mcr_ranges_xehp[] = {
2937	{ .start =  0x4000, .end =  0x4aff },
2938	{ .start =  0x5200, .end =  0x52ff },
2939	{ .start =  0x5400, .end =  0x7fff },
2940	{ .start =  0x8140, .end =  0x815f },
2941	{ .start =  0x8c80, .end =  0x8dff },
2942	{ .start =  0x94d0, .end =  0x955f },
2943	{ .start =  0x9680, .end =  0x96ff },
2944	{ .start =  0xb000, .end =  0xb3ff },
2945	{ .start =  0xc800, .end =  0xcfff },
2946	{ .start =  0xd800, .end =  0xd8ff },
2947	{ .start =  0xdc00, .end =  0xffff },
2948	{ .start = 0x17000, .end = 0x17fff },
2949	{ .start = 0x24a00, .end = 0x24a7f },
2950	{},
2951};
2952
2953static bool mcr_range(struct drm_i915_private *i915, u32 offset)
2954{
2955	const struct i915_range *mcr_ranges;
2956	int i;
2957
2958	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55))
2959		mcr_ranges = mcr_ranges_xehp;
2960	else if (GRAPHICS_VER(i915) >= 12)
2961		mcr_ranges = mcr_ranges_gen12;
2962	else if (GRAPHICS_VER(i915) >= 8)
2963		mcr_ranges = mcr_ranges_gen8;
2964	else
2965		return false;
2966
2967	/*
2968	 * Registers in these ranges are affected by the MCR selector
2969	 * which only controls CPU initiated MMIO. Routing does not
2970	 * work for CS access so we cannot verify them on this path.
2971	 */
2972	for (i = 0; mcr_ranges[i].start; i++)
2973		if (offset >= mcr_ranges[i].start &&
2974		    offset <= mcr_ranges[i].end)
2975			return true;
2976
2977	return false;
2978}
2979
2980static int
2981wa_list_srm(struct i915_request *rq,
2982	    const struct i915_wa_list *wal,
2983	    struct i915_vma *vma)
2984{
2985	struct drm_i915_private *i915 = rq->i915;
2986	unsigned int i, count = 0;
2987	const struct i915_wa *wa;
2988	u32 srm, *cs;
2989
2990	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
2991	if (GRAPHICS_VER(i915) >= 8)
2992		srm++;
2993
2994	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2995		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
2996			count++;
2997	}
2998
2999	cs = intel_ring_begin(rq, 4 * count);
3000	if (IS_ERR(cs))
3001		return PTR_ERR(cs);
3002
3003	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3004		u32 offset = i915_mmio_reg_offset(wa->reg);
3005
3006		if (mcr_range(i915, offset))
3007			continue;
3008
3009		*cs++ = srm;
3010		*cs++ = offset;
3011		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3012		*cs++ = 0;
3013	}
3014	intel_ring_advance(rq, cs);
3015
3016	return 0;
3017}
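
/*
 * Illustrative sketch, not part of the driver: each non-MCR entry in the
 * loop above becomes one 4-dword MI_STORE_REGISTER_MEM packet.  The
 * helper below only mirrors that layout; the offset and GGTT address
 * arguments are whatever the caller chooses.
 */
static inline u32 *example_emit_one_srm(u32 *cs, u32 offset, u32 ggtt_addr,
					bool gen8_plus)
{
	*cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT |
		(gen8_plus ? 1 : 0);	/* gen8+ bumps the DWord length */
	*cs++ = offset;			/* MMIO offset to capture */
	*cs++ = ggtt_addr;		/* destination in the GGTT */
	*cs++ = 0;			/* upper address bits / padding */

	return cs;
}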
3018
3019static int engine_wa_list_verify(struct intel_context *ce,
3020				 const struct i915_wa_list * const wal,
3021				 const char *from)
3022{
3023	const struct i915_wa *wa;
3024	struct i915_request *rq;
3025	struct i915_vma *vma;
3026	struct i915_gem_ww_ctx ww;
3027	unsigned int i;
3028	u32 *results;
3029	int err;
3030
3031	if (!wal->count)
3032		return 0;
3033
3034	vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3035					   wal->count * sizeof(u32));
3036	if (IS_ERR(vma))
3037		return PTR_ERR(vma);
3038
3039	intel_engine_pm_get(ce->engine);
3040	i915_gem_ww_ctx_init(&ww, false);
3041retry:
3042	err = i915_gem_object_lock(vma->obj, &ww);
3043	if (err == 0)
3044		err = intel_context_pin_ww(ce, &ww);
3045	if (err)
3046		goto err_pm;
3047
3048	err = i915_vma_pin_ww(vma, &ww, 0, 0,
3049			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3050	if (err)
3051		goto err_unpin;
3052
3053	rq = i915_request_create(ce);
3054	if (IS_ERR(rq)) {
3055		err = PTR_ERR(rq);
3056		goto err_vma;
3057	}
3058
3059	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
3060	if (err == 0)
3061		err = wa_list_srm(rq, wal, vma);
3062
3063	i915_request_get(rq);
3064	if (err)
3065		i915_request_set_error_once(rq, err);
3066	i915_request_add(rq);
3067
3068	if (err)
3069		goto err_rq;
3070
3071	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3072		err = -ETIME;
3073		goto err_rq;
3074	}
3075
3076	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3077	if (IS_ERR(results)) {
3078		err = PTR_ERR(results);
3079		goto err_rq;
3080	}
3081
3082	err = 0;
3083	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3084		if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
3085			continue;
3086
3087		if (!wa_verify(wal->gt, wa, results[i], wal->name, from))
3088			err = -ENXIO;
3089	}
3090
3091	i915_gem_object_unpin_map(vma->obj);
3092
3093err_rq:
3094	i915_request_put(rq);
3095err_vma:
3096	i915_vma_unpin(vma);
3097err_unpin:
3098	intel_context_unpin(ce);
3099err_pm:
3100	if (err == -EDEADLK) {
3101		err = i915_gem_ww_ctx_backoff(&ww);
3102		if (!err)
3103			goto retry;
3104	}
3105	i915_gem_ww_ctx_fini(&ww);
3106	intel_engine_pm_put(ce->engine);
3107	i915_vma_put(vma);
3108	return err;
3109}
3110
3111int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3112				    const char *from)
3113{
3114	return engine_wa_list_verify(engine->kernel_context,
3115				     &engine->wa_list,
3116				     from);
3117}
3118
3119#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3120#include "selftest_workarounds.c"
3121#endif
v6.8
   1// SPDX-License-Identifier: MIT
   2/*
   3 * Copyright © 2014-2018 Intel Corporation
   4 */
   5
   6#include "i915_drv.h"
   7#include "i915_reg.h"
   8#include "intel_context.h"
   9#include "intel_engine_pm.h"
  10#include "intel_engine_regs.h"
  11#include "intel_gpu_commands.h"
  12#include "intel_gt.h"
 
  13#include "intel_gt_mcr.h"
  14#include "intel_gt_print.h"
  15#include "intel_gt_regs.h"
  16#include "intel_ring.h"
  17#include "intel_workarounds.h"
  18
 
 
  19/**
  20 * DOC: Hardware workarounds
  21 *
  22 * Hardware workarounds are register programming documented to be executed in
  23 * the driver that fall outside of the normal programming sequences for a
  24 * platform. There are some basic categories of workarounds, depending on
  25 * how/when they are applied:
  26 *
  27 * - Context workarounds: workarounds that touch registers that are
  28 *   saved/restored to/from the HW context image. The list is emitted (via Load
  29 *   Register Immediate commands) once when initializing the device and saved in
  30 *   the default context. That default context is then used on every context
  31 *   creation to have a "primed golden context", i.e. a context image that
  32 *   already contains the changes needed to all the registers.
  33 *
  34 *   Context workarounds should be implemented in the \*_ctx_workarounds_init()
  35 *   variants respective to the targeted platforms.
  36 *
  37 * - Engine workarounds: the list of these WAs is applied whenever the specific
  38 *   engine is reset. It's also possible that a set of engine classes share a
  39 *   common power domain and they are reset together. This happens on some
  40 *   platforms with render and compute engines. In this case (at least) one of
  41 *   them need to keeep the workaround programming: the approach taken in the
  42 *   driver is to tie those workarounds to the first compute/render engine that
  43 *   is registered.  When executing with GuC submission, engine resets are
  44 *   outside of kernel driver control, hence the list of registers involved in
  45 *   written once, on engine initialization, and then passed to GuC, that
  46 *   saves/restores their values before/after the reset takes place. See
  47 *   ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference.
  48 *
  49 *   Workarounds for registers specific to RCS and CCS should be implemented in
  50 *   rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for
  51 *   registers belonging to BCS, VCS or VECS should be implemented in
  52 *   xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
  53 *   engine's MMIO range but that are part of of the common RCS/CCS reset domain
  54 *   should be implemented in general_render_compute_wa_init().
 
  55 *
  56 * - GT workarounds: the list of these WAs is applied whenever these registers
  57 *   revert to their default values: on GPU reset, suspend/resume [1]_, etc.
  58 *
  59 *   GT workarounds should be implemented in the \*_gt_workarounds_init()
  60 *   variants respective to the targeted platforms.
  61 *
  62 * - Register whitelist: some workarounds need to be implemented in userspace,
  63 *   but need to touch privileged registers. The whitelist in the kernel
  64 *   instructs the hardware to allow the access to happen. From the kernel side,
  65 *   this is just a special case of a MMIO workaround (as we write the list of
  66 *   these to/be-whitelisted registers to some special HW registers).
  67 *
  68 *   Register whitelisting should be done in the \*_whitelist_build() variants
  69 *   respective to the targeted platforms.
  70 *
  71 * - Workaround batchbuffers: buffers that get executed automatically by the
  72 *   hardware on every HW context restore. These buffers are created and
  73 *   programmed in the default context so the hardware always go through those
  74 *   programming sequences when switching contexts. The support for workaround
  75 *   batchbuffers is enabled these hardware mechanisms:
  76 *
  77 *   #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default
  78 *      context, pointing the hardware to jump to that location when that offset
  79 *      is reached in the context restore. Workaround batchbuffer in the driver
  80 *      currently uses this mechanism for all platforms.
  81 *
  82 *   #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context,
  83 *      pointing the hardware to a buffer to continue executing after the
  84 *      engine registers are restored in a context restore sequence. This is
  85 *      currently not used in the driver.
  86 *
  87 * - Other:  There are WAs that, due to their nature, cannot be applied from a
  88 *   central place. Those are peppered around the rest of the code, as needed.
  89 *   Workarounds related to the display IP are the main example.
  90 *
  91 * .. [1] Technically, some registers are powercontext saved & restored, so they
  92 *    survive a suspend/resume. In practice, writing them again is not too
  93 *    costly and simplifies things, so it's the approach taken in the driver.
  94 */
  95
  96static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt,
  97			  const char *name, const char *engine_name)
  98{
  99	wal->gt = gt;
 100	wal->name = name;
 101	wal->engine_name = engine_name;
 102}
 103
 104#define WA_LIST_CHUNK (1 << 4)
 105
 106static void wa_init_finish(struct i915_wa_list *wal)
 107{
 108	/* Trim unused entries. */
 109	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
 110		struct i915_wa *list = kmemdup(wal->list,
 111					       wal->count * sizeof(*list),
 112					       GFP_KERNEL);
 113
 114		if (list) {
 115			kfree(wal->list);
 116			wal->list = list;
 117		}
 118	}
 119
 120	if (!wal->count)
 121		return;
 122
 123	gt_dbg(wal->gt, "Initialized %u %s workarounds on %s\n",
 124	       wal->wa_count, wal->name, wal->engine_name);
 125}
 126
 127static enum forcewake_domains
 128wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
 129{
 130	enum forcewake_domains fw = 0;
 131	struct i915_wa *wa;
 132	unsigned int i;
 133
 134	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
 135		fw |= intel_uncore_forcewake_for_reg(uncore,
 136						     wa->reg,
 137						     FW_REG_READ |
 138						     FW_REG_WRITE);
 139
 140	return fw;
 141}
 142
 143static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
 144{
 145	unsigned int addr = i915_mmio_reg_offset(wa->reg);
 146	struct drm_i915_private *i915 = wal->gt->i915;
 147	unsigned int start = 0, end = wal->count;
 148	const unsigned int grow = WA_LIST_CHUNK;
 149	struct i915_wa *wa_;
 150
 151	GEM_BUG_ON(!is_power_of_2(grow));
 152
 153	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
 154		struct i915_wa *list;
 155
 156		list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
 157				     GFP_KERNEL);
 158		if (!list) {
 159			drm_err(&i915->drm, "No space for workaround init!\n");
 160			return;
 161		}
 162
 163		if (wal->list) {
 164			memcpy(list, wal->list, sizeof(*wa) * wal->count);
 165			kfree(wal->list);
 166		}
 167
 168		wal->list = list;
 169	}
 170
 171	while (start < end) {
 172		unsigned int mid = start + (end - start) / 2;
 173
 174		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
 175			start = mid + 1;
 176		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
 177			end = mid;
 178		} else {
 179			wa_ = &wal->list[mid];
 180
 181			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
 182				drm_err(&i915->drm,
 183					"Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
 184					i915_mmio_reg_offset(wa_->reg),
 185					wa_->clr, wa_->set);
 186
 187				wa_->set &= ~wa->clr;
 188			}
 189
 190			wal->wa_count++;
 191			wa_->set |= wa->set;
 192			wa_->clr |= wa->clr;
 193			wa_->read |= wa->read;
 194			return;
 195		}
 196	}
 197
 198	wal->wa_count++;
 199	wa_ = &wal->list[wal->count++];
 200	*wa_ = *wa;
 201
 202	while (wa_-- > wal->list) {
 203		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
 204			   i915_mmio_reg_offset(wa_[1].reg));
 205		if (i915_mmio_reg_offset(wa_[1].reg) >
 206		    i915_mmio_reg_offset(wa_[0].reg))
 207			break;
 208
 209		swap(wa_[1], wa_[0]);
 210	}
 211}
 212
 213static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
 214		   u32 clear, u32 set, u32 read_mask, bool masked_reg)
 215{
 216	struct i915_wa wa = {
 217		.reg  = reg,
 218		.clr  = clear,
 219		.set  = set,
 220		.read = read_mask,
 221		.masked_reg = masked_reg,
 222	};
 223
 224	_wa_add(wal, &wa);
 225}
 226
 227static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
 228		       u32 clear, u32 set, u32 read_mask, bool masked_reg)
 229{
 230	struct i915_wa wa = {
 231		.mcr_reg = reg,
 232		.clr  = clear,
 233		.set  = set,
 234		.read = read_mask,
 235		.masked_reg = masked_reg,
 236		.is_mcr = 1,
 237	};
 238
 239	_wa_add(wal, &wa);
 240}
 241
 242static void
 243wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
 244{
 245	wa_add(wal, reg, clear, set, clear | set, false);
 246}
 247
 248static void
 249wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
 250{
 251	wa_mcr_add(wal, reg, clear, set, clear | set, false);
 252}
 253
 254static void
 255wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 256{
 257	wa_write_clr_set(wal, reg, ~0, set);
 258}
 259
 260static void
 261wa_mcr_write(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
 262{
 263	wa_mcr_write_clr_set(wal, reg, ~0, set);
 264}
 265
 266static void
 267wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 268{
 269	wa_write_clr_set(wal, reg, set, set);
 270}
 271
 272static void
 273wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
 274{
 275	wa_mcr_write_clr_set(wal, reg, set, set);
 276}
 277
 278static void
 279wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
 280{
 281	wa_write_clr_set(wal, reg, clr, 0);
 282}
 283
 284static void
 285wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
 286{
 287	wa_mcr_write_clr_set(wal, reg, clr, 0);
 288}
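
/*
 * To summarize the helpers above: wa_write() overwrites the whole register
 * (clear mask ~0), wa_write_or() only guarantees the given bits end up set
 * (clear == set), and wa_write_clr() only guarantees they end up cleared.
 * For example, wa_write_or(wal, SOME_REG, BIT(5)) records an entry with
 * .clr == .set == BIT(5); SOME_REG is a placeholder, not a real register.
 */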
 289
 290/*
 291 * WA operations on "masked register". A masked register has the upper 16 bits
 292 * documented as "masked" in b-spec. Its purpose is to allow writing to just a
 293 * portion of the register without a rmw: you simply write, in the upper 16
 294 * bits, the mask of the bits you are going to modify.
 295 *
 296 * The wa_masked_* family of functions already does the necessary operations to
 297 * calculate the mask based on the parameters passed, so the user only has to
 298 * provide the lower 16 bits of that register.
 299 */
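
/*
 * For example, _MASKED_BIT_ENABLE(BIT(3)) expands to (BIT(3) << 16) | BIT(3):
 * the upper half selects which bit the write touches and the lower half
 * carries its new value, so wa_masked_en(wal, reg, BIT(3)) sets bit 3
 * without a read-modify-write.
 */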
 300
 301static void
 302wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 303{
 304	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
 305}
 306
 307static void
 308wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
 309{
 310	wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
 311}
 312
 313static void
 314wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 315{
 316	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
 317}
 318
 319static void
 320wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
 321{
 322	wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
 323}
 324
 325static void
 326wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
 327		    u32 mask, u32 val)
 328{
 329	wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
 330}
 331
 332static void
 333wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
 334			u32 mask, u32 val)
 335{
 336	wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
 337}
 338
 339static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
 340				      struct i915_wa_list *wal)
 341{
 342	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 343}
 344
 345static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
 346				      struct i915_wa_list *wal)
 347{
 348	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 349}
 350
 351static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
 352				      struct i915_wa_list *wal)
 353{
 354	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 355
 356	/* WaDisableAsyncFlipPerfMode:bdw,chv */
 357	wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
 358
 359	/* WaDisablePartialInstShootdown:bdw,chv */
 360	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 361			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 362
 363	/* Use Force Non-Coherent whenever executing a 3D context. This is a
 364	 * workaround for a possible hang in the unlikely event a TLB
 365	 * invalidation occurs during a PSD flush.
 366	 */
 367	/* WaForceEnableNonCoherent:bdw,chv */
 368	/* WaHdcDisableFetchWhenMasked:bdw,chv */
 369	wa_masked_en(wal, HDC_CHICKEN0,
 370		     HDC_DONOT_FETCH_MEM_WHEN_MASKED |
 371		     HDC_FORCE_NON_COHERENT);
 372
 373	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
 374	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
 375	 *  polygons in the same 8x4 pixel/sample area to be processed without
 376	 *  stalling waiting for the earlier ones to write to Hierarchical Z
 377	 *  buffer."
 378	 *
 379	 * This optimization is off by default for BDW and CHV; turn it on.
 380	 */
 381	wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
 382
 383	/* Wa4x4STCOptimizationDisable:bdw,chv */
 384	wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
 385
 386	/*
 387	 * BSpec recommends 8x4 when MSAA is used,
 388	 * however in practice 16x4 seems fastest.
 389	 *
 390	 * Note that PS/WM thread counts depend on the WIZ hashing
 391	 * disable bit, which we don't touch here, but it's good
 392	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 393	 */
 394	wa_masked_field_set(wal, GEN7_GT_MODE,
 395			    GEN6_WIZ_HASHING_MASK,
 396			    GEN6_WIZ_HASHING_16x4);
 397}
 398
 399static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
 400				     struct i915_wa_list *wal)
 401{
 402	struct drm_i915_private *i915 = engine->i915;
 403
 404	gen8_ctx_workarounds_init(engine, wal);
 405
 406	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
 407	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 408
 409	/* WaDisableDopClockGating:bdw
 410	 *
 411	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
 412	 * to disable EUTC clock gating.
 413	 */
 414	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
 415			 DOP_CLOCK_GATING_DISABLE);
 416
 417	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
 418			 GEN8_SAMPLER_POWER_BYPASS_DIS);
 419
 420	wa_masked_en(wal, HDC_CHICKEN0,
 421		     /* WaForceContextSaveRestoreNonCoherent:bdw */
 422		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 423		     /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
 424		     (IS_BROADWELL_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
 425}
 426
 427static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
 428				     struct i915_wa_list *wal)
 429{
 430	gen8_ctx_workarounds_init(engine, wal);
 431
 432	/* WaDisableThreadStallDopClockGating:chv */
 433	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 434
 435	/* Improve HiZ throughput on CHV. */
 436	wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
 437}
 438
 439static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
 440				      struct i915_wa_list *wal)
 441{
 442	struct drm_i915_private *i915 = engine->i915;
 443
 444	if (HAS_LLC(i915)) {
 445		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
 446		 *
 447		 * Must match Display Engine. See
 448		 * WaCompressedResourceDisplayNewHashMode.
 449		 */
 450		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 451			     GEN9_PBE_COMPRESSED_HASH_SELECTION);
 452		wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
 453				 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
 454	}
 455
 456	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
 457	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
 458	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 459			 FLOW_CONTROL_ENABLE |
 460			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 461
 462	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
 463	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
 464	wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
 465			 GEN9_ENABLE_YV12_BUGFIX |
 466			 GEN9_ENABLE_GPGPU_PREEMPTION);
 467
 468	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
 469	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
 470	wa_masked_en(wal, CACHE_MODE_1,
 471		     GEN8_4x4_STC_OPTIMIZATION_DISABLE |
 472		     GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
 473
 474	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
 475	wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
 476			  GEN9_CCS_TLB_PREFETCH_ENABLE);
 477
 478	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
 479	wa_masked_en(wal, HDC_CHICKEN0,
 480		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 481		     HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
 482
 483	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
 484	 * both tied to WaForceContextSaveRestoreNonCoherent
 485	 * in some hsds for skl. We keep the tie for all gen9. The
 486	 * documentation is a bit hazy and so we want to get common behaviour,
 487	 * even though there is no clear evidence we would need both on kbl/bxt.
 488	 * This area has been a source of system hangs so we play it safe
 489	 * and mimic the skl regardless of what bspec says.
 490	 *
 491	 * Use Force Non-Coherent whenever executing a 3D context. This
 492	 * is a workaround for a possible hang in the unlikely event
 493	 * a TLB invalidation occurs during a PSD flush.
 494	 */
 495
 496	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
 497	wa_masked_en(wal, HDC_CHICKEN0,
 498		     HDC_FORCE_NON_COHERENT);
 499
 500	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
 501	if (IS_SKYLAKE(i915) ||
 502	    IS_KABYLAKE(i915) ||
 503	    IS_COFFEELAKE(i915) ||
 504	    IS_COMETLAKE(i915))
 505		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
 506				 GEN8_SAMPLER_POWER_BYPASS_DIS);
 507
 508	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
 509	wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
 510
 511	/*
 512	 * Supporting preemption with fine-granularity requires changes in the
 513	 * batch buffer programming. Since we can't break old userspace, we
 514	 * need to set our default preemption level to safe value. Userspace is
 515	 * still able to use more fine-grained preemption levels, since in
 516	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
 517	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
 518	 * not real HW workarounds, but merely a way to start using preemption
 519	 * while maintaining old contract with userspace.
 520	 */
 521
 522	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
 523	wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
 524
 525	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
 526	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 527			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 528			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 529
 530	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
 531	if (IS_GEN9_LP(i915))
 532		wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
 533}
 534
 535static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
 536				struct i915_wa_list *wal)
 537{
 538	struct intel_gt *gt = engine->gt;
 539	u8 vals[3] = { 0, 0, 0 };
 540	unsigned int i;
 541
 542	for (i = 0; i < 3; i++) {
 543		u8 ss;
 544
 545		/*
 546		 * Only consider slices where one, and only one, subslice has 7
 547		 * EUs
 548		 */
 549		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
 550			continue;
 551
 552		/*
 553		 * subslice_7eu[i] != 0 (because of the check above) and
 554		 * ss_max == 4 (maximum number of subslices possible per slice)
 555		 *
 556		 * ->    0 <= ss <= 3;
 557		 */
 558		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
 559		vals[i] = 3 - ss;
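		/*
		 * e.g. subslice_7eu[i] == BIT(2) gives ss == 2 and
		 * vals[i] == 1 (illustrative values only).
		 */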
 560	}
 561
 562	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
 563		return;
 564
 565	/* Tune IZ hashing. See intel_device_info_runtime_init() */
 566	wa_masked_field_set(wal, GEN7_GT_MODE,
 567			    GEN9_IZ_HASHING_MASK(2) |
 568			    GEN9_IZ_HASHING_MASK(1) |
 569			    GEN9_IZ_HASHING_MASK(0),
 570			    GEN9_IZ_HASHING(2, vals[2]) |
 571			    GEN9_IZ_HASHING(1, vals[1]) |
 572			    GEN9_IZ_HASHING(0, vals[0]));
 573}
 574
 575static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
 576				     struct i915_wa_list *wal)
 577{
 578	gen9_ctx_workarounds_init(engine, wal);
 579	skl_tune_iz_hashing(engine, wal);
 580}
 581
 582static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
 583				     struct i915_wa_list *wal)
 584{
 585	gen9_ctx_workarounds_init(engine, wal);
 586
 587	/* WaDisableThreadStallDopClockGating:bxt */
 588	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 589			 STALL_DOP_GATING_DISABLE);
 590
 591	/* WaToEnableHwFixForPushConstHWBug:bxt */
 592	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 593		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 594}
 595
 596static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
 597				     struct i915_wa_list *wal)
 598{
 599	struct drm_i915_private *i915 = engine->i915;
 600
 601	gen9_ctx_workarounds_init(engine, wal);
 602
 603	/* WaToEnableHwFixForPushConstHWBug:kbl */
 604	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
 605		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 606			     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 607
 608	/* WaDisableSbeCacheDispatchPortSharing:kbl */
 609	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
 610			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 611}
 612
 613static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
 614				     struct i915_wa_list *wal)
 615{
 616	gen9_ctx_workarounds_init(engine, wal);
 617
 618	/* WaToEnableHwFixForPushConstHWBug:glk */
 619	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 620		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 621}
 622
 623static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
 624				     struct i915_wa_list *wal)
 625{
 626	gen9_ctx_workarounds_init(engine, wal);
 627
 628	/* WaToEnableHwFixForPushConstHWBug:cfl */
 629	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 630		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 631
 632	/* WaDisableSbeCacheDispatchPortSharing:cfl */
 633	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
 634			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 635}
 636
 637static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
 638				     struct i915_wa_list *wal)
 639{
 640	/* Wa_1406697149 (WaDisableBankHangMode:icl) */
 641	wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);
 642
 643	/* WaForceEnableNonCoherent:icl
 644	 * This is not the same workaround as in early Gen9 platforms, where
 645	 * lacking this could cause system hangs, but coherency performance
 646	 * overhead is high and only a few compute workloads really need it
 647	 * (the register is whitelisted in hardware now, so UMDs can opt in
 648	 * for coherency if they have a good reason).
 649	 */
 650	wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
 651
 652	/* WaEnableFloatBlendOptimization:icl */
 653	wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
 654		   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
 655		   0 /* write-only, so skip validation */,
 656		   true);
 657
 658	/* WaDisableGPGPUMidThreadPreemption:icl */
 659	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 660			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 661			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 662
 663	/* allow headerless messages for preemptible GPGPU context */
 664	wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
 665			 GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
 666
 667	/* Wa_1604278689:icl,ehl */
 668	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
 669	wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
 670			 0,
 671			 0xFFFFFFFF);
 672
 673	/* Wa_1406306137:icl,ehl */
 674	wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
 675}
 676
 677/*
 678 * These settings aren't actually workarounds, but general tuning settings that
 679 * need to be programmed on the dg2 platform.
 680 */
 681static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
 682				   struct i915_wa_list *wal)
 683{
 684	wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
 685	wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
 686			     REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
 687	wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
 688			     FF_MODE2_TDS_TIMER_128);
 689}
 690
 691static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
 692				       struct i915_wa_list *wal)
 693{
 694	struct drm_i915_private *i915 = engine->i915;
 695
 696	/*
 697	 * Wa_1409142259:tgl,dg1,adl-p
 698	 * Wa_1409347922:tgl,dg1,adl-p
 699	 * Wa_1409252684:tgl,dg1,adl-p
 700	 * Wa_1409217633:tgl,dg1,adl-p
 701	 * Wa_1409207793:tgl,dg1,adl-p
 702	 * Wa_1409178076:tgl,dg1,adl-p
 703	 * Wa_1408979724:tgl,dg1,adl-p
 704	 * Wa_14010443199:tgl,rkl,dg1,adl-p
 705	 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
 706	 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
 707	 */
 708	wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
 709		     GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
 710
 711	/* WaDisableGPGPUMidThreadPreemption:gen12 */
 712	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 713			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 714			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 715
 716	/*
 717	 * Wa_16011163337 - GS_TIMER
 718	 *
 719	 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we
 720	 * need to program it even on those that don't explicitly list that
 721	 * workaround.
 722	 *
 723	 * Note that the programming of GEN12_FF_MODE2 is further modified
 724	 * according to the FF_MODE2 guidance given by Wa_1608008084.
 725	 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
 726	 * value when read from the CPU.
 727	 *
 728	 * The default value for this register is zero for all fields.
 729	 * So instead of doing a RMW we should just write the desired values
 730	 * for TDS and GS timers. Note that since the readback can't be trusted,
 731	 * the clear mask is just set to ~0 to make sure other bits are not
 732	 * inadvertently set. For the same reason read verification is ignored.
 733	 */
 734	wa_add(wal,
 735	       GEN12_FF_MODE2,
 736	       ~0,
 737	       FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224,
 738	       0, false);
 739
 740	if (!IS_DG1(i915)) {
 741		/* Wa_1806527549 */
 742		wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
 743
 744		/* Wa_1606376872 */
 745		wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC);
 746	}
 747}
 748
 749static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
 750				     struct i915_wa_list *wal)
 751{
 752	gen12_ctx_workarounds_init(engine, wal);
 753
 754	/* Wa_1409044764 */
 755	wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
 756		      DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
 757
 758	/* Wa_22010493298 */
 759	wa_masked_en(wal, HIZ_CHICKEN,
 760		     DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
 761}
 762
 763static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
 764				     struct i915_wa_list *wal)
 765{
 766	dg2_ctx_gt_tuning_init(engine, wal);
 767
 768	/* Wa_16013271637:dg2 */
 769	wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
 770			 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
 771
 772	/* Wa_14014947963:dg2 */
 773	wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
 774
 775	/* Wa_18018764978:dg2 */
 776	wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
 777
 778	/* Wa_18019271663:dg2 */
 779	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
 780
 781	/* Wa_14019877138:dg2 */
 782	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
 783}
 784
 785static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine,
 786				     struct i915_wa_list *wal)
 787{
 788	struct intel_gt *gt = engine->gt;
 789
 790	dg2_ctx_gt_tuning_init(engine, wal);
 791
 792	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
 793	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER))
 794		wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false);
 795}
 796
 797static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine,
 798				       struct i915_wa_list *wal)
 799{
 800	struct intel_gt *gt = engine->gt;
 801
 802	xelpg_ctx_gt_tuning_init(engine, wal);
 803
 804	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
 805	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
 806		/* Wa_14014947963 */
 807		wa_masked_field_set(wal, VF_PREEMPTION,
 808				    PREEMPTION_VERTEX_COUNT, 0x4000);
 809
 810		/* Wa_16013271637 */
 811		wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
 812				 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
 813
 814		/* Wa_18019627453 */
 815		wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS);
 816
 817		/* Wa_18018764978 */
 818		wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
 819	}
 820
 821	/* Wa_18019271663 */
 822	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
 823}
 824
 825static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
 826					 struct i915_wa_list *wal)
 827{
 828	/*
 829	 * This is a "fake" workaround defined by software to ensure we
 830	 * maintain reliable, backward-compatible behavior for userspace with
 831	 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
 832	 *
 833	 * The per-context setting of MI_MODE[12] determines whether the bits
 834	 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
 835	 * in the traditional manner or whether they should instead use a new
 836	 * tgl+ meaning that breaks backward compatibility, but allows nesting
 837	 * into 3rd-level batchbuffers.  When this new capability was first
 838	 * added in TGL, it remained off by default unless a context
 839	 * intentionally opted in to the new behavior.  However Xe_HPG now
 840	 * flips this on by default and requires that we explicitly opt out if
 841	 * we don't want the new behavior.
 842	 *
 843	 * From a SW perspective, we want to maintain the backward-compatible
 844	 * behavior for userspace, so we'll apply a fake workaround to set it
 845	 * back to the legacy behavior on platforms where the hardware default
 846	 * is to break compatibility.  At the moment there is no Linux
 847	 * userspace that utilizes third-level batchbuffers, so this avoids
 848	 * userspace needing to make any changes.  Using the legacy
 849	 * meaning is the correct thing to do.  If/when we have userspace
 850	 * consumers that want to utilize third-level batch nesting, we can
 851	 * provide a context parameter to allow them to opt-in.
 852	 */
 853	wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
 854}
 855
 856static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
 857				   struct i915_wa_list *wal)
 858{
 859	u8 mocs;
 860
 861	/*
 862	 * Some blitter commands do not have a field for MOCS; those
 863	 * commands will use the MOCS index pointed to by BLIT_CCTL.
 864	 * BLIT_CCTL registers need to be programmed to un-cached.
 865	 */
 866	if (engine->class == COPY_ENGINE_CLASS) {
 867		mocs = engine->gt->mocs.uc_index;
 868		wa_write_clr_set(wal,
 869				 BLIT_CCTL(engine->mmio_base),
 870				 BLIT_CCTL_MASK,
 871				 BLIT_CCTL_MOCS(mocs, mocs));
 872	}
 873}
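
/*
 * Effect sketch: with, say, uc_index == 3 (example index only), BLIT_CCTL is
 * programmed so both source and destination MOCS point at entry 3, i.e.
 * blitter commands without their own MOCS field use the un-cached mapping.
 */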
 874
 875/*
 876 * gen12_ctx_gt_fake_wa_init() doesn't program an official workaround
 877 * defined by the hardware team; it programs general context registers.
 878 * Adding that context register programming to the context workaround list
 879 * allows us to use the wa framework for proper application and validation.
 880 */
 881static void
 882gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
 883			  struct i915_wa_list *wal)
 884{
 885	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 886		fakewa_disable_nestedbb_mode(engine, wal);
 887
 888	gen12_ctx_gt_mocs_init(engine, wal);
 889}
 890
 891static void
 892__intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
 893			   struct i915_wa_list *wal,
 894			   const char *name)
 895{
 896	struct drm_i915_private *i915 = engine->i915;
 897
 898	wa_init_start(wal, engine->gt, name, engine->name);
 899
 900	/* Applies to all engines */
 901	/*
 902	 * Fake workarounds are not actual workarounds, but rather
 903	 * programming of context registers using the workaround framework.
 904	 */
 905	if (GRAPHICS_VER(i915) >= 12)
 906		gen12_ctx_gt_fake_wa_init(engine, wal);
 907
 908	if (engine->class != RENDER_CLASS)
 909		goto done;
 910
 911	if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 71)))
 912		xelpg_ctx_workarounds_init(engine, wal);
 913	else if (IS_PONTEVECCHIO(i915))
 914		; /* noop; none at this time */
 915	else if (IS_DG2(i915))
 916		dg2_ctx_workarounds_init(engine, wal);
 917	else if (IS_XEHPSDV(i915))
 918		; /* noop; none at this time */
 919	else if (IS_DG1(i915))
 920		dg1_ctx_workarounds_init(engine, wal);
 921	else if (GRAPHICS_VER(i915) == 12)
 922		gen12_ctx_workarounds_init(engine, wal);
 923	else if (GRAPHICS_VER(i915) == 11)
 924		icl_ctx_workarounds_init(engine, wal);
 925	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
 926		cfl_ctx_workarounds_init(engine, wal);
 927	else if (IS_GEMINILAKE(i915))
 928		glk_ctx_workarounds_init(engine, wal);
 929	else if (IS_KABYLAKE(i915))
 930		kbl_ctx_workarounds_init(engine, wal);
 931	else if (IS_BROXTON(i915))
 932		bxt_ctx_workarounds_init(engine, wal);
 933	else if (IS_SKYLAKE(i915))
 934		skl_ctx_workarounds_init(engine, wal);
 935	else if (IS_CHERRYVIEW(i915))
 936		chv_ctx_workarounds_init(engine, wal);
 937	else if (IS_BROADWELL(i915))
 938		bdw_ctx_workarounds_init(engine, wal);
 939	else if (GRAPHICS_VER(i915) == 7)
 940		gen7_ctx_workarounds_init(engine, wal);
 941	else if (GRAPHICS_VER(i915) == 6)
 942		gen6_ctx_workarounds_init(engine, wal);
 943	else if (GRAPHICS_VER(i915) < 8)
 944		;
 945	else
 946		MISSING_CASE(GRAPHICS_VER(i915));
 947
 948done:
 949	wa_init_finish(wal);
 950}
 951
 952void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
 953{
 954	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
 955}
 956
 957int intel_engine_emit_ctx_wa(struct i915_request *rq)
 958{
 959	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
 960	struct intel_uncore *uncore = rq->engine->uncore;
 961	enum forcewake_domains fw;
 962	unsigned long flags;
 963	struct i915_wa *wa;
 964	unsigned int i;
 965	u32 *cs;
 966	int ret;
 967
 968	if (wal->count == 0)
 969		return 0;
 970
 971	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
 972	if (ret)
 973		return ret;
 974
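	/*
	 * Space estimate: one MI_LOAD_REGISTER_IMM header dword, two dwords
	 * (offset, value) per workaround, plus a trailing MI_NOOP to keep the
	 * emission an even number of dwords -- hence wal->count * 2 + 2.
	 */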
 975	cs = intel_ring_begin(rq, (wal->count * 2 + 2));
 976	if (IS_ERR(cs))
 977		return PTR_ERR(cs);
 978
 979	fw = wal_get_fw_for_rmw(uncore, wal);
 980
 981	intel_gt_mcr_lock(wal->gt, &flags);
 982	spin_lock(&uncore->lock);
 983	intel_uncore_forcewake_get__locked(uncore, fw);
 984
 985	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
 986	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
 987		u32 val;
 988
 989		/* Skip reading the register if it's not really needed */
 990		if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) {
 991			val = wa->set;
 992		} else {
 993			val = wa->is_mcr ?
 994				intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) :
 995				intel_uncore_read_fw(uncore, wa->reg);
 996			val &= ~wa->clr;
 997			val |= wa->set;
 998		}
 999
1000		*cs++ = i915_mmio_reg_offset(wa->reg);
1001		*cs++ = val;
1002	}
1003	*cs++ = MI_NOOP;
1004
1005	intel_uncore_forcewake_put__locked(uncore, fw);
1006	spin_unlock(&uncore->lock);
1007	intel_gt_mcr_unlock(wal->gt, flags);
1008
1009	intel_ring_advance(rq, cs);
1010
1011	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
1012	if (ret)
1013		return ret;
1014
1015	return 0;
1016}
1017
1018static void
1019gen4_gt_workarounds_init(struct intel_gt *gt,
1020			 struct i915_wa_list *wal)
1021{
1022	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
1023	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
1024}
1025
1026static void
1027g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1028{
1029	gen4_gt_workarounds_init(gt, wal);
1030
1031	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
1032	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
1033}
1034
1035static void
1036ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1037{
1038	g4x_gt_workarounds_init(gt, wal);
1039
1040	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
1041}
1042
1043static void
1044snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1045{
1046}
1047
1048static void
1049ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1050{
1051	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
1052	wa_masked_dis(wal,
1053		      GEN7_COMMON_SLICE_CHICKEN1,
1054		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
1055
1056	/* WaApplyL3ControlAndL3ChickenMode:ivb */
1057	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
1058	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
1059
1060	/* WaForceL3Serialization:ivb */
1061	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1062}
1063
1064static void
1065vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1066{
1067	/* WaForceL3Serialization:vlv */
1068	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1069
1070	/*
1071	 * WaIncreaseL3CreditsForVLVB0:vlv
1072	 * This is the hardware default actually.
1073	 */
1074	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
1075}
1076
1077static void
1078hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1079{
1080	/* L3 caching of data atomics doesn't work -- disable it. */
1081	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
1082
1083	wa_add(wal,
1084	       HSW_ROW_CHICKEN3, 0,
1085	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
1086	       0 /* XXX does this reg exist? */, true);
1087
1088	/* WaVSRefCountFullforceMissDisable:hsw */
1089	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
1090}
1091
1092static void
1093gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1094{
1095	const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
1096	unsigned int slice, subslice;
1097	u32 mcr, mcr_mask;
1098
1099	GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
1100
1101	/*
1102	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1103	 * Before any MMIO read into slice/subslice specific registers, MCR
1104	 * packet control register needs to be programmed to point to any
1105	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
1106	 * This means each subsequent MMIO read will be forwarded to a
1107	 * specific s/ss combination, but this is OK since these registers
1108	 * are consistent across s/ss in almost all cases. On the rare
1109	 * occasions, such as INSTDONE, where this value is dependent
1110	 * on s/ss combo, the read should be done with read_subslice_reg.
1111	 */
1112	slice = ffs(sseu->slice_mask) - 1;
1113	GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1114	subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1115	GEM_BUG_ON(!subslice);
1116	subslice--;
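	/*
	 * For example (illustrative masks only): slice_mask == BIT(0) and a
	 * subslice mask of BIT(1) | BIT(2) for that slice select slice 0,
	 * subslice 1.
	 */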
1117
1118	/*
1119	 * We use GEN8_MCR..() macros to calculate the |mcr| value for
1120	 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
1121	 */
1122	mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1123	mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1124
1125	drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1126
1127	wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1128}
1129
1130static void
1131gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1132{
1133	struct drm_i915_private *i915 = gt->i915;
1134
1135	/* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1136	gen9_wa_init_mcr(i915, wal);
1137
1138	/* WaDisableKillLogic:bxt,skl,kbl */
1139	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1140		wa_write_or(wal,
1141			    GAM_ECOCHK,
1142			    ECOCHK_DIS_TLB);
1143
1144	if (HAS_LLC(i915)) {
1145		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1146		 *
1147		 * Must match Display Engine. See
1148		 * WaCompressedResourceDisplayNewHashMode.
1149		 */
1150		wa_write_or(wal,
1151			    MMCD_MISC_CTRL,
1152			    MMCD_PCLA | MMCD_HOTSPOT_EN);
1153	}
1154
1155	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1156	wa_write_or(wal,
1157		    GAM_ECOCHK,
1158		    BDW_DISABLE_HDC_INVALIDATION);
1159}
1160
1161static void
1162skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1163{
1164	gen9_gt_workarounds_init(gt, wal);
1165
1166	/* WaDisableGafsUnitClkGating:skl */
1167	wa_write_or(wal,
1168		    GEN7_UCGCTL4,
1169		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1170
1171	/* WaInPlaceDecompressionHang:skl */
1172	if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1173		wa_write_or(wal,
1174			    GEN9_GAMT_ECO_REG_RW_IA,
1175			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1176}
1177
1178static void
1179kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1180{
1181	gen9_gt_workarounds_init(gt, wal);
1182
1183	/* WaDisableDynamicCreditSharing:kbl */
1184	if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1185		wa_write_or(wal,
1186			    GAMT_CHKN_BIT_REG,
1187			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1188
1189	/* WaDisableGafsUnitClkGating:kbl */
1190	wa_write_or(wal,
1191		    GEN7_UCGCTL4,
1192		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1193
1194	/* WaInPlaceDecompressionHang:kbl */
1195	wa_write_or(wal,
1196		    GEN9_GAMT_ECO_REG_RW_IA,
1197		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1198}
1199
1200static void
1201glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1202{
1203	gen9_gt_workarounds_init(gt, wal);
1204}
1205
1206static void
1207cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1208{
1209	gen9_gt_workarounds_init(gt, wal);
1210
1211	/* WaDisableGafsUnitClkGating:cfl */
1212	wa_write_or(wal,
1213		    GEN7_UCGCTL4,
1214		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1215
1216	/* WaInPlaceDecompressionHang:cfl */
1217	wa_write_or(wal,
1218		    GEN9_GAMT_ECO_REG_RW_IA,
1219		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1220}
1221
1222static void __set_mcr_steering(struct i915_wa_list *wal,
1223			       i915_reg_t steering_reg,
1224			       unsigned int slice, unsigned int subslice)
1225{
1226	u32 mcr, mcr_mask;
1227
1228	mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1229	mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1230
1231	wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1232}
1233
1234static void debug_dump_steering(struct intel_gt *gt)
1235{
1236	struct drm_printer p = drm_debug_printer("MCR Steering:");
1237
1238	if (drm_debug_enabled(DRM_UT_DRIVER))
1239		intel_gt_mcr_report_steering(&p, gt, false);
1240}
1241
1242static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1243			 unsigned int slice, unsigned int subslice)
1244{
1245	__set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1246
1247	gt->default_steering.groupid = slice;
1248	gt->default_steering.instanceid = subslice;
1249
1250	debug_dump_steering(gt);
1251}
1252
1253static void
1254icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1255{
1256	const struct sseu_dev_info *sseu = &gt->info.sseu;
1257	unsigned int subslice;
1258
1259	GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1260	GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1261
1262	/*
1263	 * Although a platform may have subslices, we need to always steer
1264	 * reads to the lowest instance that isn't fused off.  When Render
1265	 * Power Gating is enabled, grabbing forcewake will only power up a
1266	 * single subslice (the "minconfig") if there isn't a real workload
1267	 * that needs to be run; this means that if we steer register reads to
1268	 * one of the higher subslices, we run the risk of reading back 0's or
1269	 * random garbage.
1270	 */
1271	subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
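	/* e.g. a slice-0 subslice mask of 0b1100 steers to subslice 2 (illustrative) */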
1272
1273	/*
1274	 * If the subslice we picked above also steers us to a valid L3 bank,
1275	 * then we can just rely on the default steering and won't need to
1276	 * worry about explicitly re-steering L3BANK reads later.
1277	 */
1278	if (gt->info.l3bank_mask & BIT(subslice))
1279		gt->steering_table[L3BANK] = NULL;
1280
1281	__add_mcr_wa(gt, wal, 0, subslice);
1282}
1283
1284static void
1285xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1286{
1287	const struct sseu_dev_info *sseu = &gt->info.sseu;
1288	unsigned long slice, subslice = 0, slice_mask = 0;
1289	u32 lncf_mask = 0;
1290	int i;
1291
1292	/*
1293	 * On Xe_HP the steering increases in complexity. There are now several
1294	 * more units that require steering and we're not guaranteed to be able
1295	 * to find a common setting for all of them. These are:
1296	 * - GSLICE (fusable)
1297	 * - DSS (sub-unit within gslice; fusable)
1298	 * - L3 Bank (fusable)
1299	 * - MSLICE (fusable)
1300	 * - LNCF (sub-unit within mslice; always present if mslice is present)
1301	 *
1302	 * We'll do our default/implicit steering based on GSLICE (in the
1303	 * sliceid field) and DSS (in the subsliceid field).  If we can
1304	 * find overlap between the valid MSLICE and/or LNCF values with
1305	 * a suitable GSLICE, then we can just re-use the default value and
1306	 * skip any explicit steering at runtime.
1307	 *
1308	 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1309	 * a valid sliceid value.  DSS steering is the only type of steering
1310	 * that utilizes the 'subsliceid' bits.
1311	 *
1312	 * Also note that, even though the steering domain is called "GSlice"
1313	 * and it is encoded in the register using the gslice format, the spec
1314	 * says that the combined (geometry | compute) fuse should be used to
1315	 * select the steering.
1316	 */
1317
1318	/* Find the potential gslice candidates */
1319	slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1320						       GEN_DSS_PER_GSLICE);
1321
1322	/*
1323	 * Find the potential LNCF candidates.  Either LNCF within a valid
1324	 * mslice is fine.
1325	 */
1326	for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
1327		lncf_mask |= (0x3 << (i * 2));
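	/*
	 * For example (illustrative fusing): mslice_mask == BIT(0) | BIT(2)
	 * yields lncf_mask == 0x33, i.e. LNCF instances 0, 1, 4 and 5.
	 */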
1328
1329	/*
1330	 * Are there any sliceid values that work for both GSLICE and LNCF
1331	 * steering?
1332	 */
1333	if (slice_mask & lncf_mask) {
1334		slice_mask &= lncf_mask;
1335		gt->steering_table[LNCF] = NULL;
1336	}
1337
1338	/* How about sliceid values that also work for MSLICE steering? */
1339	if (slice_mask & gt->info.mslice_mask) {
1340		slice_mask &= gt->info.mslice_mask;
1341		gt->steering_table[MSLICE] = NULL;
1342	}
1343
1344	if (IS_XEHPSDV(gt->i915) && slice_mask & BIT(0))
1345		gt->steering_table[GAM] = NULL;
1346
1347	slice = __ffs(slice_mask);
1348	subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1349		GEN_DSS_PER_GSLICE;
1350
1351	__add_mcr_wa(gt, wal, slice, subslice);
1352
1353	/*
1354	 * SQIDI ranges are special because they use different steering
1355	 * registers than everything else we work with.  On XeHP SDV and
1356	 * DG2-G10, any value in the steering registers will work fine since
1357	 * all instances are present, but DG2-G11 only has SQIDI instances at
1358	 * ID's 2 and 3, so we need to steer to one of those.  For simplicity
1359	 * we'll just steer to a hardcoded "2" since that value will work
1360	 * everywhere.
1361	 */
1362	__set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1363	__set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1364
1365	/*
1366	 * On DG2, GAM registers have a dedicated steering control register
1367	 * and must always be programmed to a hardcoded groupid of "1."
1368	 */
1369	if (IS_DG2(gt->i915))
1370		__set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1371}
1372
1373static void
1374pvc_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1375{
1376	unsigned int dss;
1377
1378	/*
1379	 * Setup implicit steering for COMPUTE and DSS ranges to the first
1380	 * non-fused-off DSS.  All other types of MCR registers will be
1381	 * explicitly steered.
1382	 */
1383	dss = intel_sseu_find_first_xehp_dss(&gt->info.sseu, 0, 0);
1384	__add_mcr_wa(gt, wal, dss / GEN_DSS_PER_CSLICE, dss % GEN_DSS_PER_CSLICE);
1385}
1386
1387static void
1388icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1389{
1390	struct drm_i915_private *i915 = gt->i915;
1391
1392	icl_wa_init_mcr(gt, wal);
1393
1394	/* WaModifyGamTlbPartitioning:icl */
1395	wa_write_clr_set(wal,
1396			 GEN11_GACB_PERF_CTRL,
1397			 GEN11_HASH_CTRL_MASK,
1398			 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1399
1400	/* Wa_1405766107:icl
1401	 * Formerly known as WaCL2SFHalfMaxAlloc
1402	 */
1403	wa_write_or(wal,
1404		    GEN11_LSN_UNSLCVC,
1405		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1406		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1407
1408	/* Wa_220166154:icl
1409	 * Formerly known as WaDisCtxReload
1410	 */
1411	wa_write_or(wal,
1412		    GEN8_GAMW_ECO_DEV_RW_IA,
1413		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1414
1415	/* Wa_1406463099:icl
1416	 * Formerly known as WaGamTlbPendError
1417	 */
1418	wa_write_or(wal,
1419		    GAMT_CHKN_BIT_REG,
1420		    GAMT_CHKN_DISABLE_L3_COH_PIPE);
1421
1422	/*
1423	 * Wa_1408615072:icl,ehl  (vsunit)
1424	 * Wa_1407596294:icl,ehl  (hsunit)
1425	 */
1426	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1427		    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1428
1429	/* Wa_1407352427:icl,ehl */
1430	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1431		    PSDUNIT_CLKGATE_DIS);
1432
1433	/* Wa_1406680159:icl,ehl */
1434	wa_mcr_write_or(wal,
1435			GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1436			GWUNIT_CLKGATE_DIS);
1437
1438	/* Wa_1607087056:icl,ehl,jsl */
1439	if (IS_ICELAKE(i915) ||
1440		((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) &&
1441		IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)))
1442		wa_write_or(wal,
1443			    GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1444			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1445
1446	/*
1447	 * This is not a documented workaround, but rather an optimization
1448	 * to reduce sampler power.
1449	 */
1450	wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1451}
1452
1453/*
1454 * Though there are per-engine instances of these registers,
1455 * they retain their value through engine resets and should
1456 * only be provided on the GT workaround list rather than
1457 * the engine-specific workaround list.
1458 */
1459static void
1460wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1461{
1462	struct intel_engine_cs *engine;
1463	int id;
1464
1465	for_each_engine(engine, gt, id) {
1466		if (engine->class != VIDEO_DECODE_CLASS ||
1467		    (engine->instance % 2))
1468			continue;
1469
1470		wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1471			    IECPUNIT_CLKGATE_DIS);
1472	}
1473}
1474
1475static void
1476gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1477{
1478	icl_wa_init_mcr(gt, wal);
1479
1480	/* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1481	wa_14011060649(gt, wal);
1482
1483	/* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1484	wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1485
1486	/*
1487	 * Wa_14015795083
1488	 *
1489	 * Firmware on some gen12 platforms locks the MISCCPCTL register,
1490	 * preventing i915 from modifying it for this workaround.  Skip the
1491	 * readback verification for this workaround on debug builds; if the
1492	 * workaround doesn't stick due to firmware behavior, it's not an error
1493	 * that we want CI to flag.
1494	 */
1495	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1496	       0, 0, false);
1497}
1498
1499static void
1500dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1501{
1502	gen12_gt_workarounds_init(gt, wal);
1503
1504	/* Wa_1409420604:dg1 */
1505	wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2,
1506			CPSSUNIT_CLKGATE_DIS);
1507
1508	/* Wa_1408615072:dg1 */
1509	/* Empirical testing shows this register is unaffected by engine reset. */
1510	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL);
1511}
1512
1513static void
1514xehpsdv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1515{
1516	struct drm_i915_private *i915 = gt->i915;
1517
1518	xehp_init_mcr(gt, wal);
1519
1520	/* Wa_1409757795:xehpsdv */
1521	wa_mcr_write_or(wal, SCCGCTL94DC, CG3DDISURB);
1522
1523	/* Wa_18011725039:xehpsdv */
1524	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_B0)) {
1525		wa_mcr_masked_dis(wal, MLTICTXCTL, TDONRENDER);
1526		wa_mcr_write_or(wal, L3SQCREG1_CCS0, FLUSHALLNONCOH);
1527	}
1528
1529	/* Wa_16011155590:xehpsdv */
1530	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1531		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1532			    TSGUNIT_CLKGATE_DIS);
1533
1534	/* Wa_14011780169:xehpsdv */
1535	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_B0, STEP_FOREVER)) {
1536		wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1537			    GAMTLBVDBOX7_CLKGATE_DIS |
1538			    GAMTLBVDBOX6_CLKGATE_DIS |
1539			    GAMTLBVDBOX5_CLKGATE_DIS |
1540			    GAMTLBVDBOX4_CLKGATE_DIS |
1541			    GAMTLBVDBOX3_CLKGATE_DIS |
1542			    GAMTLBVDBOX2_CLKGATE_DIS |
1543			    GAMTLBVDBOX1_CLKGATE_DIS |
1544			    GAMTLBVDBOX0_CLKGATE_DIS |
1545			    GAMTLBKCR_CLKGATE_DIS |
1546			    GAMTLBGUC_CLKGATE_DIS |
1547			    GAMTLBBLT_CLKGATE_DIS);
1548		wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1549			    GAMTLBGFXA1_CLKGATE_DIS |
1550			    GAMTLBCOMPA0_CLKGATE_DIS |
1551			    GAMTLBCOMPA1_CLKGATE_DIS |
1552			    GAMTLBCOMPB0_CLKGATE_DIS |
1553			    GAMTLBCOMPB1_CLKGATE_DIS |
1554			    GAMTLBCOMPC0_CLKGATE_DIS |
1555			    GAMTLBCOMPC1_CLKGATE_DIS |
1556			    GAMTLBCOMPD0_CLKGATE_DIS |
1557			    GAMTLBCOMPD1_CLKGATE_DIS |
1558			    GAMTLBMERT_CLKGATE_DIS   |
1559			    GAMTLBVEBOX3_CLKGATE_DIS |
1560			    GAMTLBVEBOX2_CLKGATE_DIS |
1561			    GAMTLBVEBOX1_CLKGATE_DIS |
1562			    GAMTLBVEBOX0_CLKGATE_DIS);
1563	}
1564
1565	/* Wa_16012725990:xehpsdv */
1566	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_FOREVER))
1567		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, VFUNIT_CLKGATE_DIS);
1568
1569	/* Wa_14011060649:xehpsdv */
1570	wa_14011060649(gt, wal);
1571
1572	/* Wa_14012362059:xehpsdv */
1573	wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB);
1574
1575	/* Wa_14014368820:xehpsdv */
1576	wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1577			INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1578
1579	/* Wa_14010670810:xehpsdv */
1580	wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1581}
1582
1583static void
1584dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1585{
1586	xehp_init_mcr(gt, wal);
1587
1588	/* Wa_14011060649:dg2 */
1589	wa_14011060649(gt, wal);
1590
1591	if (IS_DG2_G10(gt->i915)) {
1592		/* Wa_22010523718:dg2 */
1593		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1594			    CG3DDISCFEG_CLKGATE_DIS);
1595
1596		/* Wa_14011006942:dg2 */
1597		wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1598				DSS_ROUTER_CLKGATE_DIS);
1599	}
1600
1601	/* Wa_14014830051:dg2 */
1602	wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1603
1604	/*
1605	 * Wa_14015795083
1606	 * Skip verification for possibly locked register.
1607	 */
1608	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1609	       0, 0, false);
1610
1611	/* Wa_18018781329 */
1612	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1613	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1614	wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1615	wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1616
1617	/* Wa_1509235366:dg2 */
1618	wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1619			INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1620
1621	/* Wa_14010648519:dg2 */
1622	wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1623}
1624
1625static void
1626pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1627{
1628	pvc_init_mcr(gt, wal);
1629
1630	/* Wa_14015795083 */
1631	wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1632
1633	/* Wa_18018781329 */
1634	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1635	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1636	wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1637	wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1638
1639	/* Wa_16016694945 */
1640	wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_OVRLSCCC);
1641}
1642
1643static void
1644xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1645{
1646	/* Wa_14018778641 / Wa_18018781329 */
1647	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1648
1649	/* Wa_22016670082 */
1650	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1651
1652	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1653	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
1654		/* Wa_14014830051 */
1655		wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1656
1657		/* Wa_14015795083 */
1658		wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1659	}
1660
1661	/*
1662	 * Unlike older platforms, we no longer setup implicit steering here;
1663	 * all MCR accesses are explicitly steered.
1664	 */
1665	debug_dump_steering(gt);
1666}
1667
1668static void
1669wa_16021867713(struct intel_gt *gt, struct i915_wa_list *wal)
1670{
1671	struct intel_engine_cs *engine;
1672	int id;
1673
1674	for_each_engine(engine, gt, id)
1675		if (engine->class == VIDEO_DECODE_CLASS)
1676			wa_write_or(wal, VDBOX_CGCTL3F1C(engine->mmio_base),
1677				    MFXPIPE_CLKGATE_DIS);
1678}
1679
1680static void
1681xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1682{
1683	wa_16021867713(gt, wal);
1684
1685	/*
1686	 * Wa_14018778641
1687	 * Wa_18018781329
1688	 *
1689	 * Note that although these registers are MCR on the primary
1690	 * GT, the media GT's versions are regular singleton registers.
1691	 */
1692	wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);
1693
1694	/* Wa_22016670082 */
1695	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1696
1697	debug_dump_steering(gt);
1698}
1699
1700/*
1701 * The bspec performance guide has recommended MMIO tuning settings.  These
1702 * aren't truly "workarounds" but we want to program them through the
1703 * workaround infrastructure to make sure they're (re)applied at the proper
1704 * times.
1705 *
1706 * The programming in this function is for settings that persist through
1707 * engine resets and also are not part of any engine's register state context.
1708 * I.e., settings that only need to be re-applied in the event of a full GT
1709 * reset.
1710 */
1711static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal)
1712{
1713	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))) {
1714		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1715		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1716	}
1717
1718	if (IS_PONTEVECCHIO(gt->i915)) {
1719		wa_mcr_write(wal, XEHPC_L3SCRUB,
1720			     SCRUB_CL_DWNGRADE_SHARED | SCRUB_RATE_4B_PER_CLK);
1721		wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_HOSTCACHEEN);
1722	}
1723
1724	if (IS_DG2(gt->i915)) {
1725		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1726		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1727	}
1728}
1729
1730static void
1731gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1732{
1733	struct drm_i915_private *i915 = gt->i915;
1734
1735	gt_tuning_settings(gt, wal);
1736
1737	if (gt->type == GT_MEDIA) {
1738		if (MEDIA_VER_FULL(i915) == IP_VER(13, 0))
1739			xelpmp_gt_workarounds_init(gt, wal);
1740		else
1741			MISSING_CASE(MEDIA_VER_FULL(i915));
1742
1743		return;
1744	}
1745
1746	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)))
1747		xelpg_gt_workarounds_init(gt, wal);
1748	else if (IS_PONTEVECCHIO(i915))
1749		pvc_gt_workarounds_init(gt, wal);
1750	else if (IS_DG2(i915))
1751		dg2_gt_workarounds_init(gt, wal);
1752	else if (IS_XEHPSDV(i915))
1753		xehpsdv_gt_workarounds_init(gt, wal);
1754	else if (IS_DG1(i915))
1755		dg1_gt_workarounds_init(gt, wal);
1756	else if (GRAPHICS_VER(i915) == 12)
1757		gen12_gt_workarounds_init(gt, wal);
1758	else if (GRAPHICS_VER(i915) == 11)
1759		icl_gt_workarounds_init(gt, wal);
1760	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1761		cfl_gt_workarounds_init(gt, wal);
1762	else if (IS_GEMINILAKE(i915))
1763		glk_gt_workarounds_init(gt, wal);
1764	else if (IS_KABYLAKE(i915))
1765		kbl_gt_workarounds_init(gt, wal);
1766	else if (IS_BROXTON(i915))
1767		gen9_gt_workarounds_init(gt, wal);
1768	else if (IS_SKYLAKE(i915))
1769		skl_gt_workarounds_init(gt, wal);
1770	else if (IS_HASWELL(i915))
1771		hsw_gt_workarounds_init(gt, wal);
1772	else if (IS_VALLEYVIEW(i915))
1773		vlv_gt_workarounds_init(gt, wal);
1774	else if (IS_IVYBRIDGE(i915))
1775		ivb_gt_workarounds_init(gt, wal);
1776	else if (GRAPHICS_VER(i915) == 6)
1777		snb_gt_workarounds_init(gt, wal);
1778	else if (GRAPHICS_VER(i915) == 5)
1779		ilk_gt_workarounds_init(gt, wal);
1780	else if (IS_G4X(i915))
1781		g4x_gt_workarounds_init(gt, wal);
1782	else if (GRAPHICS_VER(i915) == 4)
1783		gen4_gt_workarounds_init(gt, wal);
1784	else if (GRAPHICS_VER(i915) <= 8)
1785		;
1786	else
1787		MISSING_CASE(GRAPHICS_VER(i915));
1788}
1789
1790void intel_gt_init_workarounds(struct intel_gt *gt)
1791{
1792	struct i915_wa_list *wal = &gt->wa_list;
1793
1794	wa_init_start(wal, gt, "GT", "global");
1795	gt_init_workarounds(gt, wal);
1796	wa_init_finish(wal);
1797}
1798
1799static bool
1800wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur,
1801	  const char *name, const char *from)
1802{
1803	if ((cur ^ wa->set) & wa->read) {
1804		gt_err(gt,
1805		       "%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1806		       name, from, i915_mmio_reg_offset(wa->reg),
1807		       cur, cur & wa->read, wa->set & wa->read);
1808
1809		return false;
1810	}
1811
1812	return true;
1813}
1814
1815static void wa_list_apply(const struct i915_wa_list *wal)
1816{
1817	struct intel_gt *gt = wal->gt;
1818	struct intel_uncore *uncore = gt->uncore;
1819	enum forcewake_domains fw;
1820	unsigned long flags;
1821	struct i915_wa *wa;
1822	unsigned int i;
1823
1824	if (!wal->count)
1825		return;
1826
1827	fw = wal_get_fw_for_rmw(uncore, wal);
1828
1829	intel_gt_mcr_lock(gt, &flags);
1830	spin_lock(&uncore->lock);
1831	intel_uncore_forcewake_get__locked(uncore, fw);
1832
1833	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1834		u32 val, old = 0;
1835
1836		/* open-coded rmw due to steering */
1837		if (wa->clr)
1838			old = wa->is_mcr ?
1839				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1840				intel_uncore_read_fw(uncore, wa->reg);
1841		val = (old & ~wa->clr) | wa->set;
1842		if (val != old || !wa->clr) {
1843			if (wa->is_mcr)
1844				intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1845			else
1846				intel_uncore_write_fw(uncore, wa->reg, val);
1847		}
1848
1849		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1850			u32 val = wa->is_mcr ?
1851				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1852				intel_uncore_read_fw(uncore, wa->reg);
1853
1854			wa_verify(gt, wa, val, wal->name, "application");
1855		}
1856	}
1857
1858	intel_uncore_forcewake_put__locked(uncore, fw);
1859	spin_unlock(&uncore->lock);
1860	intel_gt_mcr_unlock(gt, flags);
1861}
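
/*
 * Note the contrast with intel_engine_emit_ctx_wa() above: GT workaround
 * lists (and, further down, engine lists) are applied here through direct
 * MMIO writes under forcewake, while context workarounds are emitted as
 * MI_LOAD_REGISTER_IMM commands into a request's ring.
 */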
1862
1863void intel_gt_apply_workarounds(struct intel_gt *gt)
1864{
1865	wa_list_apply(&gt->wa_list);
1866}
1867
1868static bool wa_list_verify(struct intel_gt *gt,
1869			   const struct i915_wa_list *wal,
1870			   const char *from)
1871{
1872	struct intel_uncore *uncore = gt->uncore;
1873	struct i915_wa *wa;
1874	enum forcewake_domains fw;
1875	unsigned long flags;
1876	unsigned int i;
1877	bool ok = true;
1878
1879	fw = wal_get_fw_for_rmw(uncore, wal);
1880
1881	intel_gt_mcr_lock(gt, &flags);
1882	spin_lock(&uncore->lock);
1883	intel_uncore_forcewake_get__locked(uncore, fw);
1884
1885	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1886		ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
1887				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1888				intel_uncore_read_fw(uncore, wa->reg),
1889				wal->name, from);
1890
1891	intel_uncore_forcewake_put__locked(uncore, fw);
1892	spin_unlock(&uncore->lock);
1893	intel_gt_mcr_unlock(gt, flags);
1894
1895	return ok;
1896}
1897
1898bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1899{
1900	return wa_list_verify(gt, &gt->wa_list, from);
1901}
1902
1903__maybe_unused
1904static bool is_nonpriv_flags_valid(u32 flags)
1905{
1906	/* Check only valid flag bits are set */
1907	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1908		return false;
1909
1910	/* NB: Only 3 out of 4 enum values are valid for access field */
1911	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1912	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1913		return false;
1914
1915	return true;
1916}
1917
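/*
 * Add a register to the engine's RING_FORCE_TO_NONPRIV whitelist.  The
 * access-mode and range flags are OR'ed into the register value itself and
 * end up in the slot programmed by intel_engine_apply_whitelist(), e.g.
 * (taken from cfl_whitelist_build() below):
 *
 *	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
 *			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
 *			  RING_FORCE_TO_NONPRIV_RANGE_4);
 */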
1918static void
1919whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1920{
1921	struct i915_wa wa = {
1922		.reg = reg
1923	};
1924
1925	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1926		return;
1927
1928	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1929		return;
1930
1931	wa.reg.reg |= flags;
1932	_wa_add(wal, &wa);
1933}
1934
1935static void
1936whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1937{
1938	struct i915_wa wa = {
1939		.mcr_reg = reg,
1940		.is_mcr = 1,
1941	};
1942
1943	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1944		return;
1945
1946	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1947		return;
1948
1949	wa.mcr_reg.reg |= flags;
1950	_wa_add(wal, &wa);
1951}
1952
1953static void
1954whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1955{
1956	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1957}
1958
1959static void
1960whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1961{
1962	whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1963}
1964
1965static void gen9_whitelist_build(struct i915_wa_list *w)
1966{
1967	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1968	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1969
1970	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1971	whitelist_reg(w, GEN8_CS_CHICKEN1);
1972
1973	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1974	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1975
1976	/* WaSendPushConstantsFromMMIO:skl,bxt */
1977	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1978}
1979
1980static void skl_whitelist_build(struct intel_engine_cs *engine)
1981{
1982	struct i915_wa_list *w = &engine->whitelist;
1983
1984	if (engine->class != RENDER_CLASS)
1985		return;
1986
1987	gen9_whitelist_build(w);
1988
1989	/* WaDisableLSQCROPERFforOCL:skl */
1990	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1991}
1992
1993static void bxt_whitelist_build(struct intel_engine_cs *engine)
1994{
1995	if (engine->class != RENDER_CLASS)
1996		return;
1997
1998	gen9_whitelist_build(&engine->whitelist);
1999}
2000
2001static void kbl_whitelist_build(struct intel_engine_cs *engine)
2002{
2003	struct i915_wa_list *w = &engine->whitelist;
2004
2005	if (engine->class != RENDER_CLASS)
2006		return;
2007
2008	gen9_whitelist_build(w);
2009
2010	/* WaDisableLSQCROPERFforOCL:kbl */
2011	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
2012}
2013
2014static void glk_whitelist_build(struct intel_engine_cs *engine)
2015{
2016	struct i915_wa_list *w = &engine->whitelist;
2017
2018	if (engine->class != RENDER_CLASS)
2019		return;
2020
2021	gen9_whitelist_build(w);
2022
2023	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
2024	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2025}
2026
2027static void cfl_whitelist_build(struct intel_engine_cs *engine)
2028{
2029	struct i915_wa_list *w = &engine->whitelist;
2030
2031	if (engine->class != RENDER_CLASS)
2032		return;
2033
2034	gen9_whitelist_build(w);
2035
2036	/*
2037	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
2038	 *
2039	 * This covers 4 registers which are next to one another:
2040	 *   - PS_INVOCATION_COUNT
2041	 *   - PS_INVOCATION_COUNT_UDW
2042	 *   - PS_DEPTH_COUNT
2043	 *   - PS_DEPTH_COUNT_UDW
2044	 */
2045	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2046			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2047			  RING_FORCE_TO_NONPRIV_RANGE_4);
2048}
2049
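/*
 * Expose the per-engine context timestamp register read-only to userspace
 * on non-render engines.
 */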
2050static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
2051{
2052	struct i915_wa_list *w = &engine->whitelist;
2053
2054	if (engine->class != RENDER_CLASS)
2055		whitelist_reg_ext(w,
2056				  RING_CTX_TIMESTAMP(engine->mmio_base),
2057				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2058}
2059
2060static void cml_whitelist_build(struct intel_engine_cs *engine)
2061{
2062	allow_read_ctx_timestamp(engine);
2063
2064	cfl_whitelist_build(engine);
2065}
2066
2067static void icl_whitelist_build(struct intel_engine_cs *engine)
2068{
2069	struct i915_wa_list *w = &engine->whitelist;
2070
2071	allow_read_ctx_timestamp(engine);
2072
2073	switch (engine->class) {
2074	case RENDER_CLASS:
2075		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
2076		whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
2077
2078		/* WaAllowUMDToModifySamplerMode:icl */
2079		whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
2080
2081		/* WaEnableStateCacheRedirectToCS:icl */
2082		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2083
2084		/*
2085		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
2086		 *
2087		 * This covers 4 registers which are next to one another:
2088		 *   - PS_INVOCATION_COUNT
2089		 *   - PS_INVOCATION_COUNT_UDW
2090		 *   - PS_DEPTH_COUNT
2091		 *   - PS_DEPTH_COUNT_UDW
2092		 */
2093		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2094				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2095				  RING_FORCE_TO_NONPRIV_RANGE_4);
2096		break;
2097
2098	case VIDEO_DECODE_CLASS:
2099		/* hucStatusRegOffset */
2100		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
2101				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2102		/* hucUKernelHdrInfoRegOffset */
2103		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2104				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2105		/* hucStatus2RegOffset */
2106		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2107				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2108		break;
2109
2110	default:
2111		break;
2112	}
2113}
2114
2115static void tgl_whitelist_build(struct intel_engine_cs *engine)
2116{
2117	struct i915_wa_list *w = &engine->whitelist;
2118
2119	allow_read_ctx_timestamp(engine);
2120
2121	switch (engine->class) {
2122	case RENDER_CLASS:
2123		/*
2124		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2125		 * Wa_1408556865:tgl
2126		 *
2127		 * This covers 4 registers which are next to one another:
2128		 *   - PS_INVOCATION_COUNT
2129		 *   - PS_INVOCATION_COUNT_UDW
2130		 *   - PS_DEPTH_COUNT
2131		 *   - PS_DEPTH_COUNT_UDW
2132		 */
2133		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2134				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2135				  RING_FORCE_TO_NONPRIV_RANGE_4);
2136
2137		/*
2138		 * Wa_1808121037:tgl
2139		 * Wa_14012131227:dg1
2140		 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2141		 */
2142		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2143
2144		/* Wa_1806527549:tgl */
2145		whitelist_reg(w, HIZ_CHICKEN);
2146
2147		/* Required by recommended tuning setting (not a workaround) */
2148		whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3);
2149
2150		break;
2151	default:
2152		break;
2153	}
2154}
2155
2156static void dg2_whitelist_build(struct intel_engine_cs *engine)
2157{
2158	struct i915_wa_list *w = &engine->whitelist;
2159
2160	switch (engine->class) {
2161	case RENDER_CLASS:
2162		/* Required by recommended tuning setting (not a workaround) */
2163		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2164
2165		break;
2166	default:
2167		break;
2168	}
2169}
2170
2171static void blacklist_trtt(struct intel_engine_cs *engine)
2172{
2173	struct i915_wa_list *w = &engine->whitelist;
2174
2175	/*
2176	 * Prevent read/write access to [0x4400, 0x4600) which covers
2177	 * the TRTT range across all engines. Note that normally userspace
2178	 * cannot access the other engines' trtt control, but for simplicity
2179	 * we cover the entire range on each engine.
2180	 */
2181	whitelist_reg_ext(w, _MMIO(0x4400),
2182			  RING_FORCE_TO_NONPRIV_DENY |
2183			  RING_FORCE_TO_NONPRIV_RANGE_64);
2184	whitelist_reg_ext(w, _MMIO(0x4500),
2185			  RING_FORCE_TO_NONPRIV_DENY |
2186			  RING_FORCE_TO_NONPRIV_RANGE_64);
2187}
2188
2189static void pvc_whitelist_build(struct intel_engine_cs *engine)
2190{
2191	/* Wa_16014440446:pvc */
2192	blacklist_trtt(engine);
2193}
2194
2195static void xelpg_whitelist_build(struct intel_engine_cs *engine)
2196{
2197	struct i915_wa_list *w = &engine->whitelist;
2198
2199	switch (engine->class) {
2200	case RENDER_CLASS:
2201		/* Required by recommended tuning setting (not a workaround) */
2202		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2203
2204		break;
2205	default:
2206		break;
2207	}
2208}
2209
2210void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2211{
2212	struct drm_i915_private *i915 = engine->i915;
2213	struct i915_wa_list *w = &engine->whitelist;
2214
2215	wa_init_start(w, engine->gt, "whitelist", engine->name);
2216
2217	if (engine->gt->type == GT_MEDIA)
2218		; /* none yet */
2219	else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 71)))
2220		xelpg_whitelist_build(engine);
2221	else if (IS_PONTEVECCHIO(i915))
2222		pvc_whitelist_build(engine);
2223	else if (IS_DG2(i915))
2224		dg2_whitelist_build(engine);
2225	else if (IS_XEHPSDV(i915))
2226		; /* none needed */
2227	else if (GRAPHICS_VER(i915) == 12)
2228		tgl_whitelist_build(engine);
2229	else if (GRAPHICS_VER(i915) == 11)
2230		icl_whitelist_build(engine);
2231	else if (IS_COMETLAKE(i915))
2232		cml_whitelist_build(engine);
2233	else if (IS_COFFEELAKE(i915))
2234		cfl_whitelist_build(engine);
2235	else if (IS_GEMINILAKE(i915))
2236		glk_whitelist_build(engine);
2237	else if (IS_KABYLAKE(i915))
2238		kbl_whitelist_build(engine);
2239	else if (IS_BROXTON(i915))
2240		bxt_whitelist_build(engine);
2241	else if (IS_SKYLAKE(i915))
2242		skl_whitelist_build(engine);
2243	else if (GRAPHICS_VER(i915) <= 8)
2244		;
2245	else
2246		MISSING_CASE(GRAPHICS_VER(i915));
2247
2248	wa_init_finish(w);
2249}
2250
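/*
 * Program the whitelist into the engine's RING_FORCE_TO_NONPRIV slots and
 * reset any unused slots to RING_NOPID.
 */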
2251void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2252{
2253	const struct i915_wa_list *wal = &engine->whitelist;
2254	struct intel_uncore *uncore = engine->uncore;
2255	const u32 base = engine->mmio_base;
2256	struct i915_wa *wa;
2257	unsigned int i;
2258
2259	if (!wal->count)
2260		return;
2261
2262	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2263		intel_uncore_write(uncore,
2264				   RING_FORCE_TO_NONPRIV(base, i),
2265				   i915_mmio_reg_offset(wa->reg));
2266
2267	/* And clear the rest just in case of garbage */
2268	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2269		intel_uncore_write(uncore,
2270				   RING_FORCE_TO_NONPRIV(base, i),
2271				   i915_mmio_reg_offset(RING_NOPID(base)));
2272}
2273
2274/*
2275 * engine_fake_wa_init(), a placeholder to program registers that are
2276 * not part of an official workaround defined by the hardware team.
2277 *
2278 * Adding the programming of those registers here lets us reuse the wa
2279 * framework for proper application and verification.
2280 */
2281static void
2282engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2283{
2284	u8 mocs_w, mocs_r;
2285
2286	/*
2287	 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2288	 * by the command streamer when executing commands that don't have
2289	 * a way to explicitly specify a MOCS setting.  The default should
2290	 * usually reference whichever MOCS entry corresponds to uncached
2291	 * behavior, although use of a WB cached entry is recommended by the
2292	 * spec in certain circumstances on specific platforms.
2293	 */
2294	if (GRAPHICS_VER(engine->i915) >= 12) {
2295		mocs_r = engine->gt->mocs.uc_index;
2296		mocs_w = engine->gt->mocs.uc_index;
2297
2298		if (HAS_L3_CCS_READ(engine->i915) &&
2299		    engine->class == COMPUTE_CLASS) {
2300			mocs_r = engine->gt->mocs.wb_index;
2301
2302			/*
2303			 * Even on the few platforms where MOCS 0 is a
2304			 * legitimate table entry, it's never the correct
2305			 * setting to use here; we can assume the MOCS init
2306			 * just forgot to initialize wb_index.
2307			 */
2308			drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2309		}
2310
2311		wa_masked_field_set(wal,
2312				    RING_CMD_CCTL(engine->mmio_base),
2313				    CMD_CCTL_MOCS_MASK,
2314				    CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2315	}
2316}
2317
2318static void
2319rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2320{
2321	struct drm_i915_private *i915 = engine->i915;
2322	struct intel_gt *gt = engine->gt;
2323
2324	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2325	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
2326		/* Wa_22014600077 */
2327		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2328				 ENABLE_EU_COUNT_FOR_TDL_FLUSH);
2329	}
2330
2331	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2332	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2333	    IS_DG2(i915)) {
2334		/* Wa_1509727124 */
2335		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2336				 SC_DISABLE_POWER_OPTIMIZATION_EBB);
2337	}
2338
2339	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2340	    IS_DG2(i915)) {
2341		/* Wa_22012856258 */
2342		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2343				 GEN12_DISABLE_READ_SUPPRESSION);
2344	}
2345
2346	if (IS_DG2(i915)) {
2347		/*
2348		 * Wa_22010960976:dg2
2349		 * Wa_14013347512:dg2
2350		 */
2351		wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
2352				  LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2353	}
2354
2355	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) ||
2356	    IS_DG2(i915)) {
2357		/* Wa_14015150844 */
2358		wa_mcr_add(wal, XEHP_HDC_CHICKEN0, 0,
2359			   _MASKED_BIT_ENABLE(DIS_ATOMIC_CHAINING_TYPED_WRITES),
2360			   0, true);
2361	}
2362
2363	if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2364	    IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2365		/*
2366		 * Wa_1606700617:tgl,dg1,adl-p
2367		 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2368		 * Wa_14010826681:tgl,dg1,rkl,adl-p
2369		 * Wa_18019627453:dg2
2370		 */
2371		wa_masked_en(wal,
2372			     GEN9_CS_DEBUG_MODE1,
2373			     FF_DOP_CLOCK_GATE_DISABLE);
2374	}
2375
2376	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2377	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2378		/* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2379		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2380
2381		/*
2382		 * Wa_1407928979:tgl A*
2383		 * Wa_18011464164:tgl[B0+],dg1[B0+]
2384		 * Wa_22010931296:tgl[B0+],dg1[B0+]
2385		 * Wa_14010919138:rkl,dg1,adl-s,adl-p
2386		 */
2387		wa_write_or(wal, GEN7_FF_THREAD_MODE,
2388			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2389
2390		/* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2391		wa_mcr_masked_en(wal,
2392				 GEN10_SAMPLER_MODE,
2393				 ENABLE_SMALLPL);
2394	}
2395
2396	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2397	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2398		/* Wa_1409804808 */
2399		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2400				 GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2401
2402		/* Wa_14010229206 */
2403		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2404	}
2405
2406	if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
2407		/*
2408		 * Wa_1607297627
2409		 *
2410		 * On TGL and RKL there are multiple entries for this WA in the
2411		 * BSpec; some indicate this is an A0-only WA, others indicate
2412		 * it applies to all steppings so we trust the "all steppings."
2413		 */
2414		wa_masked_en(wal,
2415			     RING_PSMI_CTL(RENDER_RING_BASE),
2416			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2417			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2418	}
2419
2420	if (GRAPHICS_VER(i915) == 11) {
2421		/* This is not a Wa. Enable for better image quality */
2422		wa_masked_en(wal,
2423			     _3D_CHICKEN3,
2424			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2425
2426		/*
2427		 * Wa_1405543622:icl
2428		 * Formerly known as WaGAPZPriorityScheme
2429		 */
2430		wa_write_or(wal,
2431			    GEN8_GARBCNTL,
2432			    GEN11_ARBITRATION_PRIO_ORDER_MASK);
2433
2434		/*
2435		 * Wa_1604223664:icl
2436		 * Formerly known as WaL3BankAddressHashing
2437		 */
2438		wa_write_clr_set(wal,
2439				 GEN8_GARBCNTL,
2440				 GEN11_HASH_CTRL_EXCL_MASK,
2441				 GEN11_HASH_CTRL_EXCL_BIT0);
2442		wa_write_clr_set(wal,
2443				 GEN11_GLBLINVL,
2444				 GEN11_BANK_HASH_ADDR_EXCL_MASK,
2445				 GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2446
2447		/*
2448		 * Wa_1405733216:icl
2449		 * Formerly known as WaDisableCleanEvicts
2450		 */
2451		wa_mcr_write_or(wal,
2452				GEN8_L3SQCREG4,
2453				GEN11_LQSC_CLEAN_EVICT_DISABLE);
2454
2455		/* Wa_1606682166:icl */
2456		wa_write_or(wal,
2457			    GEN7_SARCHKMD,
2458			    GEN7_DISABLE_SAMPLER_PREFETCH);
2459
2460		/* Wa_1409178092:icl */
2461		wa_mcr_write_clr_set(wal,
2462				     GEN11_SCRATCH2,
2463				     GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2464				     0);
2465
2466		/* WaEnable32PlaneMode:icl */
2467		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2468			     GEN11_ENABLE_32_PLANE_MODE);
2469
2470		/*
2471		 * Wa_1408767742:icl[a2..forever],ehl[all]
2472		 * Wa_1605460711:icl[a0..c0]
2473		 */
2474		wa_write_or(wal,
2475			    GEN7_FF_THREAD_MODE,
2476			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2477
2478		/* Wa_22010271021 */
2479		wa_masked_en(wal,
2480			     GEN9_CS_DEBUG_MODE1,
2481			     FF_DOP_CLOCK_GATE_DISABLE);
2482	}
2483
2484	/*
2485	 * Intel platforms that support fine-grained preemption (i.e., gen9 and
2486	 * beyond) allow the kernel-mode driver to choose between two different
2487	 * options for controlling preemption granularity and behavior.
2488	 *
2489	 * Option 1 (hardware default):
2490	 *   Preemption settings are controlled in a global manner via
2491	 *   kernel-only register CS_DEBUG_MODE1 (0x20EC).  Any granularity
2492	 *   and settings chosen by the kernel-mode driver will apply to all
2493	 *   userspace clients.
2494	 *
2495	 * Option 2:
2496	 *   Preemption settings are controlled on a per-context basis via
2497	 *   register CS_CHICKEN1 (0x2580).  CS_CHICKEN1 is saved/restored on
2498	 *   context switch and is writable by userspace (e.g., via
2499	 *   MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
2500	 *   which allows different userspace drivers/clients to select
2501	 *   different settings, or to change those settings on the fly in
2502	 *   response to runtime needs.  This option was known by name
2503	 *   "FtrPerCtxtPreemptionGranularityControl" at one time, although
2504	 *   that name is somewhat misleading as other non-granularity
2505	 *   preemption settings are also impacted by this decision.
2506	 *
2507	 * On Linux, our policy has always been to let userspace drivers
2508	 * control preemption granularity/settings (Option 2).  This was
2509	 * originally mandatory on gen9 to prevent ABI breakage (old gen9
2510	 * userspace developed before object-level preemption was enabled would
2511	 * not behave well if i915 were to go with Option 1 and enable that
2512	 * preemption in a global manner).  On gen9 each context would have
2513	 * object-level preemption disabled by default (see
2514	 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
2515	 * userspace drivers could opt-in to object-level preemption as they
2516	 * saw fit.  For post-gen9 platforms, we continue to utilize Option 2;
2517	 * even though it is no longer necessary for ABI compatibility when
2518	 * enabling a new platform, it does ensure that userspace will be able
2519	 * to implement any workarounds that show up requiring temporary
2520	 * adjustments to preemption behavior at runtime.
2521	 *
2522	 * Notes/Workarounds:
2523	 *  - Wa_14015141709:  On DG2 and early steppings of MTL,
2524	 *      CS_CHICKEN1[0] does not disable object-level preemption as
2525	 *      it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
2526	 *      using Option 1).  Effectively this means userspace is unable
2527	 *      to disable object-level preemption on these platforms/steppings
2528	 *      despite the setting here.
2529	 *
2530	 *  - Wa_16013994831:  May require that userspace program
2531	 *      CS_CHICKEN1[10] when certain runtime conditions are true.
2532	 *      Userspace requires Option 2 to be in effect for their update of
2533	 *      CS_CHICKEN1[10] to be effective.
2534	 *
2535	 * Other workarounds may appear in the future that will also require
2536	 * Option 2 behavior to allow proper userspace implementation.
2537	 */
2538	if (GRAPHICS_VER(i915) >= 9)
2539		wa_masked_en(wal,
2540			     GEN7_FF_SLICE_CS_CHICKEN1,
2541			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);
2542
2543	if (IS_SKYLAKE(i915) ||
2544	    IS_KABYLAKE(i915) ||
2545	    IS_COFFEELAKE(i915) ||
2546	    IS_COMETLAKE(i915)) {
2547		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2548		wa_write_or(wal,
2549			    GEN8_GARBCNTL,
2550			    GEN9_GAPS_TSV_CREDIT_DISABLE);
2551	}
2552
2553	if (IS_BROXTON(i915)) {
2554		/* WaDisablePooledEuLoadBalancingFix:bxt */
2555		wa_masked_en(wal,
2556			     FF_SLICE_CS_CHICKEN2,
2557			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2558	}
2559
2560	if (GRAPHICS_VER(i915) == 9) {
2561		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2562		wa_masked_en(wal,
2563			     GEN9_CSFE_CHICKEN1_RCS,
2564			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2565
2566		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2567		wa_mcr_write_or(wal,
2568				BDW_SCRATCH1,
2569				GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2570
2571		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2572		if (IS_GEN9_LP(i915))
2573			wa_mcr_write_clr_set(wal,
2574					     GEN8_L3SQCREG1,
2575					     L3_PRIO_CREDITS_MASK,
2576					     L3_GENERAL_PRIO_CREDITS(62) |
2577					     L3_HIGH_PRIO_CREDITS(2));
2578
2579		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2580		wa_mcr_write_or(wal,
2581				GEN8_L3SQCREG4,
2582				GEN8_LQSC_FLUSH_COHERENT_LINES);
2583
2584		/* Disable atomics in L3 to prevent unrecoverable hangs */
2585		wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2586				 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2587		wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
2588				     GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2589		wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
2590				     EVICTION_PERF_FIX_ENABLE, 0);
2591	}
2592
2593	if (IS_HASWELL(i915)) {
2594		/* WaSampleCChickenBitEnable:hsw */
2595		wa_masked_en(wal,
2596			     HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2597
2598		wa_masked_dis(wal,
2599			      CACHE_MODE_0_GEN7,
2600			      /* enable HiZ Raw Stall Optimization */
2601			      HIZ_RAW_STALL_OPT_DISABLE);
2602	}
2603
2604	if (IS_VALLEYVIEW(i915)) {
2605		/* WaDisableEarlyCull:vlv */
2606		wa_masked_en(wal,
2607			     _3D_CHICKEN3,
2608			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2609
2610		/*
2611		 * WaVSThreadDispatchOverride:ivb,vlv
2612		 *
2613		 * This actually overrides the dispatch
2614		 * mode for all thread types.
2615		 */
2616		wa_write_clr_set(wal,
2617				 GEN7_FF_THREAD_MODE,
2618				 GEN7_FF_SCHED_MASK,
2619				 GEN7_FF_TS_SCHED_HW |
2620				 GEN7_FF_VS_SCHED_HW |
2621				 GEN7_FF_DS_SCHED_HW);
2622
2623		/* WaPsdDispatchEnable:vlv */
2624		/* WaDisablePSDDualDispatchEnable:vlv */
2625		wa_masked_en(wal,
2626			     GEN7_HALF_SLICE_CHICKEN1,
2627			     GEN7_MAX_PS_THREAD_DEP |
2628			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2629	}
2630
2631	if (IS_IVYBRIDGE(i915)) {
2632		/* WaDisableEarlyCull:ivb */
2633		wa_masked_en(wal,
2634			     _3D_CHICKEN3,
2635			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2636
2637		if (0) { /* causes HiZ corruption on ivb:gt1 */
2638			/* enable HiZ Raw Stall Optimization */
2639			wa_masked_dis(wal,
2640				      CACHE_MODE_0_GEN7,
2641				      HIZ_RAW_STALL_OPT_DISABLE);
2642		}
2643
2644		/*
2645		 * WaVSThreadDispatchOverride:ivb,vlv
2646		 *
2647		 * This actually overrides the dispatch
2648		 * mode for all thread types.
2649		 */
2650		wa_write_clr_set(wal,
2651				 GEN7_FF_THREAD_MODE,
2652				 GEN7_FF_SCHED_MASK,
2653				 GEN7_FF_TS_SCHED_HW |
2654				 GEN7_FF_VS_SCHED_HW |
2655				 GEN7_FF_DS_SCHED_HW);
2656
2657		/* WaDisablePSDDualDispatchEnable:ivb */
2658		if (IS_IVB_GT1(i915))
2659			wa_masked_en(wal,
2660				     GEN7_HALF_SLICE_CHICKEN1,
2661				     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2662	}
2663
2664	if (GRAPHICS_VER(i915) == 7) {
2665		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2666		wa_masked_en(wal,
2667			     RING_MODE_GEN7(RENDER_RING_BASE),
2668			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2669
2670		/* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
2671		wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
2672
2673		/*
2674		 * BSpec says this must be set, even though
2675		 * WaDisable4x2SubspanOptimization:ivb,hsw
2676		 * WaDisable4x2SubspanOptimization isn't listed for VLV.
2677		 */
2678		wa_masked_en(wal,
2679			     CACHE_MODE_1,
2680			     PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
2681
2682		/*
2683		 * BSpec recommends 8x4 when MSAA is used,
2684		 * however in practice 16x4 seems fastest.
2685		 *
2686		 * Note that PS/WM thread counts depend on the WIZ hashing
2687		 * disable bit, which we don't touch here, but it's good
2688		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2689		 */
2690		wa_masked_field_set(wal,
2691				    GEN7_GT_MODE,
2692				    GEN6_WIZ_HASHING_MASK,
2693				    GEN6_WIZ_HASHING_16x4);
2694	}
2695
2696	if (IS_GRAPHICS_VER(i915, 6, 7))
2697		/*
2698		 * We need to disable the AsyncFlip performance optimisations in
2699		 * order to use MI_WAIT_FOR_EVENT within the CS. It should
2700		 * already be programmed to '1' on all products.
2701		 *
2702		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2703		 */
2704		wa_masked_en(wal,
2705			     RING_MI_MODE(RENDER_RING_BASE),
2706			     ASYNC_FLIP_PERF_DISABLE);
2707
2708	if (GRAPHICS_VER(i915) == 6) {
2709		/*
2710		 * Required for the hardware to program scanline values for
2711		 * waiting
2712		 * WaEnableFlushTlbInvalidationMode:snb
2713		 */
2714		wa_masked_en(wal,
2715			     GFX_MODE,
2716			     GFX_TLB_INVALIDATE_EXPLICIT);
2717
2718		/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2719		wa_masked_en(wal,
2720			     _3D_CHICKEN,
2721			     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2722
2723		wa_masked_en(wal,
2724			     _3D_CHICKEN3,
2725			     /* WaStripsFansDisableFastClipPerformanceFix:snb */
2726			     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2727			     /*
2728			      * Bspec says:
2729			      * "This bit must be set if 3DSTATE_CLIP clip mode is set
2730			      * to normal and 3DSTATE_SF number of SF output attributes
2731			      * is more than 16."
2732			      */
2733			     _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2734
2735		/*
2736		 * BSpec recommends 8x4 when MSAA is used,
2737		 * however in practice 16x4 seems fastest.
2738		 *
2739		 * Note that PS/WM thread counts depend on the WIZ hashing
2740		 * disable bit, which we don't touch here, but it's good
2741		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2742		 */
2743		wa_masked_field_set(wal,
2744				    GEN6_GT_MODE,
2745				    GEN6_WIZ_HASHING_MASK,
2746				    GEN6_WIZ_HASHING_16x4);
2747
2748		/* WaDisable_RenderCache_OperationalFlush:snb */
2749		wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
2750
2751		/*
2752		 * From the Sandybridge PRM, volume 1 part 3, page 24:
2753		 * "If this bit is set, STCunit will have LRA as replacement
2754		 *  policy. [...] This bit must be reset. LRA replacement
2755		 *  policy is not supported."
2756		 */
2757		wa_masked_dis(wal,
2758			      CACHE_MODE_0,
2759			      CM0_STC_EVICT_DISABLE_LRA_SNB);
2760	}
2761
2762	if (IS_GRAPHICS_VER(i915, 4, 6))
2763		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2764		wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2765		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2766		       /* XXX bit doesn't stick on Broadwater */
2767		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2768
2769	if (GRAPHICS_VER(i915) == 4)
2770		/*
2771		 * Disable CONSTANT_BUFFER before it is loaded from the context
2772		 * image. Once it is loaded, it is executed and the stored
2773		 * address may no longer be valid, leading to a GPU hang.
2774		 *
2775		 * This imposes the requirement that userspace reload their
2776		 * CONSTANT_BUFFER on every batch, fortunately a requirement
2777		 * they are already accustomed to from before contexts were
2778		 * enabled.
2779		 */
2780		wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2781		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2782		       0 /* XXX bit doesn't stick on Broadwater */,
2783		       true);
2784}
2785
2786static void
2787xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2788{
2789	struct drm_i915_private *i915 = engine->i915;
2790
2791	/* WaKBLVECSSemaphoreWaitPoll:kbl */
2792	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2793		wa_write(wal,
2794			 RING_SEMA_WAIT_POLL(engine->mmio_base),
2795			 1);
2796	}
2797	/* Wa_16018031267, Wa_16018063123 */
2798	if (NEEDS_FASTCOLOR_BLT_WABB(engine))
2799		wa_masked_field_set(wal, ECOSKPD(engine->mmio_base),
2800				    XEHP_BLITTER_SCHEDULING_MODE_MASK,
2801				    XEHP_BLITTER_ROUND_ROBIN_MODE);
2802}
2803
2804static void
2805ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2806{
2807	if (IS_PVC_CT_STEP(engine->i915, STEP_A0, STEP_C0)) {
2808		/* Wa_14014999345:pvc */
2809		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, DISABLE_ECC);
2810	}
2811}
2812
2813/*
2814 * The bspec performance guide has recommended MMIO tuning settings.  These
2815 * aren't truly "workarounds" but we want to program them with the same
2816 * workaround infrastructure to ensure that they're automatically added to
2817 * the GuC save/restore lists, re-applied at the right times, and checked for
2818 * any conflicting programming requested by real workarounds.
2819 *
2820 * Programming settings should be added here only if their registers are not
2821 * part of an engine's register state context.  If a register is part of a
2822 * context, then any tuning settings should be programmed in an appropriate
2823 * function invoked by __intel_engine_init_ctx_wa().
2824 */
2825static void
2826add_render_compute_tuning_settings(struct intel_gt *gt,
2827				   struct i915_wa_list *wal)
2828{
2829	struct drm_i915_private *i915 = gt->i915;
2830
2831	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || IS_DG2(i915))
2832		wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2833
2834	/*
2835	 * This tuning setting proves beneficial only on ATS-M designs; the
2836	 * default "age based" setting is optimal on regular DG2 and other
2837	 * platforms.
2838	 */
2839	if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2840		wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2841					THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2842
2843	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
2844		wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
2845}
2846
2847/*
2848 * The workarounds in this function apply to shared registers in
2849 * the general render reset domain that aren't tied to a
2850 * specific engine.  Since all render+compute engines get reset
2851 * together, and the contents of these registers are lost during
2852 * the shared render domain reset, we'll define such workarounds
2853 * here and then add them to just a single RCS or CCS engine's
2854 * workaround list (whichever engine has the I915_ENGINE_FIRST_RENDER_COMPUTE flag).
2855 */
2856static void
2857general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2858{
2859	struct drm_i915_private *i915 = engine->i915;
2860	struct intel_gt *gt = engine->gt;
2861
2862	add_render_compute_tuning_settings(gt, wal);
2863
2864	if (GRAPHICS_VER(i915) >= 11) {
2865		/* This is not a Wa (although referred to as
2866	 * WaSetInidrectStateOverride in places); it allows
2867		 * applications that reference sampler states through
2868		 * the BindlessSamplerStateBaseAddress to have their
2869		 * border color relative to DynamicStateBaseAddress
2870		 * rather than BindlessSamplerStateBaseAddress.
2871		 *
2872		 * Otherwise SAMPLER_STATE border colors have to be
2873		 * copied in multiple heaps (DynamicStateBaseAddress &
2874		 * BindlessSamplerStateBaseAddress)
2875		 *
2876		 * BSpec: 46052
2877		 */
2878		wa_mcr_masked_en(wal,
2879				 GEN10_SAMPLER_MODE,
2880				 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
2881	}
2882
2883	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
2884	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER))
2885		/* Wa_14017856879 */
2886		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH);
2887
2888	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2889	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2890		/*
2891		 * Wa_14017066071
2892		 * Wa_14017654203
2893		 */
2894		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2895				 MTL_DISABLE_SAMPLER_SC_OOO);
2896
2897	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2898		/* Wa_22015279794 */
2899		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2900				 DISABLE_PREFETCH_INTO_IC);
2901
2902	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2903	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2904	    IS_DG2(i915)) {
2905		/* Wa_22013037850 */
2906		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2907				DISABLE_128B_EVICTION_COMMAND_UDW);
2908
2909		/* Wa_18017747507 */
2910		wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2911	}
2912
2913	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2914	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2915	    IS_PONTEVECCHIO(i915) ||
2916	    IS_DG2(i915)) {
2917		/* Wa_22014226127 */
2918		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2919	}
2920
2921	if (IS_PONTEVECCHIO(i915) || IS_DG2(i915)) {
2922		/* Wa_14015227452:dg2,pvc */
2923		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2924
2925		/* Wa_16015675438:dg2,pvc */
2926		wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE);
2927	}
2928
2929	if (IS_DG2(i915)) {
2930		/*
2931		 * Wa_16011620976:dg2_g11
2932		 * Wa_22015475538:dg2
2933		 */
2934		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2935
2936		/* Wa_18028616096 */
2937		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, UGM_FRAGMENT_THRESHOLD_TO_3);
2938	}
2939
2940	if (IS_DG2_G11(i915)) {
2941		/*
2942		 * Wa_22012826095:dg2
2943		 * Wa_22013059131:dg2
2944		 */
2945		wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2946				     MAXREQS_PER_BANK,
2947				     REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2948
2949		/* Wa_22013059131:dg2 */
2950		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2951				FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2952
2953		/*
2954		 * Wa_22012654132
2955		 *
2956		 * Note that register 0xE420 is write-only and cannot be read
2957		 * back for verification on DG2 (due to Wa_14012342262), so
2958		 * we need to explicitly skip the readback.
2959		 */
2960		wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2961			   _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2962			   0 /* write-only, so skip validation */,
2963			   true);
2964	}
2965
2966	if (IS_XEHPSDV(i915)) {
2967		/* Wa_1409954639 */
2968		wa_mcr_masked_en(wal,
2969				 GEN8_ROW_CHICKEN,
2970				 SYSTOLIC_DOP_CLOCK_GATING_DIS);
2971
2972		/* Wa_1607196519 */
2973		wa_mcr_masked_en(wal,
2974				 GEN9_ROW_CHICKEN4,
2975				 GEN12_DISABLE_GRF_CLEAR);
2976
2977		/* Wa_14010449647:xehpsdv */
2978		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
2979				 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2980	}
2981}
2982
2983static void
2984engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2985{
2986	if (GRAPHICS_VER(engine->i915) < 4)
2987		return;
2988
2989	engine_fake_wa_init(engine, wal);
2990
2991	/*
2992	 * These are common workarounds that just need to be applied
2993	 * to a single RCS/CCS engine's workaround list since
2994	 * they're reset as part of the general render domain reset.
2995	 */
2996	if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE)
2997		general_render_compute_wa_init(engine, wal);
2998
2999	if (engine->class == COMPUTE_CLASS)
3000		ccs_engine_wa_init(engine, wal);
3001	else if (engine->class == RENDER_CLASS)
3002		rcs_engine_wa_init(engine, wal);
3003	else
3004		xcs_engine_wa_init(engine, wal);
3005}
3006
3007void intel_engine_init_workarounds(struct intel_engine_cs *engine)
3008{
3009	struct i915_wa_list *wal = &engine->wa_list;
3010
3011	wa_init_start(wal, engine->gt, "engine", engine->name);
3012	engine_init_workarounds(engine, wal);
3013	wa_init_finish(wal);
3014}
3015
3016void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
3017{
3018	wa_list_apply(&engine->wa_list);
3019}
3020
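/*
 * MMIO ranges that are steered by the MCR selector on each platform; see
 * mcr_range() below for how they are used to skip readback verification.
 */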
3021static const struct i915_range mcr_ranges_gen8[] = {
3022	{ .start = 0x5500, .end = 0x55ff },
3023	{ .start = 0x7000, .end = 0x7fff },
3024	{ .start = 0x9400, .end = 0x97ff },
3025	{ .start = 0xb000, .end = 0xb3ff },
3026	{ .start = 0xe000, .end = 0xe7ff },
3027	{},
3028};
3029
3030static const struct i915_range mcr_ranges_gen12[] = {
3031	{ .start =  0x8150, .end =  0x815f },
3032	{ .start =  0x9520, .end =  0x955f },
3033	{ .start =  0xb100, .end =  0xb3ff },
3034	{ .start =  0xde80, .end =  0xe8ff },
3035	{ .start = 0x24a00, .end = 0x24a7f },
3036	{},
3037};
3038
3039static const struct i915_range mcr_ranges_xehp[] = {
3040	{ .start =  0x4000, .end =  0x4aff },
3041	{ .start =  0x5200, .end =  0x52ff },
3042	{ .start =  0x5400, .end =  0x7fff },
3043	{ .start =  0x8140, .end =  0x815f },
3044	{ .start =  0x8c80, .end =  0x8dff },
3045	{ .start =  0x94d0, .end =  0x955f },
3046	{ .start =  0x9680, .end =  0x96ff },
3047	{ .start =  0xb000, .end =  0xb3ff },
3048	{ .start =  0xc800, .end =  0xcfff },
3049	{ .start =  0xd800, .end =  0xd8ff },
3050	{ .start =  0xdc00, .end =  0xffff },
3051	{ .start = 0x17000, .end = 0x17fff },
3052	{ .start = 0x24a00, .end = 0x24a7f },
3053	{},
3054};
3055
3056static bool mcr_range(struct drm_i915_private *i915, u32 offset)
3057{
3058	const struct i915_range *mcr_ranges;
3059	int i;
3060
3061	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
3062		mcr_ranges = mcr_ranges_xehp;
3063	else if (GRAPHICS_VER(i915) >= 12)
3064		mcr_ranges = mcr_ranges_gen12;
3065	else if (GRAPHICS_VER(i915) >= 8)
3066		mcr_ranges = mcr_ranges_gen8;
3067	else
3068		return false;
3069
3070	/*
3071	 * Registers in these ranges are affected by the MCR selector
3072	 * which only controls CPU initiated MMIO. Routing does not
3073	 * work for CS access so we cannot verify them on this path.
3074	 */
3075	for (i = 0; mcr_ranges[i].start; i++)
3076		if (offset >= mcr_ranges[i].start &&
3077		    offset <= mcr_ranges[i].end)
3078			return true;
3079
3080	return false;
3081}
3082
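/*
 * Emit one MI_STORE_REGISTER_MEM per workaround register (skipping MCR
 * ranges) so that the command streamer dumps the values it observes into
 * the scratch buffer for later comparison.
 */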
3083static int
3084wa_list_srm(struct i915_request *rq,
3085	    const struct i915_wa_list *wal,
3086	    struct i915_vma *vma)
3087{
3088	struct drm_i915_private *i915 = rq->i915;
3089	unsigned int i, count = 0;
3090	const struct i915_wa *wa;
3091	u32 srm, *cs;
3092
3093	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
3094	if (GRAPHICS_VER(i915) >= 8)
3095		srm++;
3096
3097	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3098		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
3099			count++;
3100	}
3101
3102	cs = intel_ring_begin(rq, 4 * count);
3103	if (IS_ERR(cs))
3104		return PTR_ERR(cs);
3105
3106	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3107		u32 offset = i915_mmio_reg_offset(wa->reg);
3108
3109		if (mcr_range(i915, offset))
3110			continue;
3111
3112		*cs++ = srm;
3113		*cs++ = offset;
3114		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3115		*cs++ = 0;
3116	}
3117	intel_ring_advance(rq, cs);
3118
3119	return 0;
3120}
3121
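/*
 * Verify the workarounds from the engine's own point of view: run a request
 * that stores each register to a scratch buffer, wait for it to complete,
 * then compare the results against the expected values.
 */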
3122static int engine_wa_list_verify(struct intel_context *ce,
3123				 const struct i915_wa_list * const wal,
3124				 const char *from)
3125{
3126	const struct i915_wa *wa;
3127	struct i915_request *rq;
3128	struct i915_vma *vma;
3129	struct i915_gem_ww_ctx ww;
3130	unsigned int i;
3131	u32 *results;
3132	int err;
3133
3134	if (!wal->count)
3135		return 0;
3136
3137	vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3138					   wal->count * sizeof(u32));
3139	if (IS_ERR(vma))
3140		return PTR_ERR(vma);
3141
3142	intel_engine_pm_get(ce->engine);
3143	i915_gem_ww_ctx_init(&ww, false);
3144retry:
3145	err = i915_gem_object_lock(vma->obj, &ww);
3146	if (err == 0)
3147		err = intel_context_pin_ww(ce, &ww);
3148	if (err)
3149		goto err_pm;
3150
3151	err = i915_vma_pin_ww(vma, &ww, 0, 0,
3152			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3153	if (err)
3154		goto err_unpin;
3155
3156	rq = i915_request_create(ce);
3157	if (IS_ERR(rq)) {
3158		err = PTR_ERR(rq);
3159		goto err_vma;
3160	}
3161
3162	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
3163	if (err == 0)
3164		err = wa_list_srm(rq, wal, vma);
3165
3166	i915_request_get(rq);
3167	if (err)
3168		i915_request_set_error_once(rq, err);
3169	i915_request_add(rq);
3170
3171	if (err)
3172		goto err_rq;
3173
3174	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3175		err = -ETIME;
3176		goto err_rq;
3177	}
3178
3179	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3180	if (IS_ERR(results)) {
3181		err = PTR_ERR(results);
3182		goto err_rq;
3183	}
3184
3185	err = 0;
3186	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3187		if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
3188			continue;
3189
3190		if (!wa_verify(wal->gt, wa, results[i], wal->name, from))
3191			err = -ENXIO;
3192	}
3193
3194	i915_gem_object_unpin_map(vma->obj);
3195
3196err_rq:
3197	i915_request_put(rq);
3198err_vma:
3199	i915_vma_unpin(vma);
3200err_unpin:
3201	intel_context_unpin(ce);
3202err_pm:
3203	if (err == -EDEADLK) {
3204		err = i915_gem_ww_ctx_backoff(&ww);
3205		if (!err)
3206			goto retry;
3207	}
3208	i915_gem_ww_ctx_fini(&ww);
3209	intel_engine_pm_put(ce->engine);
3210	i915_vma_put(vma);
3211	return err;
3212}
3213
3214int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3215				    const char *from)
3216{
3217	return engine_wa_list_verify(engine->kernel_context,
3218				     &engine->wa_list,
3219				     from);
3220}
3221
3222#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3223#include "selftest_workarounds.c"
3224#endif