intel_workarounds.c - drivers/gpu/drm/i915/gt/intel_workarounds.c - Linux diff v6.13.7

   1// SPDX-License-Identifier: MIT
   2/*
 
 
   3 * Copyright © 2014-2018 Intel Corporation
   4 */
   5
   6#include "i915_drv.h"
   7#include "i915_reg.h"
   8#include "intel_context.h"
   9#include "intel_engine_pm.h"
  10#include "intel_engine_regs.h"
  11#include "intel_gpu_commands.h"
  12#include "intel_gt.h"
  13#include "intel_gt_ccs_mode.h"
  14#include "intel_gt_mcr.h"
  15#include "intel_gt_print.h"
  16#include "intel_gt_regs.h"
  17#include "intel_ring.h"
  18#include "intel_workarounds.h"
  19
  20#include "display/intel_fbc_regs.h"
  21
  22/**
  23 * DOC: Hardware workarounds
  24 *
  25 * Hardware workarounds are register programming documented to be executed in
  26 * the driver that fall outside of the normal programming sequences for a
  27 * platform. There are some basic categories of workarounds, depending on
  28 * how/when they are applied:
  29 *
  30 * - Context workarounds: workarounds that touch registers that are
  31 *   saved/restored to/from the HW context image. The list is emitted (via Load
  32 *   Register Immediate commands) once when initializing the device and saved in
  33 *   the default context. That default context is then used on every context
  34 *   creation to have a "primed golden context", i.e. a context image that
  35 *   already contains the changes needed to all the registers.
  36 *
  37 *   Context workarounds should be implemented in the \*_ctx_workarounds_init()
  38 *   variants respective to the targeted platforms.
  39 *
  40 * - Engine workarounds: the list of these WAs is applied whenever the specific
  41 *   engine is reset. It's also possible that a set of engine classes share a
  42 *   common power domain and they are reset together. This happens on some
  43 *   platforms with render and compute engines. In this case (at least) one of
  44 *   them needs to keep the workaround programming: the approach taken in the
  45 *   driver is to tie those workarounds to the first compute/render engine that
  46 *   is registered.  When executing with GuC submission, engine resets are
  47 *   outside of kernel driver control, hence the list of registers involved is
  48 *   written once, on engine initialization, and then passed to GuC, which
  49 *   saves/restores their values before/after the reset takes place. See
  50 *   ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference.
  51 *
  52 *   Workarounds for registers specific to RCS and CCS should be implemented in
  53 *   rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for
  54 *   registers belonging to BCS, VCS or VECS should be implemented in
  55 *   xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
  56 *   engine's MMIO range but that are part of the common RCS/CCS reset domain
  57 *   should be implemented in general_render_compute_wa_init(). The settings
  58 *   about the CCS load balancing should be added in ccs_engine_wa_mode().
  59 *
  60 * - GT workarounds: the list of these WAs is applied whenever these registers
  61 *   revert to their default values: on GPU reset, suspend/resume [1]_, etc.
 
 
 
 
 
 
 
 
 
 
 
  62 *
  63 *   GT workarounds should be implemented in the \*_gt_workarounds_init()
  64 *   variants respective to the targeted platforms.
 
  65 *
  66 * - Register whitelist: some workarounds need to be implemented in userspace,
  67 *   but need to touch privileged registers. The whitelist in the kernel
  68 *   instructs the hardware to allow the access to happen. From the kernel side,
  69 *   this is just a special case of an MMIO workaround (as we write the list of
  70 *   these to-be-whitelisted registers to some special HW registers).
  71 *
  72 *   Register whitelisting should be done in the \*_whitelist_build() variants
  73 *   respective to the targeted platforms.
  74 *
  75 * - Workaround batchbuffers: buffers that get executed automatically by the
  76 *   hardware on every HW context restore. These buffers are created and
  77 *   programmed in the default context so the hardware always goes through those
  78 *   programming sequences when switching contexts. The support for workaround
  79 *   batchbuffers is enabled by these hardware mechanisms:
  80 *
  81 *   #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default
  82 *      context, pointing the hardware to jump to that location when that offset
  83 *      is reached in the context restore. Workaround batchbuffer in the driver
  84 *      currently uses this mechanism for all platforms.
  85 *
  86 *   #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context,
  87 *      pointing the hardware to a buffer to continue executing after the
  88 *      engine registers are restored in a context restore sequence. This is
  89 *      currently not used in the driver.
  90 *
  91 * - Other:  There are WAs that, due to their nature, cannot be applied from a
  92 *   central place. Those are peppered around the rest of the code, as needed.
  93 *   Workarounds related to the display IP are the main example.
  94 *
  95 * .. [1] Technically, some registers are powercontext saved & restored, so they
  96 *    survive a suspend/resume. In practice, writing them again is not too
  97 *    costly and simplifies things, so it's the approach taken in the driver.
  98 */
  99
 100static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt,
 101			  const char *name, const char *engine_name)
 102{
 103	wal->gt = gt;
 104	wal->name = name;
 105	wal->engine_name = engine_name;
 106}
 107
 108#define WA_LIST_CHUNK (1 << 4)
 109
 110static void wa_init_finish(struct i915_wa_list *wal)
 111{
 112	/* Trim unused entries. */
 113	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
 114		struct i915_wa *list = kmemdup_array(wal->list, wal->count,
 115						     sizeof(*list), GFP_KERNEL);
 
 116
 117		if (list) {
 118			kfree(wal->list);
 119			wal->list = list;
 120		}
 121	}
 122
 123	if (!wal->count)
 124		return;
 125
 126	gt_dbg(wal->gt, "Initialized %u %s workarounds on %s\n",
 127	       wal->wa_count, wal->name, wal->engine_name);
 128}
 129
 130static enum forcewake_domains
 131wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
 132{
 133	enum forcewake_domains fw = 0;
 134	struct i915_wa *wa;
 135	unsigned int i;
 136
 137	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
 138		fw |= intel_uncore_forcewake_for_reg(uncore,
 139						     wa->reg,
 140						     FW_REG_READ |
 141						     FW_REG_WRITE);
 142
 143	return fw;
 144}
 145
 146static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
 147{
 148	unsigned int addr = i915_mmio_reg_offset(wa->reg);
 149	struct drm_i915_private *i915 = wal->gt->i915;
 150	unsigned int start = 0, end = wal->count;
 151	const unsigned int grow = WA_LIST_CHUNK;
 152	struct i915_wa *wa_;
 153
 154	GEM_BUG_ON(!is_power_of_2(grow));
 155
 156	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
 157		struct i915_wa *list;
 158
 159		list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
 160				     GFP_KERNEL);
 161		if (!list) {
 162			drm_err(&i915->drm, "No space for workaround init!\n");
 163			return;
 164		}
 165
 166		if (wal->list) {
 167			memcpy(list, wal->list, sizeof(*wa) * wal->count);
 168			kfree(wal->list);
 169		}
 170
 171		wal->list = list;
 172	}
 173
 174	while (start < end) {
 175		unsigned int mid = start + (end - start) / 2;
 176
 177		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
 178			start = mid + 1;
 179		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
 180			end = mid;
 181		} else {
 182			wa_ = &wal->list[mid];
 183
 184			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
 185				drm_err(&i915->drm,
 186					"Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
 187					i915_mmio_reg_offset(wa_->reg),
 188					wa_->clr, wa_->set);
 189
 190				wa_->set &= ~wa->clr;
 191			}
 192
 193			wal->wa_count++;
 194			wa_->set |= wa->set;
 195			wa_->clr |= wa->clr;
 196			wa_->read |= wa->read;
 197			return;
 198		}
 199	}
 200
 201	wal->wa_count++;
 202	wa_ = &wal->list[wal->count++];
 203	*wa_ = *wa;
 204
 205	while (wa_-- > wal->list) {
 206		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
 207			   i915_mmio_reg_offset(wa_[1].reg));
 208		if (i915_mmio_reg_offset(wa_[1].reg) >
 209		    i915_mmio_reg_offset(wa_[0].reg))
 210			break;
 211
 212		swap(wa_[1], wa_[0]);
 213	}
 214}
 215
 216static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
 217		   u32 clear, u32 set, u32 read_mask, bool masked_reg)
 218{
 219	struct i915_wa wa = {
 220		.reg  = reg,
 221		.clr  = clear,
 222		.set  = set,
 223		.read = read_mask,
 224		.masked_reg = masked_reg,
 225	};
 226
 227	_wa_add(wal, &wa);
 228}
 229
 230static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
 231		       u32 clear, u32 set, u32 read_mask, bool masked_reg)
 232{
 233	struct i915_wa wa = {
 234		.mcr_reg = reg,
 235		.clr  = clear,
 236		.set  = set,
 237		.read = read_mask,
 238		.masked_reg = masked_reg,
 239		.is_mcr = 1,
 240	};
 241
 242	_wa_add(wal, &wa);
 243}
 244
 245static void
 246wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
 247{
 248	wa_add(wal, reg, clear, set, clear | set, false);
 249}
 250
 251static void
 252wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
 253{
 254	wa_mcr_add(wal, reg, clear, set, clear | set, false);
 255}
 256
 257static void
 258wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 259{
 260	wa_write_clr_set(wal, reg, ~0, set);
 261}
 262
 263static void
 264wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 265{
 266	wa_write_clr_set(wal, reg, set, set);
 267}
 268
 269static void
 270wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
 271{
 272	wa_mcr_write_clr_set(wal, reg, set, set);
 273}
 274
 275static void
 276wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
 277{
 278	wa_write_clr_set(wal, reg, clr, 0);
 279}
 280
 281static void
 282wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
 283{
 284	wa_mcr_write_clr_set(wal, reg, clr, 0);
 285}
 286
 287/*
 288 * WA operations on "masked register". A masked register has the upper 16 bits
 289 * documented as "masked" in b-spec. Its purpose is to allow writing to just a
 290 * portion of the register without a rmw: you simply write in the upper 16 bits
 291 * the mask of bits you are going to modify.
 292 *
 293 * The wa_masked_* family of functions already does the necessary operations to
 294 * calculate the mask based on the parameters passed, so user only has to
 295 * provide the lower 16 bits of that register.
 296 */
 297
 298static void
 299wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 300{
 301	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
 302}
 303
 304static void
 305wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
 306{
 307	wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
 308}
 309
 310static void
 311wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 312{
 313	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
 314}
 315
 316static void
 317wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
 318{
 319	wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
 320}
 321
 322static void
 323wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
 324		    u32 mask, u32 val)
 325{
 326	wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
 327}
 328
 329static void
 330wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
 331			u32 mask, u32 val)
 332{
 333	wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
 334}
 335
 336static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
 337				      struct i915_wa_list *wal)
 338{
 339	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 340}
 341
 342static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
 343				      struct i915_wa_list *wal)
 344{
 345	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 346}
 347
 348static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
 349				      struct i915_wa_list *wal)
 350{
 351	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
 352
 353	/* WaDisableAsyncFlipPerfMode:bdw,chv */
 354	wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
 355
 356	/* WaDisablePartialInstShootdown:bdw,chv */
 357	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 358			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 359
 360	/* Use Force Non-Coherent whenever executing a 3D context. This is a
 361	 * workaround for a possible hang in the unlikely event a TLB
 362	 * invalidation occurs during a PSD flush.
 363	 */
 364	/* WaForceEnableNonCoherent:bdw,chv */
 365	/* WaHdcDisableFetchWhenMasked:bdw,chv */
 366	wa_masked_en(wal, HDC_CHICKEN0,
 367		     HDC_DONOT_FETCH_MEM_WHEN_MASKED |
 368		     HDC_FORCE_NON_COHERENT);
 369
 370	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
 371	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
 372	 *  polygons in the same 8x4 pixel/sample area to be processed without
 373	 *  stalling waiting for the earlier ones to write to Hierarchical Z
 374	 *  buffer."
 375	 *
 376	 * This optimization is off by default for BDW and CHV; turn it on.
 377	 */
 378	wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
 379
 380	/* Wa4x4STCOptimizationDisable:bdw,chv */
 381	wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
 382
 383	/*
 384	 * BSpec recommends 8x4 when MSAA is used,
 385	 * however in practice 16x4 seems fastest.
 386	 *
 387	 * Note that PS/WM thread counts depend on the WIZ hashing
 388	 * disable bit, which we don't touch here, but it's good
 389	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 390	 */
 391	wa_masked_field_set(wal, GEN7_GT_MODE,
 392			    GEN6_WIZ_HASHING_MASK,
 393			    GEN6_WIZ_HASHING_16x4);
 394}
 395
 396static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
 397				     struct i915_wa_list *wal)
 398{
 399	struct drm_i915_private *i915 = engine->i915;
 400
 401	gen8_ctx_workarounds_init(engine, wal);
 402
 403	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
 404	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 405
 406	/* WaDisableDopClockGating:bdw
 407	 *
 408	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
 409	 * to disable EUTC clock gating.
 410	 */
 411	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
 412			 DOP_CLOCK_GATING_DISABLE);
 413
 414	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
 415			 GEN8_SAMPLER_POWER_BYPASS_DIS);
 416
 417	wa_masked_en(wal, HDC_CHICKEN0,
 418		     /* WaForceContextSaveRestoreNonCoherent:bdw */
 419		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 420		     /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
 421		     (INTEL_INFO(i915)->gt == 3 ? HDC_FENCE_DEST_SLM_DISABLE : 0));
 422}
 423
 424static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
 425				     struct i915_wa_list *wal)
 426{
 427	gen8_ctx_workarounds_init(engine, wal);
 428
 429	/* WaDisableThreadStallDopClockGating:chv */
 430	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 431
 432	/* Improve HiZ throughput on CHV. */
 433	wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
 434}
 435
 436static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
 437				      struct i915_wa_list *wal)
 438{
 439	struct drm_i915_private *i915 = engine->i915;
 440
 441	if (HAS_LLC(i915)) {
 442		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
 443		 *
 444		 * Must match Display Engine. See
 445		 * WaCompressedResourceDisplayNewHashMode.
 446		 */
 447		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 448			     GEN9_PBE_COMPRESSED_HASH_SELECTION);
 449		wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
 450				 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
 451	}
 452
 453	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
 454	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
 455	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 456			 FLOW_CONTROL_ENABLE |
 457			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 458
 459	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
 460	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
 461	wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
 462			 GEN9_ENABLE_YV12_BUGFIX |
 463			 GEN9_ENABLE_GPGPU_PREEMPTION);
 464
 465	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
 466	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
 467	wa_masked_en(wal, CACHE_MODE_1,
 468		     GEN8_4x4_STC_OPTIMIZATION_DISABLE |
 469		     GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
 470
 471	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
 472	wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
 473			  GEN9_CCS_TLB_PREFETCH_ENABLE);
 474
 475	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
 476	wa_masked_en(wal, HDC_CHICKEN0,
 477		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 478		     HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
 479
 480	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
 481	 * both tied to WaForceContextSaveRestoreNonCoherent
 482	 * in some hsds for skl. We keep the tie for all gen9. The
 483	 * documentation is a bit hazy and so we want to get common behaviour,
 484	 * even though there is no clear evidence we would need both on kbl/bxt.
 485	 * This area has been source of system hangs so we play it safe
 486	 * and mimic the skl regardless of what bspec says.
 487	 *
 488	 * Use Force Non-Coherent whenever executing a 3D context. This
 489	 * is a workaround for a possible hang in the unlikely event
 490	 * a TLB invalidation occurs during a PSD flush.
 491	 */
 492
 493	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
 494	wa_masked_en(wal, HDC_CHICKEN0,
 495		     HDC_FORCE_NON_COHERENT);
 496
 497	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
 498	if (IS_SKYLAKE(i915) ||
 499	    IS_KABYLAKE(i915) ||
 500	    IS_COFFEELAKE(i915) ||
 501	    IS_COMETLAKE(i915))
 502		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
 503				 GEN8_SAMPLER_POWER_BYPASS_DIS);
 504
 505	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
 506	wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
 507
 508	/*
 509	 * Supporting preemption with fine-granularity requires changes in the
 510	 * batch buffer programming. Since we can't break old userspace, we
 511	 * need to set our default preemption level to safe value. Userspace is
 512	 * still able to use more fine-grained preemption levels, since in
 513	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
 514	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
 515	 * not real HW workarounds, but merely a way to start using preemption
 516	 * while maintaining old contract with userspace.
 517	 */
 518
 519	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
 520	wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
 521
 522	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */
 523	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 524			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 525			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 526
 527	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
 528	if (IS_GEN9_LP(i915))
 529		wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
 530}
 531
 532static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
 533				struct i915_wa_list *wal)
 534{
 535	struct intel_gt *gt = engine->gt;
 536	u8 vals[3] = { 0, 0, 0 };
 537	unsigned int i;
 538
 539	for (i = 0; i < 3; i++) {
 540		u8 ss;
 541
 542		/*
 543		 * Only consider slices where one, and only one, subslice has 7
 544		 * EUs
 545		 */
 546		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
 547			continue;
 548
 549		/*
 550		 * subslice_7eu[i] != 0 (because of the check above) and
 551		 * ss_max == 4 (maximum number of subslices possible per slice)
 552		 *
 553		 * ->    0 <= ss <= 3;
 554		 */
 555		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
 556		vals[i] = 3 - ss;
 557	}
 558
 559	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
 560		return;
 561
 562	/* Tune IZ hashing. See intel_device_info_runtime_init() */
 563	wa_masked_field_set(wal, GEN7_GT_MODE,
 564			    GEN9_IZ_HASHING_MASK(2) |
 565			    GEN9_IZ_HASHING_MASK(1) |
 566			    GEN9_IZ_HASHING_MASK(0),
 567			    GEN9_IZ_HASHING(2, vals[2]) |
 568			    GEN9_IZ_HASHING(1, vals[1]) |
 569			    GEN9_IZ_HASHING(0, vals[0]));
 570}
 571
 static void
skl_ctx_workarounds_init(struct intel_engine_cs *engine,
			 struct i915_wa_list *wal)
{
	/* SKL: the common gen9 list, then the IZ hashing tuning. */
	gen9_ctx_workarounds_init(engine, wal);

	skl_tune_iz_hashing(engine, wal);
}
 578
 579static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
 580				     struct i915_wa_list *wal)
 581{
 582	gen9_ctx_workarounds_init(engine, wal);
 583
 584	/* WaDisableThreadStallDopClockGating:bxt */
 585	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
 586			 STALL_DOP_GATING_DISABLE);
 587
 588	/* WaToEnableHwFixForPushConstHWBug:bxt */
 589	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 590		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 591}
 592
 593static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
 594				     struct i915_wa_list *wal)
 595{
 596	struct drm_i915_private *i915 = engine->i915;
 597
 598	gen9_ctx_workarounds_init(engine, wal);
 599
 600	/* WaToEnableHwFixForPushConstHWBug:kbl */
 601	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
 602		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 603			     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 604
 605	/* WaDisableSbeCacheDispatchPortSharing:kbl */
 606	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
 607			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 608}
 609
 610static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
 611				     struct i915_wa_list *wal)
 612{
 613	gen9_ctx_workarounds_init(engine, wal);
 614
 615	/* WaToEnableHwFixForPushConstHWBug:glk */
 616	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 617		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 618}
 619
 620static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
 621				     struct i915_wa_list *wal)
 622{
 623	gen9_ctx_workarounds_init(engine, wal);
 624
 625	/* WaToEnableHwFixForPushConstHWBug:cfl */
 626	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
 627		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 628
 629	/* WaDisableSbeCacheDispatchPortSharing:cfl */
 630	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
 631			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 632}
 633
 634static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
 635				     struct i915_wa_list *wal)
 636{
 637	/* Wa_1406697149 (WaDisableBankHangMode:icl) */
 638	wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);
 
 
 
 
 
 
 
 
 
 
 
 
 639
 640	/* WaForceEnableNonCoherent:icl
 641	 * This is not the same workaround as in early Gen9 platforms, where
 642	 * lacking this could cause system hangs, but coherency performance
 643	 * overhead is high and only a few compute workloads really need it
 644	 * (the register is whitelisted in hardware now, so UMDs can opt in
 645	 * for coherency if they have a good reason).
 646	 */
 647	wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
 
 
 
 
 
 
 
 
 
 
 
 
 648
 649	/* WaEnableFloatBlendOptimization:icl */
 650	wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
 651		   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
 652		   0 /* write-only, so skip validation */,
 653		   true);
 654
 655	/* WaDisableGPGPUMidThreadPreemption:icl */
 656	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 657			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 658			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 659
 660	/* allow headerless messages for preemptible GPGPU context */
 661	wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
 662			 GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
 663
 664	/* Wa_1604278689:icl,ehl */
 665	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
 666	wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
 667			 0,
 668			 0xFFFFFFFF);
 669
 670	/* Wa_1406306137:icl,ehl */
 671	wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
 672}
 673
 674/*
 675 * These settings aren't actually workarounds, but general tuning settings that
 676 * need to be programmed on dg2 platform.
 677 */
 678static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
 679				   struct i915_wa_list *wal)
 680{
 681	wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
 682	wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
 683			     REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
 684	wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
 685			     FF_MODE2_TDS_TIMER_128);
 686}
 687
 688static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
 689				       struct i915_wa_list *wal)
 690{
 691	struct drm_i915_private *i915 = engine->i915;
 692
 693	/*
 694	 * Wa_1409142259:tgl,dg1,adl-p
 695	 * Wa_1409347922:tgl,dg1,adl-p
 696	 * Wa_1409252684:tgl,dg1,adl-p
 697	 * Wa_1409217633:tgl,dg1,adl-p
 698	 * Wa_1409207793:tgl,dg1,adl-p
 699	 * Wa_1409178076:tgl,dg1,adl-p
 700	 * Wa_1408979724:tgl,dg1,adl-p
 701	 * Wa_14010443199:tgl,rkl,dg1,adl-p
 702	 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
 703	 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
 
 
 
 
 
 
 
 704	 */
 705	wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
 706		     GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
 
 
 
 707
 708	/* WaDisableGPGPUMidThreadPreemption:gen12 */
 709	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
 710			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 711			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 712
 713	/*
 714	 * Wa_16011163337 - GS_TIMER
 715	 *
 716	 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we
 717	 * need to program it even on those that don't explicitly list that
 718	 * workaround.
 719	 *
 720	 * Note that the programming of GEN12_FF_MODE2 is further modified
 721	 * according to the FF_MODE2 guidance given by Wa_1608008084.
 722	 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
 723	 * value when read from the CPU.
 724	 *
 725	 * The default value for this register is zero for all fields.
 726	 * So instead of doing a RMW we should just write the desired values
 727	 * for TDS and GS timers. Note that since the readback can't be trusted,
 728	 * the clear mask is just set to ~0 to make sure other bits are not
 729	 * inadvertently set. For the same reason read verification is ignored.
 730	 */
 731	wa_add(wal,
 732	       GEN12_FF_MODE2,
 733	       ~0,
 734	       FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224,
 735	       0, false);
 736
 737	if (!IS_DG1(i915)) {
 738		/* Wa_1806527549 */
 739		wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
 740
 741		/* Wa_1606376872 */
 742		wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC);
 743	}
 744}
 745
 746static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
 747				     struct i915_wa_list *wal)
 748{
 749	gen12_ctx_workarounds_init(engine, wal);
 750
 751	/* Wa_1409044764 */
 752	wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
 753		      DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
 754
 755	/* Wa_22010493298 */
 756	wa_masked_en(wal, HIZ_CHICKEN,
 757		     DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
 758}
 759
 760static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
 761				     struct i915_wa_list *wal)
 762{
 763	dg2_ctx_gt_tuning_init(engine, wal);
 764
 765	/* Wa_16013271637:dg2 */
 766	wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
 767			 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
 768
 769	/* Wa_14014947963:dg2 */
 770	wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
 771
 772	/* Wa_18018764978:dg2 */
 773	wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
 774
 775	/* Wa_18019271663:dg2 */
 776	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
 777
 778	/* Wa_14019877138:dg2 */
 779	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
 780}
 781
 782static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine,
 783				     struct i915_wa_list *wal)
 784{
 785	struct intel_gt *gt = engine->gt;
 786
 787	dg2_ctx_gt_tuning_init(engine, wal);
 788
 789	/*
 790	 * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in
 791	 * gen12_emit_indirect_ctx_rcs() rather than here on some early
 792	 * steppings.
 793	 */
 794	if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
 795	      IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)))
 796		wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false);
 797}
 798
 799static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine,
 800				       struct i915_wa_list *wal)
 801{
 802	struct intel_gt *gt = engine->gt;
 803
 804	xelpg_ctx_gt_tuning_init(engine, wal);
 805
 806	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
 807	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
 808		/* Wa_14014947963 */
 809		wa_masked_field_set(wal, VF_PREEMPTION,
 810				    PREEMPTION_VERTEX_COUNT, 0x4000);
 811
 812		/* Wa_16013271637 */
 813		wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
 814				 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
 815
 816		/* Wa_18019627453 */
 817		wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS);
 818
 819		/* Wa_18018764978 */
 820		wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
 821	}
 822
 823	/* Wa_18019271663 */
 824	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
 825
 826	/* Wa_14019877138 */
 827	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
 828}
 829
 830static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
 831					 struct i915_wa_list *wal)
 832{
 833	/*
 834	 * This is a "fake" workaround defined by software to ensure we
 835	 * maintain reliable, backward-compatible behavior for userspace with
 836	 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
 837	 *
 838	 * The per-context setting of MI_MODE[12] determines whether the bits
 839	 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
 840	 * in the traditional manner or whether they should instead use a new
 841	 * tgl+ meaning that breaks backward compatibility, but allows nesting
 842	 * into 3rd-level batchbuffers.  When this new capability was first
 843	 * added in TGL, it remained off by default unless a context
 844	 * intentionally opted in to the new behavior.  However Xe_HPG now
 845	 * flips this on by default and requires that we explicitly opt out if
 846	 * we don't want the new behavior.
 847	 *
 848	 * From a SW perspective, we want to maintain the backward-compatible
 849	 * behavior for userspace, so we'll apply a fake workaround to set it
 850	 * back to the legacy behavior on platforms where the hardware default
 851	 * is to break compatibility.  At the moment there is no Linux
 852	 * userspace that utilizes third-level batchbuffers, so this will avoid
 853	 * userspace from needing to make any changes.  using the legacy
 854	 * meaning is the correct thing to do.  If/when we have userspace
 855	 * consumers that want to utilize third-level batch nesting, we can
 856	 * provide a context parameter to allow them to opt-in.
 857	 */
 858	wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
 859}
 860
 861static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
 862				   struct i915_wa_list *wal)
 863{
 864	u8 mocs;
 865
 866	/*
 867	 * Some blitter commands do not have a field for MOCS, those
 868	 * commands will use MOCS index pointed by BLIT_CCTL.
 869	 * BLIT_CCTL registers are needed to be programmed to un-cached.
 870	 */
 871	if (engine->class == COPY_ENGINE_CLASS) {
 872		mocs = engine->gt->mocs.uc_index;
 873		wa_write_clr_set(wal,
 874				 BLIT_CCTL(engine->mmio_base),
 875				 BLIT_CCTL_MASK,
 876				 BLIT_CCTL_MOCS(mocs, mocs));
 877	}
 878}
 879
 880/*
 881 * gen12_ctx_gt_fake_wa_init() aren't programmingan official workaround
 882 * defined by the hardware team, but it programming general context registers.
 883 * Adding those context register programming in context workaround
 884 * allow us to use the wa framework for proper application and validation.
 885 */
 886static void
 887gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
 888			  struct i915_wa_list *wal)
 889{
 890	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
 891		fakewa_disable_nestedbb_mode(engine, wal);
 892
 893	gen12_ctx_gt_mocs_init(engine, wal);
 894}
 895
 896static void
 897__intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
 898			   struct i915_wa_list *wal,
 899			   const char *name)
 900{
 901	struct drm_i915_private *i915 = engine->i915;
 902
 903	wa_init_start(wal, engine->gt, name, engine->name);
 904
 905	/* Applies to all engines */
 906	/*
 907	 * Fake workarounds are not the actual workaround but
 908	 * programming of context registers using workaround framework.
 909	 */
 910	if (GRAPHICS_VER(i915) >= 12)
 911		gen12_ctx_gt_fake_wa_init(engine, wal);
 912
 913	if (engine->class != RENDER_CLASS)
 914		goto done;
 
 
 915
 916	if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
 917		xelpg_ctx_workarounds_init(engine, wal);
 918	else if (IS_DG2(i915))
 919		dg2_ctx_workarounds_init(engine, wal);
 920	else if (IS_DG1(i915))
 921		dg1_ctx_workarounds_init(engine, wal);
 922	else if (GRAPHICS_VER(i915) == 12)
 923		gen12_ctx_workarounds_init(engine, wal);
 924	else if (GRAPHICS_VER(i915) == 11)
 925		icl_ctx_workarounds_init(engine, wal);
 
 
 926	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
 927		cfl_ctx_workarounds_init(engine, wal);
 928	else if (IS_GEMINILAKE(i915))
 929		glk_ctx_workarounds_init(engine, wal);
 930	else if (IS_KABYLAKE(i915))
 931		kbl_ctx_workarounds_init(engine, wal);
 932	else if (IS_BROXTON(i915))
 933		bxt_ctx_workarounds_init(engine, wal);
 934	else if (IS_SKYLAKE(i915))
 935		skl_ctx_workarounds_init(engine, wal);
 936	else if (IS_CHERRYVIEW(i915))
 937		chv_ctx_workarounds_init(engine, wal);
 938	else if (IS_BROADWELL(i915))
 939		bdw_ctx_workarounds_init(engine, wal);
 940	else if (GRAPHICS_VER(i915) == 7)
 941		gen7_ctx_workarounds_init(engine, wal);
 942	else if (GRAPHICS_VER(i915) == 6)
 943		gen6_ctx_workarounds_init(engine, wal);
 944	else if (GRAPHICS_VER(i915) < 8)
 945		;
 946	else
 947		MISSING_CASE(GRAPHICS_VER(i915));
 948
 949done:
 950	wa_init_finish(wal);
 951}
 952
 953void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
 954{
 955	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
 956}
 957
 958int intel_engine_emit_ctx_wa(struct i915_request *rq)
 959{
 960	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
 961	struct intel_uncore *uncore = rq->engine->uncore;
 962	enum forcewake_domains fw;
 963	unsigned long flags;
 964	struct i915_wa *wa;
 965	unsigned int i;
 966	u32 *cs;
 967	int ret;
 968
 969	if (wal->count == 0)
 970		return 0;
 971
 972	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
 973	if (ret)
 974		return ret;
 975
 976	if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
 977	     IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS)
 978		cs = intel_ring_begin(rq, (wal->count * 2 + 6));
 979	else
 980		cs = intel_ring_begin(rq, (wal->count * 2 + 2));
 981
 982	if (IS_ERR(cs))
 983		return PTR_ERR(cs);
 984
 985	fw = wal_get_fw_for_rmw(uncore, wal);
 986
 987	intel_gt_mcr_lock(wal->gt, &flags);
 988	spin_lock(&uncore->lock);
 989	intel_uncore_forcewake_get__locked(uncore, fw);
 990
 991	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
 992	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
 993		u32 val;
 994
 995		/* Skip reading the register if it's not really needed */
 996		if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) {
 997			val = wa->set;
 998		} else {
 999			val = wa->is_mcr ?
1000				intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) :
1001				intel_uncore_read_fw(uncore, wa->reg);
1002			val &= ~wa->clr;
1003			val |= wa->set;
1004		}
1005
1006		*cs++ = i915_mmio_reg_offset(wa->reg);
1007		*cs++ = val;
1008	}
1009	*cs++ = MI_NOOP;
1010
1011	/* Wa_14019789679 */
1012	if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
1013	     IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS) {
1014		*cs++ = CMD_3DSTATE_MESH_CONTROL;
1015		*cs++ = 0;
1016		*cs++ = 0;
1017		*cs++ = MI_NOOP;
1018	}
1019
1020	intel_uncore_forcewake_put__locked(uncore, fw);
1021	spin_unlock(&uncore->lock);
1022	intel_gt_mcr_unlock(wal->gt, flags);
1023
1024	intel_ring_advance(rq, cs);
1025
1026	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
1027	if (ret)
1028		return ret;
1029
1030	return 0;
1031}
1032
1033static void
1034gen4_gt_workarounds_init(struct intel_gt *gt,
1035			 struct i915_wa_list *wal)
1036{
1037	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
1038	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
1039}
1040
1041static void
1042g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1043{
1044	gen4_gt_workarounds_init(gt, wal);
1045
1046	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
1047	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
1048}
1049
1050static void
1051ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1052{
1053	g4x_gt_workarounds_init(gt, wal);
1054
1055	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
1056}
1057
/* GT workarounds for snb: this function adds no entries to @wal. */
static void snb_gt_workarounds_init(struct intel_gt *gt,
				    struct i915_wa_list *wal)
{
}
1062
1063static void
1064ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1065{
 
 
 
 
 
 
 
 
 
 
 
 
1066	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
1067	wa_masked_dis(wal,
1068		      GEN7_COMMON_SLICE_CHICKEN1,
1069		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
1070
1071	/* WaApplyL3ControlAndL3ChickenMode:ivb */
1072	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
1073	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
1074
1075	/* WaForceL3Serialization:ivb */
1076	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1077}
1078
1079static void
1080vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1081{
 
 
 
 
 
 
 
 
 
 
 
 
 
1082	/* WaForceL3Serialization:vlv */
1083	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1084
1085	/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1086	 * WaIncreaseL3CreditsForVLVB0:vlv
1087	 * This is the hardware default actually.
1088	 */
1089	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
1090}
1091
1092static void
1093hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1094{
1095	/* L3 caching of data atomics doesn't work -- disable it. */
1096	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
1097
1098	wa_add(wal,
1099	       HSW_ROW_CHICKEN3, 0,
1100	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
1101	       0 /* XXX does this reg exist? */, true);
1102
1103	/* WaVSRefCountFullforceMissDisable:hsw */
1104	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
1105}
1106
1107static void
1108gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1109{
1110	const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
1111	unsigned int slice, subslice;
1112	u32 mcr, mcr_mask;
1113
1114	GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
 
 
 
 
 
1115
1116	/*
1117	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1118	 * Before any MMIO read into slice/subslice specific registers, MCR
1119	 * packet control register needs to be programmed to point to any
1120	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
1121	 * This means each subsequent MMIO read will be forwarded to an
1122	 * specific s/ss combination, but this is OK since these registers
1123	 * are consistent across s/ss in almost all cases. In the rare
1124	 * occasions, such as INSTDONE, where this value is dependent
1125	 * on s/ss combo, the read should be done with read_subslice_reg.
1126	 */
1127	slice = ffs(sseu->slice_mask) - 1;
1128	GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1129	subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1130	GEM_BUG_ON(!subslice);
1131	subslice--;
1132
1133	/*
1134	 * We use GEN8_MCR..() macros to calculate the |mcr| value for
1135	 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
 
 
 
 
1136	 */
1137	mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1138	mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1139
1140	drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1141
1142	wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
 
1143}
1144
1145static void
1146gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1147{
1148	struct drm_i915_private *i915 = gt->i915;
1149
1150	/* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1151	gen9_wa_init_mcr(i915, wal);
1152
1153	/* WaDisableKillLogic:bxt,skl,kbl */
1154	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1155		wa_write_or(wal,
1156			    GAM_ECOCHK,
1157			    ECOCHK_DIS_TLB);
1158
1159	if (HAS_LLC(i915)) {
1160		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1161		 *
1162		 * Must match Display Engine. See
1163		 * WaCompressedResourceDisplayNewHashMode.
1164		 */
1165		wa_write_or(wal,
1166			    MMCD_MISC_CTRL,
1167			    MMCD_PCLA | MMCD_HOTSPOT_EN);
1168	}
1169
1170	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1171	wa_write_or(wal,
1172		    GAM_ECOCHK,
1173		    BDW_DISABLE_HDC_INVALIDATION);
1174}
1175
1176static void
1177skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1178{
1179	gen9_gt_workarounds_init(gt, wal);
1180
1181	/* WaDisableGafsUnitClkGating:skl */
1182	wa_write_or(wal,
1183		    GEN7_UCGCTL4,
1184		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1185
1186	/* WaInPlaceDecompressionHang:skl */
1187	if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1188		wa_write_or(wal,
1189			    GEN9_GAMT_ECO_REG_RW_IA,
1190			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1191}
1192
1193static void
1194kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
 
 
 
 
 
 
 
 
 
 
 
1195{
1196	gen9_gt_workarounds_init(gt, wal);
1197
1198	/* WaDisableDynamicCreditSharing:kbl */
1199	if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1200		wa_write_or(wal,
1201			    GAMT_CHKN_BIT_REG,
1202			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1203
1204	/* WaDisableGafsUnitClkGating:kbl */
1205	wa_write_or(wal,
1206		    GEN7_UCGCTL4,
1207		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1208
1209	/* WaInPlaceDecompressionHang:kbl */
1210	wa_write_or(wal,
1211		    GEN9_GAMT_ECO_REG_RW_IA,
1212		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1213}
1214
/* GT workarounds for glk: only the common gen9 set is added. */
static void glk_gt_workarounds_init(struct intel_gt *gt,
				    struct i915_wa_list *wal)
{
	gen9_gt_workarounds_init(gt, wal);
}
1220
1221static void
1222cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1223{
1224	gen9_gt_workarounds_init(gt, wal);
1225
1226	/* WaDisableGafsUnitClkGating:cfl */
1227	wa_write_or(wal,
1228		    GEN7_UCGCTL4,
1229		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1230
1231	/* WaInPlaceDecompressionHang:cfl */
1232	wa_write_or(wal,
1233		    GEN9_GAMT_ECO_REG_RW_IA,
1234		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1235}
1236
1237static void __set_mcr_steering(struct i915_wa_list *wal,
1238			       i915_reg_t steering_reg,
1239			       unsigned int slice, unsigned int subslice)
1240{
1241	u32 mcr, mcr_mask;
1242
1243	mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1244	mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1245
1246	wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1247}
1248
1249static void debug_dump_steering(struct intel_gt *gt)
1250{
1251	struct drm_printer p = drm_dbg_printer(&gt->i915->drm, DRM_UT_DRIVER,
1252					       "MCR Steering:");
1253
1254	if (drm_debug_enabled(DRM_UT_DRIVER))
1255		intel_gt_mcr_report_steering(&p, gt, false);
1256}
1257
1258static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1259			 unsigned int slice, unsigned int subslice)
1260{
1261	__set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1262
1263	gt->default_steering.groupid = slice;
1264	gt->default_steering.instanceid = subslice;
1265
1266	debug_dump_steering(gt);
1267}
1268
1269static void
1270icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1271{
1272	const struct sseu_dev_info *sseu = &gt->info.sseu;
1273	unsigned int subslice;
1274
1275	GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1276	GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1277
1278	/*
1279	 * Although a platform may have subslices, we need to always steer
1280	 * reads to the lowest instance that isn't fused off.  When Render
1281	 * Power Gating is enabled, grabbing forcewake will only power up a
1282	 * single subslice (the "minconfig") if there isn't a real workload
1283	 * that needs to be run; this means that if we steer register reads to
1284	 * one of the higher subslices, we run the risk of reading back 0's or
1285	 * random garbage.
1286	 */
1287	subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1288
1289	/*
1290	 * If the subslice we picked above also steers us to a valid L3 bank,
1291	 * then we can just rely on the default steering and won't need to
1292	 * worry about explicitly re-steering L3BANK reads later.
1293	 */
1294	if (gt->info.l3bank_mask & BIT(subslice))
1295		gt->steering_table[L3BANK] = NULL;
1296
1297	__add_mcr_wa(gt, wal, 0, subslice);
1298}
1299
1300static void
1301xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1302{
1303	const struct sseu_dev_info *sseu = &gt->info.sseu;
1304	unsigned long slice, subslice = 0, slice_mask = 0;
1305	u32 lncf_mask = 0;
1306	int i;
1307
1308	/*
1309	 * On Xe_HP the steering increases in complexity. There are now several
1310	 * more units that require steering and we're not guaranteed to be able
1311	 * to find a common setting for all of them. These are:
1312	 * - GSLICE (fusable)
1313	 * - DSS (sub-unit within gslice; fusable)
1314	 * - L3 Bank (fusable)
1315	 * - MSLICE (fusable)
1316	 * - LNCF (sub-unit within mslice; always present if mslice is present)
1317	 *
1318	 * We'll do our default/implicit steering based on GSLICE (in the
1319	 * sliceid field) and DSS (in the subsliceid field).  If we can
1320	 * find overlap between the valid MSLICE and/or LNCF values with
1321	 * a suitable GSLICE, then we can just re-use the default value and
1322	 * skip and explicit steering at runtime.
1323	 *
1324	 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1325	 * a valid sliceid value.  DSS steering is the only type of steering
1326	 * that utilizes the 'subsliceid' bits.
 
 
 
 
 
 
1327	 *
1328	 * Also note that, even though the steering domain is called "GSlice"
1329	 * and it is encoded in the register using the gslice format, the spec
1330	 * says that the combined (geometry | compute) fuse should be used to
1331	 * select the steering.
1332	 */
1333
1334	/* Find the potential gslice candidates */
1335	slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1336						       GEN_DSS_PER_GSLICE);
1337
1338	/*
1339	 * Find the potential LNCF candidates.  Either LNCF within a valid
1340	 * mslice is fine.
1341	 */
1342	for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
1343		lncf_mask |= (0x3 << (i * 2));
1344
1345	/*
1346	 * Are there any sliceid values that work for both GSLICE and LNCF
1347	 * steering?
1348	 */
1349	if (slice_mask & lncf_mask) {
1350		slice_mask &= lncf_mask;
1351		gt->steering_table[LNCF] = NULL;
 
 
 
 
 
 
1352	}
 
1353
1354	/* How about sliceid values that also work for MSLICE steering? */
1355	if (slice_mask & gt->info.mslice_mask) {
1356		slice_mask &= gt->info.mslice_mask;
1357		gt->steering_table[MSLICE] = NULL;
 
 
1358	}
1359
1360	slice = __ffs(slice_mask);
1361	subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1362		GEN_DSS_PER_GSLICE;
1363
1364	__add_mcr_wa(gt, wal, slice, subslice);
 
1365
1366	/*
1367	 * SQIDI ranges are special because they use different steering
1368	 * registers than everything else we work with.  On XeHP SDV and
1369	 * DG2-G10, any value in the steering registers will work fine since
1370	 * all instances are present, but DG2-G11 only has SQIDI instances at
1371	 * ID's 2 and 3, so we need to steer to one of those.  For simplicity
1372	 * we'll just steer to a hardcoded "2" since that value will work
1373	 * everywhere.
1374	 */
1375	__set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1376	__set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1377
1378	/*
1379	 * On DG2, GAM registers have a dedicated steering control register
1380	 * and must always be programmed to a hardcoded groupid of "1."
1381	 */
1382	if (IS_DG2(gt->i915))
1383		__set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1384}
1385
1386static void
1387icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1388{
1389	struct drm_i915_private *i915 = gt->i915;
1390
1391	icl_wa_init_mcr(gt, wal);
 
 
 
1392
1393	/* WaModifyGamTlbPartitioning:icl */
1394	wa_write_clr_set(wal,
1395			 GEN11_GACB_PERF_CTRL,
1396			 GEN11_HASH_CTRL_MASK,
1397			 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1398
1399	/* Wa_1405766107:icl
1400	 * Formerly known as WaCL2SFHalfMaxAlloc
1401	 */
1402	wa_write_or(wal,
1403		    GEN11_LSN_UNSLCVC,
1404		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1405		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1406
1407	/* Wa_220166154:icl
1408	 * Formerly known as WaDisCtxReload
1409	 */
1410	wa_write_or(wal,
1411		    GEN8_GAMW_ECO_DEV_RW_IA,
1412		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1413
 
 
 
 
 
 
 
 
 
 
 
 
1414	/* Wa_1406463099:icl
1415	 * Formerly known as WaGamTlbPendError
1416	 */
1417	wa_write_or(wal,
1418		    GAMT_CHKN_BIT_REG,
1419		    GAMT_CHKN_DISABLE_L3_COH_PIPE);
1420
1421	/*
1422	 * Wa_1408615072:icl,ehl  (vsunit)
1423	 * Wa_1407596294:icl,ehl  (hsunit)
1424	 */
1425	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1426		    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1427
1428	/* Wa_1407352427:icl,ehl */
1429	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1430		    PSDUNIT_CLKGATE_DIS);
1431
1432	/* Wa_1406680159:icl,ehl */
1433	wa_mcr_write_or(wal,
1434			GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1435			GWUNIT_CLKGATE_DIS);
1436
1437	/* Wa_1607087056:icl,ehl,jsl */
1438	if (IS_ICELAKE(i915) ||
1439		((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) &&
1440		IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)))
1441		wa_write_or(wal,
1442			    GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1443			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1444
1445	/*
1446	 * This is not a documented workaround, but rather an optimization
1447	 * to reduce sampler power.
1448	 */
1449	wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1450}
1451
1452/*
1453 * Though there are per-engine instances of these registers,
1454 * they retain their value through engine resets and should
1455 * only be provided on the GT workaround list rather than
1456 * the engine-specific workaround list.
1457 */
1458static void
1459wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1460{
1461	struct intel_engine_cs *engine;
1462	int id;
1463
1464	for_each_engine(engine, gt, id) {
1465		if (engine->class != VIDEO_DECODE_CLASS ||
1466		    (engine->instance % 2))
1467			continue;
1468
1469		wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1470			    IECPUNIT_CLKGATE_DIS);
1471	}
1472}
1473
1474static void
1475gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1476{
1477	icl_wa_init_mcr(gt, wal);
1478
1479	/* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1480	wa_14011060649(gt, wal);
1481
1482	/* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1483	wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1484
1485	/*
1486	 * Wa_14015795083
1487	 *
1488	 * Firmware on some gen12 platforms locks the MISCCPCTL register,
1489	 * preventing i915 from modifying it for this workaround.  Skip the
1490	 * readback verification for this workaround on debug builds; if the
1491	 * workaround doesn't stick due to firmware behavior, it's not an error
1492	 * that we want CI to flag.
1493	 */
1494	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1495	       0, 0, false);
1496}
1497
1498static void
1499dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1500{
1501	gen12_gt_workarounds_init(gt, wal);
1502
1503	/* Wa_1409420604:dg1 */
1504	wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2,
1505			CPSSUNIT_CLKGATE_DIS);
1506
1507	/* Wa_1408615072:dg1 */
1508	/* Empirical testing shows this register is unaffected by engine reset. */
1509	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL);
1510}
1511
1512static void
1513dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1514{
1515	xehp_init_mcr(gt, wal);
1516
1517	/* Wa_14011060649:dg2 */
1518	wa_14011060649(gt, wal);
1519
1520	if (IS_DG2_G10(gt->i915)) {
1521		/* Wa_22010523718:dg2 */
1522		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1523			    CG3DDISCFEG_CLKGATE_DIS);
1524
1525		/* Wa_14011006942:dg2 */
1526		wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1527				DSS_ROUTER_CLKGATE_DIS);
1528	}
1529
1530	/* Wa_14014830051:dg2 */
1531	wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1532
1533	/*
1534	 * Wa_14015795083
1535	 * Skip verification for possibly locked register.
1536	 */
1537	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1538	       0, 0, false);
1539
1540	/* Wa_18018781329 */
1541	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1542	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1543	wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1544	wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1545
1546	/* Wa_1509235366:dg2 */
1547	wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1548			INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1549
1550	/* Wa_14010648519:dg2 */
1551	wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1552}
1553
1554static void
1555xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1556{
1557	/* Wa_14018575942 / Wa_18018781329 */
1558	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1559	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1560
1561	/* Wa_22016670082 */
1562	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1563
1564	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1565	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
1566		/* Wa_14014830051 */
1567		wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1568
1569		/* Wa_14015795083 */
1570		wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1571	}
1572
1573	/*
1574	 * Unlike older platforms, we no longer setup implicit steering here;
1575	 * all MCR accesses are explicitly steered.
1576	 */
1577	debug_dump_steering(gt);
1578}
1579
1580static void
1581wa_16021867713(struct intel_gt *gt, struct i915_wa_list *wal)
1582{
1583	struct intel_engine_cs *engine;
1584	int id;
1585
1586	for_each_engine(engine, gt, id)
1587		if (engine->class == VIDEO_DECODE_CLASS)
1588			wa_write_or(wal, VDBOX_CGCTL3F1C(engine->mmio_base),
1589				    MFXPIPE_CLKGATE_DIS);
1590}
1591
1592static void
1593xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1594{
1595	wa_16021867713(gt, wal);
1596
1597	/*
1598	 * Wa_14018778641
1599	 * Wa_18018781329
1600	 *
1601	 * Note that although these registers are MCR on the primary
1602	 * GT, the media GT's versions are regular singleton registers.
1603	 */
1604	wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);
1605
1606	/*
1607	 * Wa_14018575942
1608	 *
1609	 * Issue is seen on media KPI test running on VDBOX engine
1610	 * especially VP9 encoding WLs
1611	 */
1612	wa_write_or(wal, XELPMP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1613
1614	/* Wa_22016670082 */
1615	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1616
1617	debug_dump_steering(gt);
1618}
1619
1620/*
1621 * The bspec performance guide has recommended MMIO tuning settings.  These
1622 * aren't truly "workarounds" but we want to program them through the
1623 * workaround infrastructure to make sure they're (re)applied at the proper
1624 * times.
1625 *
1626 * The programming in this function is for settings that persist through
1627 * engine resets and also are not part of any engine's register state context.
1628 * I.e., settings that only need to be re-applied in the event of a full GT
1629 * reset.
1630 */
1631static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal)
1632{
1633	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) {
1634		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1635		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1636	}
1637
1638	if (IS_DG2(gt->i915)) {
1639		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1640		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1641	}
 
1642}
1643
1644static void
1645gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1646{
1647	struct drm_i915_private *i915 = gt->i915;
1648
1649	gt_tuning_settings(gt, wal);
1650
1651	if (gt->type == GT_MEDIA) {
1652		if (MEDIA_VER_FULL(i915) == IP_VER(13, 0))
1653			xelpmp_gt_workarounds_init(gt, wal);
1654		else
1655			MISSING_CASE(MEDIA_VER_FULL(i915));
1656
1657		return;
1658	}
1659
1660	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)))
1661		xelpg_gt_workarounds_init(gt, wal);
1662	else if (IS_DG2(i915))
1663		dg2_gt_workarounds_init(gt, wal);
1664	else if (IS_DG1(i915))
1665		dg1_gt_workarounds_init(gt, wal);
1666	else if (GRAPHICS_VER(i915) == 12)
1667		gen12_gt_workarounds_init(gt, wal);
1668	else if (GRAPHICS_VER(i915) == 11)
1669		icl_gt_workarounds_init(gt, wal);
1670	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1671		cfl_gt_workarounds_init(gt, wal);
1672	else if (IS_GEMINILAKE(i915))
1673		glk_gt_workarounds_init(gt, wal);
1674	else if (IS_KABYLAKE(i915))
1675		kbl_gt_workarounds_init(gt, wal);
1676	else if (IS_BROXTON(i915))
1677		gen9_gt_workarounds_init(gt, wal);
1678	else if (IS_SKYLAKE(i915))
1679		skl_gt_workarounds_init(gt, wal);
1680	else if (IS_HASWELL(i915))
1681		hsw_gt_workarounds_init(gt, wal);
1682	else if (IS_VALLEYVIEW(i915))
1683		vlv_gt_workarounds_init(gt, wal);
1684	else if (IS_IVYBRIDGE(i915))
1685		ivb_gt_workarounds_init(gt, wal);
1686	else if (GRAPHICS_VER(i915) == 6)
1687		snb_gt_workarounds_init(gt, wal);
1688	else if (GRAPHICS_VER(i915) == 5)
1689		ilk_gt_workarounds_init(gt, wal);
1690	else if (IS_G4X(i915))
1691		g4x_gt_workarounds_init(gt, wal);
1692	else if (GRAPHICS_VER(i915) == 4)
1693		gen4_gt_workarounds_init(gt, wal);
1694	else if (GRAPHICS_VER(i915) <= 8)
1695		;
1696	else
1697		MISSING_CASE(GRAPHICS_VER(i915));
1698}
1699
1700void intel_gt_init_workarounds(struct intel_gt *gt)
1701{
1702	struct i915_wa_list *wal = &gt->wa_list;
1703
1704	wa_init_start(wal, gt, "GT", "global");
1705	gt_init_workarounds(gt, wal);
1706	wa_init_finish(wal);
1707}
1708
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1709static bool
1710wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur,
1711	  const char *name, const char *from)
1712{
1713	if ((cur ^ wa->set) & wa->read) {
1714		gt_err(gt,
1715		       "%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1716		       name, from, i915_mmio_reg_offset(wa->reg),
1717		       cur, cur & wa->read, wa->set & wa->read);
1718
1719		return false;
1720	}
1721
1722	return true;
1723}
1724
1725static void wa_list_apply(const struct i915_wa_list *wal)
 
1726{
1727	struct intel_gt *gt = wal->gt;
1728	struct intel_uncore *uncore = gt->uncore;
1729	enum forcewake_domains fw;
1730	unsigned long flags;
1731	struct i915_wa *wa;
1732	unsigned int i;
1733
1734	if (!wal->count)
1735		return;
1736
1737	fw = wal_get_fw_for_rmw(uncore, wal);
1738
1739	intel_gt_mcr_lock(gt, &flags);
1740	spin_lock(&uncore->lock);
1741	intel_uncore_forcewake_get__locked(uncore, fw);
1742
1743	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1744		u32 val, old = 0;
1745
1746		/* open-coded rmw due to steering */
1747		if (wa->clr)
1748			old = wa->is_mcr ?
1749				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1750				intel_uncore_read_fw(uncore, wa->reg);
1751		val = (old & ~wa->clr) | wa->set;
1752		if (val != old || !wa->clr) {
1753			if (wa->is_mcr)
1754				intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1755			else
1756				intel_uncore_write_fw(uncore, wa->reg, val);
1757		}
1758
1759		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1760			u32 val = wa->is_mcr ?
1761				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1762				intel_uncore_read_fw(uncore, wa->reg);
1763
1764			wa_verify(gt, wa, val, wal->name, "application");
1765		}
1766	}
1767
1768	intel_uncore_forcewake_put__locked(uncore, fw);
1769	spin_unlock(&uncore->lock);
1770	intel_gt_mcr_unlock(gt, flags);
1771}
1772
1773void intel_gt_apply_workarounds(struct intel_gt *gt)
1774{
1775	wa_list_apply(&gt->wa_list);
1776}
1777
1778static bool wa_list_verify(struct intel_gt *gt,
1779			   const struct i915_wa_list *wal,
1780			   const char *from)
1781{
1782	struct intel_uncore *uncore = gt->uncore;
1783	struct i915_wa *wa;
1784	enum forcewake_domains fw;
1785	unsigned long flags;
1786	unsigned int i;
1787	bool ok = true;
1788
1789	fw = wal_get_fw_for_rmw(uncore, wal);
1790
1791	intel_gt_mcr_lock(gt, &flags);
1792	spin_lock(&uncore->lock);
1793	intel_uncore_forcewake_get__locked(uncore, fw);
1794
1795	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1796		ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
1797				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1798				intel_uncore_read_fw(uncore, wa->reg),
1799				wal->name, from);
1800
1801	intel_uncore_forcewake_put__locked(uncore, fw);
1802	spin_unlock(&uncore->lock);
1803	intel_gt_mcr_unlock(gt, flags);
1804
1805	return ok;
1806}
1807
1808bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1809{
1810	return wa_list_verify(gt, &gt->wa_list, from);
1811}
1812
1813__maybe_unused
1814static bool is_nonpriv_flags_valid(u32 flags)
1815{
1816	/* Check only valid flag bits are set */
1817	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1818		return false;
1819
1820	/* NB: Only 3 out of 4 enum values are valid for access field */
1821	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1822	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1823		return false;
1824
1825	return true;
1826}
1827
1828static void
1829whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1830{
1831	struct i915_wa wa = {
1832		.reg = reg
1833	};
1834
1835	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1836		return;
1837
1838	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1839		return;
1840
1841	wa.reg.reg |= flags;
1842	_wa_add(wal, &wa);
1843}
1844
1845static void
1846whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1847{
1848	struct i915_wa wa = {
1849		.mcr_reg = reg,
1850		.is_mcr = 1,
1851	};
1852
1853	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1854		return;
1855
1856	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1857		return;
1858
1859	wa.mcr_reg.reg |= flags;
1860	_wa_add(wal, &wa);
1861}
1862
1863static void
1864whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1865{
1866	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1867}
1868
1869static void
1870whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1871{
1872	whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1873}
1874
1875static void gen9_whitelist_build(struct i915_wa_list *w)
1876{
1877	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1878	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1879
1880	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1881	whitelist_reg(w, GEN8_CS_CHICKEN1);
1882
1883	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1884	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1885
1886	/* WaSendPushConstantsFromMMIO:skl,bxt */
1887	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1888}
1889
1890static void skl_whitelist_build(struct intel_engine_cs *engine)
1891{
1892	struct i915_wa_list *w = &engine->whitelist;
1893
1894	if (engine->class != RENDER_CLASS)
1895		return;
1896
1897	gen9_whitelist_build(w);
1898
1899	/* WaDisableLSQCROPERFforOCL:skl */
1900	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1901}
1902
1903static void bxt_whitelist_build(struct intel_engine_cs *engine)
1904{
1905	if (engine->class != RENDER_CLASS)
1906		return;
1907
1908	gen9_whitelist_build(&engine->whitelist);
1909}
1910
1911static void kbl_whitelist_build(struct intel_engine_cs *engine)
1912{
1913	struct i915_wa_list *w = &engine->whitelist;
1914
1915	if (engine->class != RENDER_CLASS)
1916		return;
1917
1918	gen9_whitelist_build(w);
1919
1920	/* WaDisableLSQCROPERFforOCL:kbl */
1921	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1922}
1923
1924static void glk_whitelist_build(struct intel_engine_cs *engine)
1925{
1926	struct i915_wa_list *w = &engine->whitelist;
1927
1928	if (engine->class != RENDER_CLASS)
1929		return;
1930
1931	gen9_whitelist_build(w);
1932
1933	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1934	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1935}
1936
1937static void cfl_whitelist_build(struct intel_engine_cs *engine)
1938{
1939	struct i915_wa_list *w = &engine->whitelist;
1940
1941	if (engine->class != RENDER_CLASS)
1942		return;
1943
1944	gen9_whitelist_build(w);
1945
1946	/*
1947	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1948	 *
1949	 * This covers 4 register which are next to one another :
1950	 *   - PS_INVOCATION_COUNT
1951	 *   - PS_INVOCATION_COUNT_UDW
1952	 *   - PS_DEPTH_COUNT
1953	 *   - PS_DEPTH_COUNT_UDW
1954	 */
1955	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1956			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1957			  RING_FORCE_TO_NONPRIV_RANGE_4);
1958}
1959
1960static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
1961{
1962	struct i915_wa_list *w = &engine->whitelist;
1963
1964	if (engine->class != RENDER_CLASS)
1965		whitelist_reg_ext(w,
1966				  RING_CTX_TIMESTAMP(engine->mmio_base),
1967				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
 
 
1968}
1969
/*
 * Comet Lake whitelist: the context timestamp entry first, then everything
 * Coffee Lake whitelists.
 */
static void cml_whitelist_build(struct intel_engine_cs *engine)
{
	/* Read-only context timestamp for the non-render engines. */
	allow_read_ctx_timestamp(engine);

	/* The rest is shared with Coffee Lake. */
	cfl_whitelist_build(engine);
}
1976
1977static void icl_whitelist_build(struct intel_engine_cs *engine)
1978{
1979	struct i915_wa_list *w = &engine->whitelist;
1980
1981	allow_read_ctx_timestamp(engine);
1982
1983	switch (engine->class) {
1984	case RENDER_CLASS:
1985		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
1986		whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
1987
1988		/* WaAllowUMDToModifySamplerMode:icl */
1989		whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
1990
1991		/* WaEnableStateCacheRedirectToCS:icl */
1992		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1993
1994		/*
1995		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
1996		 *
1997		 * This covers 4 register which are next to one another :
1998		 *   - PS_INVOCATION_COUNT
1999		 *   - PS_INVOCATION_COUNT_UDW
2000		 *   - PS_DEPTH_COUNT
2001		 *   - PS_DEPTH_COUNT_UDW
2002		 */
2003		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2004				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2005				  RING_FORCE_TO_NONPRIV_RANGE_4);
2006		break;
2007
2008	case VIDEO_DECODE_CLASS:
2009		/* hucStatusRegOffset */
2010		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
2011				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2012		/* hucUKernelHdrInfoRegOffset */
2013		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2014				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2015		/* hucStatus2RegOffset */
2016		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2017				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
 
 
 
2018		break;
2019
2020	default:
 
 
 
2021		break;
2022	}
2023}
2024
2025static void tgl_whitelist_build(struct intel_engine_cs *engine)
2026{
2027	struct i915_wa_list *w = &engine->whitelist;
2028
2029	allow_read_ctx_timestamp(engine);
2030
2031	switch (engine->class) {
2032	case RENDER_CLASS:
2033		/*
2034		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2035		 * Wa_1408556865:tgl
2036		 *
2037		 * This covers 4 registers which are next to one another :
2038		 *   - PS_INVOCATION_COUNT
2039		 *   - PS_INVOCATION_COUNT_UDW
2040		 *   - PS_DEPTH_COUNT
2041		 *   - PS_DEPTH_COUNT_UDW
2042		 */
2043		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2044				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2045				  RING_FORCE_TO_NONPRIV_RANGE_4);
2046
2047		/*
2048		 * Wa_1808121037:tgl
2049		 * Wa_14012131227:dg1
2050		 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2051		 */
2052		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2053
2054		/* Wa_1806527549:tgl */
2055		whitelist_reg(w, HIZ_CHICKEN);
2056
2057		/* Required by recommended tuning setting (not a workaround) */
2058		whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3);
2059
2060		break;
2061	default:
2062		break;
2063	}
2064}
2065
2066static void dg2_whitelist_build(struct intel_engine_cs *engine)
2067{
2068	struct i915_wa_list *w = &engine->whitelist;
2069
2070	switch (engine->class) {
2071	case RENDER_CLASS:
2072		/* Required by recommended tuning setting (not a workaround) */
2073		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2074		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2075		break;
2076	default:
2077		break;
2078	}
2079}
2080
2081static void xelpg_whitelist_build(struct intel_engine_cs *engine)
2082{
2083	struct i915_wa_list *w = &engine->whitelist;
2084
2085	switch (engine->class) {
2086	case RENDER_CLASS:
2087		/* Required by recommended tuning setting (not a workaround) */
2088		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2089		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2090		break;
2091	default:
 
 
 
2092		break;
2093	}
2094}
2095
2096void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2097{
2098	struct drm_i915_private *i915 = engine->i915;
2099	struct i915_wa_list *w = &engine->whitelist;
2100
2101	wa_init_start(w, engine->gt, "whitelist", engine->name);
2102
2103	if (engine->gt->type == GT_MEDIA)
2104		; /* none yet */
2105	else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
2106		xelpg_whitelist_build(engine);
2107	else if (IS_DG2(i915))
2108		dg2_whitelist_build(engine);
2109	else if (GRAPHICS_VER(i915) == 12)
2110		tgl_whitelist_build(engine);
2111	else if (GRAPHICS_VER(i915) == 11)
2112		icl_whitelist_build(engine);
 
 
2113	else if (IS_COMETLAKE(i915))
2114		cml_whitelist_build(engine);
2115	else if (IS_COFFEELAKE(i915))
2116		cfl_whitelist_build(engine);
2117	else if (IS_GEMINILAKE(i915))
2118		glk_whitelist_build(engine);
2119	else if (IS_KABYLAKE(i915))
2120		kbl_whitelist_build(engine);
2121	else if (IS_BROXTON(i915))
2122		bxt_whitelist_build(engine);
2123	else if (IS_SKYLAKE(i915))
2124		skl_whitelist_build(engine);
2125	else if (GRAPHICS_VER(i915) <= 8)
2126		;
2127	else
2128		MISSING_CASE(GRAPHICS_VER(i915));
2129
2130	wa_init_finish(w);
2131}
2132
2133void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2134{
2135	const struct i915_wa_list *wal = &engine->whitelist;
2136	struct intel_uncore *uncore = engine->uncore;
2137	const u32 base = engine->mmio_base;
2138	struct i915_wa *wa;
2139	unsigned int i;
2140
2141	if (!wal->count)
2142		return;
2143
2144	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2145		intel_uncore_write(uncore,
2146				   RING_FORCE_TO_NONPRIV(base, i),
2147				   i915_mmio_reg_offset(wa->reg));
2148
2149	/* And clear the rest just in case of garbage */
2150	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2151		intel_uncore_write(uncore,
2152				   RING_FORCE_TO_NONPRIV(base, i),
2153				   i915_mmio_reg_offset(RING_NOPID(base)));
2154}
2155
2156/*
2157 * engine_fake_wa_init(), a place holder to program the registers
2158 * which are not part of an official workaround defined by the
2159 * hardware team.
2160 * Adding programming of those register inside workaround will
2161 * allow utilizing wa framework to proper application and verification.
2162 */
2163static void
2164engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2165{
2166	u8 mocs_w, mocs_r;
2167
2168	/*
2169	 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2170	 * by the command streamer when executing commands that don't have
2171	 * a way to explicitly specify a MOCS setting.  The default should
2172	 * usually reference whichever MOCS entry corresponds to uncached
2173	 * behavior, although use of a WB cached entry is recommended by the
2174	 * spec in certain circumstances on specific platforms.
2175	 */
2176	if (GRAPHICS_VER(engine->i915) >= 12) {
2177		mocs_r = engine->gt->mocs.uc_index;
2178		mocs_w = engine->gt->mocs.uc_index;
2179
2180		if (HAS_L3_CCS_READ(engine->i915) &&
2181		    engine->class == COMPUTE_CLASS) {
2182			mocs_r = engine->gt->mocs.wb_index;
2183
2184			/*
2185			 * Even on the few platforms where MOCS 0 is a
2186			 * legitimate table entry, it's never the correct
2187			 * setting to use here; we can assume the MOCS init
2188			 * just forgot to initialize wb_index.
2189			 */
2190			drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2191		}
2192
2193		wa_masked_field_set(wal,
2194				    RING_CMD_CCTL(engine->mmio_base),
2195				    CMD_CCTL_MOCS_MASK,
2196				    CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2197	}
2198}
2199
/*
 * Record the render engine workarounds and settings for the running platform
 * in @wal.
 *
 * The function is a flat sequence of independent blocks, each gated by a
 * platform, graphics version, IP version or stepping check, running from the
 * newest platforms at the top to the oldest at the bottom. Every block only
 * calls the wa_*() helpers to describe register programming in @wal.
 */
static void
rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = engine->i915;
	struct intel_gt *gt = engine->gt;

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
		/* Wa_22014600077 */
		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
				 ENABLE_EU_COUNT_FOR_TDL_FLUSH);
	}

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
	    IS_DG2(i915)) {
		/* Wa_1509727124 */
		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
				 SC_DISABLE_POWER_OPTIMIZATION_EBB);
	}

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_DG2(i915)) {
		/* Wa_22012856258 */
		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
				 GEN12_DISABLE_READ_SUPPRESSION);
	}

	if (IS_DG2(i915)) {
		/*
		 * Wa_22010960976:dg2
		 * Wa_14013347512:dg2
		 */
		wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
				  LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
	}

	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) ||
	    IS_DG2(i915)) {
		/* Wa_14015150844 */
		wa_mcr_add(wal, XEHP_HDC_CHICKEN0, 0,
			   _MASKED_BIT_ENABLE(DIS_ATOMIC_CHAINING_TYPED_WRITES),
			   0, true);
	}

	if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
	    IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
		/*
		 * Wa_1606700617:tgl,dg1,adl-p
		 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
		 * Wa_14010826681:tgl,dg1,rkl,adl-p
		 * Wa_18019627453:dg2
		 */
		wa_masked_en(wal,
			     GEN9_CS_DEBUG_MODE1,
			     FF_DOP_CLOCK_GATE_DISABLE);
	}

	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
		/* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);

		/*
		 * Wa_1407928979:tgl A*
		 * Wa_18011464164:tgl[B0+],dg1[B0+]
		 * Wa_22010931296:tgl[B0+],dg1[B0+]
		 * Wa_14010919138:rkl,dg1,adl-s,adl-p
		 */
		wa_write_or(wal, GEN7_FF_THREAD_MODE,
			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);

		/* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
		wa_mcr_masked_en(wal,
				 GEN10_SAMPLER_MODE,
				 ENABLE_SMALLPL);
	}

	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
		/* Wa_1409804808 */
		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
				 GEN12_PUSH_CONST_DEREF_HOLD_DIS);

		/* Wa_14010229206 */
		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
	}

	if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
		/*
		 * Wa_1607297627
		 *
		 * On TGL and RKL there are multiple entries for this WA in the
		 * BSpec; some indicate this is an A0-only WA, others indicate
		 * it applies to all steppings so we trust the "all steppings."
		 */
		wa_masked_en(wal,
			     RING_PSMI_CTL(RENDER_RING_BASE),
			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
	}

	if (IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) {
		/*
		 * "Disable Repacking for Compression (masked R/W access)
		 *  before rendering compressed surfaces for display."
		 */
		wa_masked_en(wal, CACHE_MODE_0_GEN7,
			     DISABLE_REPACKING_FOR_COMPRESSION);
	}

	if (GRAPHICS_VER(i915) == 11) {
		/* This is not a Wa. Enable for better image quality */
		wa_masked_en(wal,
			     _3D_CHICKEN3,
			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);

		/*
		 * Wa_1405543622:icl
		 * Formerly known as WaGAPZPriorityScheme
		 */
		wa_write_or(wal,
			    GEN8_GARBCNTL,
			    GEN11_ARBITRATION_PRIO_ORDER_MASK);

		/*
		 * Wa_1604223664:icl
		 * Formerly known as WaL3BankAddressHashing
		 */
		wa_write_clr_set(wal,
				 GEN8_GARBCNTL,
				 GEN11_HASH_CTRL_EXCL_MASK,
				 GEN11_HASH_CTRL_EXCL_BIT0);
		wa_write_clr_set(wal,
				 GEN11_GLBLINVL,
				 GEN11_BANK_HASH_ADDR_EXCL_MASK,
				 GEN11_BANK_HASH_ADDR_EXCL_BIT0);

		/*
		 * Wa_1405733216:icl
		 * Formerly known as WaDisableCleanEvicts
		 */
		wa_mcr_write_or(wal,
				GEN8_L3SQCREG4,
				GEN11_LQSC_CLEAN_EVICT_DISABLE);

		/* Wa_1606682166:icl */
		wa_write_or(wal,
			    GEN7_SARCHKMD,
			    GEN7_DISABLE_SAMPLER_PREFETCH);

		/* Wa_1409178092:icl */
		wa_mcr_write_clr_set(wal,
				     GEN11_SCRATCH2,
				     GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
				     0);

		/* WaEnable32PlaneMode:icl */
		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
			     GEN11_ENABLE_32_PLANE_MODE);

		/*
		 * Wa_1408767742:icl[a2..forever],ehl[all]
		 * Wa_1605460711:icl[a0..c0]
		 */
		wa_write_or(wal,
			    GEN7_FF_THREAD_MODE,
			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);

		/* Wa_22010271021 */
		wa_masked_en(wal,
			     GEN9_CS_DEBUG_MODE1,
			     FF_DOP_CLOCK_GATE_DISABLE);
	}

	/*
	 * Intel platforms that support fine-grained preemption (i.e., gen9 and
	 * beyond) allow the kernel-mode driver to choose between two different
	 * options for controlling preemption granularity and behavior.
	 *
	 * Option 1 (hardware default):
	 *   Preemption settings are controlled in a global manner via
	 *   kernel-only register CS_DEBUG_MODE1 (0x20EC).  Any granularity
	 *   and settings chosen by the kernel-mode driver will apply to all
	 *   userspace clients.
	 *
	 * Option 2:
	 *   Preemption settings are controlled on a per-context basis via
	 *   register CS_CHICKEN1 (0x2580).  CS_CHICKEN1 is saved/restored on
	 *   context switch and is writable by userspace (e.g., via
	 *   MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
	 *   which allows different userspace drivers/clients to select
	 *   different settings, or to change those settings on the fly in
	 *   response to runtime needs.  This option was known by name
	 *   "FtrPerCtxtPreemptionGranularityControl" at one time, although
	 *   that name is somewhat misleading as other non-granularity
	 *   preemption settings are also impacted by this decision.
	 *
	 * On Linux, our policy has always been to let userspace drivers
	 * control preemption granularity/settings (Option 2).  This was
	 * originally mandatory on gen9 to prevent ABI breakage (old gen9
	 * userspace developed before object-level preemption was enabled would
	 * not behave well if i915 were to go with Option 1 and enable that
	 * preemption in a global manner).  On gen9 each context would have
	 * object-level preemption disabled by default (see
	 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
	 * userspace drivers could opt-in to object-level preemption as they
	 * saw fit.  For post-gen9 platforms, we continue to utilize Option 2;
	 * even though it is no longer necessary for ABI compatibility when
	 * enabling a new platform, it does ensure that userspace will be able
	 * to implement any workarounds that show up requiring temporary
	 * adjustments to preemption behavior at runtime.
	 *
	 * Notes/Workarounds:
	 *  - Wa_14015141709:  On DG2 and early steppings of MTL,
	 *      CS_CHICKEN1[0] does not disable object-level preemption as
	 *      it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
	 *      using Option 1).  Effectively this means userspace is unable
	 *      to disable object-level preemption on these platforms/steppings
	 *      despite the setting here.
	 *
	 *  - Wa_16013994831:  May require that userspace program
	 *      CS_CHICKEN1[10] when certain runtime conditions are true.
	 *      Userspace requires Option 2 to be in effect for their update of
	 *      CS_CHICKEN1[10] to be effective.
	 *
	 * Other workarounds may appear in the future that will also require
	 * Option 2 behavior to allow proper userspace implementation.
	 */
	if (GRAPHICS_VER(i915) >= 9)
		wa_masked_en(wal,
			     GEN7_FF_SLICE_CS_CHICKEN1,
			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);

	if (IS_SKYLAKE(i915) ||
	    IS_KABYLAKE(i915) ||
	    IS_COFFEELAKE(i915) ||
	    IS_COMETLAKE(i915)) {
		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
		wa_write_or(wal,
			    GEN8_GARBCNTL,
			    GEN9_GAPS_TSV_CREDIT_DISABLE);
	}

	if (IS_BROXTON(i915)) {
		/* WaDisablePooledEuLoadBalancingFix:bxt */
		wa_masked_en(wal,
			     FF_SLICE_CS_CHICKEN2,
			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
	}

	if (GRAPHICS_VER(i915) == 9) {
		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
		wa_masked_en(wal,
			     GEN9_CSFE_CHICKEN1_RCS,
			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);

		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
		wa_mcr_write_or(wal,
				BDW_SCRATCH1,
				GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);

		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
		if (IS_GEN9_LP(i915))
			wa_mcr_write_clr_set(wal,
					     GEN8_L3SQCREG1,
					     L3_PRIO_CREDITS_MASK,
					     L3_GENERAL_PRIO_CREDITS(62) |
					     L3_HIGH_PRIO_CREDITS(2));

		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
		wa_mcr_write_or(wal,
				GEN8_L3SQCREG4,
				GEN8_LQSC_FLUSH_COHERENT_LINES);

		/* Disable atomics in L3 to prevent unrecoverable hangs */
		wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
				 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
		wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
				     GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
		wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
				     EVICTION_PERF_FIX_ENABLE, 0);
	}

	if (IS_HASWELL(i915)) {
		/* WaSampleCChickenBitEnable:hsw */
		wa_masked_en(wal,
			     HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);

		wa_masked_dis(wal,
			      CACHE_MODE_0_GEN7,
			      /* enable HiZ Raw Stall Optimization */
			      HIZ_RAW_STALL_OPT_DISABLE);
	}

	if (IS_VALLEYVIEW(i915)) {
		/* WaDisableEarlyCull:vlv */
		wa_masked_en(wal,
			     _3D_CHICKEN3,
			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);

		/*
		 * WaVSThreadDispatchOverride:ivb,vlv
		 *
		 * This actually overrides the dispatch
		 * mode for all thread types.
		 */
		wa_write_clr_set(wal,
				 GEN7_FF_THREAD_MODE,
				 GEN7_FF_SCHED_MASK,
				 GEN7_FF_TS_SCHED_HW |
				 GEN7_FF_VS_SCHED_HW |
				 GEN7_FF_DS_SCHED_HW);

		/* WaPsdDispatchEnable:vlv */
		/* WaDisablePSDDualDispatchEnable:vlv */
		wa_masked_en(wal,
			     GEN7_HALF_SLICE_CHICKEN1,
			     GEN7_MAX_PS_THREAD_DEP |
			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
	}

	if (IS_IVYBRIDGE(i915)) {
		/* WaDisableEarlyCull:ivb */
		wa_masked_en(wal,
			     _3D_CHICKEN3,
			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);

		/* Deliberately compiled out; kept as a record of the setting. */
		if (0) { /* causes HiZ corruption on ivb:gt1 */
			/* enable HiZ Raw Stall Optimization */
			wa_masked_dis(wal,
				      CACHE_MODE_0_GEN7,
				      HIZ_RAW_STALL_OPT_DISABLE);
		}

		/*
		 * WaVSThreadDispatchOverride:ivb,vlv
		 *
		 * This actually overrides the dispatch
		 * mode for all thread types.
		 */
		wa_write_clr_set(wal,
				 GEN7_FF_THREAD_MODE,
				 GEN7_FF_SCHED_MASK,
				 GEN7_FF_TS_SCHED_HW |
				 GEN7_FF_VS_SCHED_HW |
				 GEN7_FF_DS_SCHED_HW);

		/* WaDisablePSDDualDispatchEnable:ivb */
		if (INTEL_INFO(i915)->gt == 1)
			wa_masked_en(wal,
				     GEN7_HALF_SLICE_CHICKEN1,
				     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
	}

	if (GRAPHICS_VER(i915) == 7) {
		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
		wa_masked_en(wal,
			     RING_MODE_GEN7(RENDER_RING_BASE),
			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);

		/* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
		wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);

		/*
		 * WaDisable4x2SubspanOptimization:ivb,hsw
		 *
		 * BSpec says this must be set, even though
		 * WaDisable4x2SubspanOptimization isn't listed for VLV.
		 */
		wa_masked_en(wal,
			     CACHE_MODE_1,
			     PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);

		/*
		 * BSpec recommends 8x4 when MSAA is used,
		 * however in practice 16x4 seems fastest.
		 *
		 * Note that PS/WM thread counts depend on the WIZ hashing
		 * disable bit, which we don't touch here, but it's good
		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
		 */
		wa_masked_field_set(wal,
				    GEN7_GT_MODE,
				    GEN6_WIZ_HASHING_MASK,
				    GEN6_WIZ_HASHING_16x4);
	}

	if (IS_GRAPHICS_VER(i915, 6, 7))
		/*
		 * We need to disable the AsyncFlip performance optimisations in
		 * order to use MI_WAIT_FOR_EVENT within the CS. It should
		 * already be programmed to '1' on all products.
		 *
		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
		 */
		wa_masked_en(wal,
			     RING_MI_MODE(RENDER_RING_BASE),
			     ASYNC_FLIP_PERF_DISABLE);

	if (GRAPHICS_VER(i915) == 6) {
		/*
		 * Required for the hardware to program scanline values for
		 * waiting
		 * WaEnableFlushTlbInvalidationMode:snb
		 */
		wa_masked_en(wal,
			     GFX_MODE,
			     GFX_TLB_INVALIDATE_EXPLICIT);

		/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
		wa_masked_en(wal,
			     _3D_CHICKEN,
			     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);

		wa_masked_en(wal,
			     _3D_CHICKEN3,
			     /* WaStripsFansDisableFastClipPerformanceFix:snb */
			     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
			     /*
			      * Bspec says:
			      * "This bit must be set if 3DSTATE_CLIP clip mode is set
			      * to normal and 3DSTATE_SF number of SF output attributes
			      * is more than 16."
			      */
			     _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);

		/*
		 * BSpec recommends 8x4 when MSAA is used,
		 * however in practice 16x4 seems fastest.
		 *
		 * Note that PS/WM thread counts depend on the WIZ hashing
		 * disable bit, which we don't touch here, but it's good
		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
		 */
		wa_masked_field_set(wal,
				    GEN6_GT_MODE,
				    GEN6_WIZ_HASHING_MASK,
				    GEN6_WIZ_HASHING_16x4);

		/* WaDisable_RenderCache_OperationalFlush:snb */
		wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);

		/*
		 * From the Sandybridge PRM, volume 1 part 3, page 24:
		 * "If this bit is set, STCunit will have LRA as replacement
		 *  policy. [...] This bit must be reset. LRA replacement
		 *  policy is not supported."
		 */
		wa_masked_dis(wal,
			      CACHE_MODE_0,
			      CM0_STC_EVICT_DISABLE_LRA_SNB);
	}

	if (IS_GRAPHICS_VER(i915, 4, 6))
		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
		wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
		       /* XXX bit doesn't stick on Broadwater */
		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);

	if (GRAPHICS_VER(i915) == 4)
		/*
		 * Disable CONSTANT_BUFFER before it is loaded from the context
		 * image. For as it is loaded, it is executed and the stored
		 * address may no longer be valid, leading to a GPU hang.
		 *
		 * This imposes the requirement that userspace reload their
		 * CONSTANT_BUFFER on every batch, fortunately a requirement
		 * they are already accustomed to from before contexts were
		 * enabled.
		 */
		wa_add(wal, ECOSKPD(RENDER_RING_BASE),
		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
		       0 /* XXX bit doesn't stick on Broadwater */,
		       true);
}
2676
2677static void
2678xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2679{
2680	struct drm_i915_private *i915 = engine->i915;
2681
2682	/* WaKBLVECSSemaphoreWaitPoll:kbl */
2683	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2684		wa_write(wal,
2685			 RING_SEMA_WAIT_POLL(engine->mmio_base),
2686			 1);
2687	}
2688	/* Wa_16018031267, Wa_16018063123 */
2689	if (NEEDS_FASTCOLOR_BLT_WABB(engine))
2690		wa_masked_field_set(wal, ECOSKPD(engine->mmio_base),
2691				    XEHP_BLITTER_SCHEDULING_MODE_MASK,
2692				    XEHP_BLITTER_ROUND_ROBIN_MODE);
2693}
2694
/*
 * Placeholder for workarounds specific to compute (CCS) engines.
 * Currently adds nothing to @wal.
 */
static void
ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
{
	/* boilerplate for any CCS engine workaround */
}
2700
2701/*
2702 * The bspec performance guide has recommended MMIO tuning settings.  These
2703 * aren't truly "workarounds" but we want to program them with the same
2704 * workaround infrastructure to ensure that they're automatically added to
2705 * the GuC save/restore lists, re-applied at the right times, and checked for
2706 * any conflicting programming requested by real workarounds.
2707 *
2708 * Programming settings should be added here only if their registers are not
2709 * part of an engine's register state context.  If a register is part of a
2710 * context, then any tuning settings should be programmed in an appropriate
2711 * function invoked by __intel_engine_init_ctx_wa().
2712 */
2713static void
2714add_render_compute_tuning_settings(struct intel_gt *gt,
2715				   struct i915_wa_list *wal)
2716{
2717	struct drm_i915_private *i915 = gt->i915;
2718
2719	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
2720		wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2721
2722	/*
2723	 * This tuning setting proves beneficial only on ATS-M designs; the
2724	 * default "age based" setting is optimal on regular DG2 and other
2725	 * platforms.
2726	 */
2727	if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2728		wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2729					THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2730
2731	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 55))
2732		wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
 
 
2733}
2734
2735static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2736{
2737	struct intel_gt *gt = engine->gt;
2738	u32 mode;
2739
2740	if (!IS_DG2(gt->i915))
2741		return;
2742
2743	/*
2744	 * Wa_14019159160: This workaround, along with others, leads to
2745	 * significant challenges in utilizing load balancing among the
2746	 * CCS slices. Consequently, an architectural decision has been
2747	 * made to completely disable automatic CCS load balancing.
2748	 */
2749	wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE);
2750
2751	/*
2752	 * After having disabled automatic load balancing we need to
2753	 * assign all slices to a single CCS. We will call it CCS mode 1
2754	 */
2755	mode = intel_gt_apply_ccs_mode(gt);
2756	wa_masked_en(wal, XEHP_CCS_MODE, mode);
2757}
2758
2759/*
2760 * The workarounds in this function apply to shared registers in
2761 * the general render reset domain that aren't tied to a
2762 * specific engine.  Since all render+compute engines get reset
2763 * together, and the contents of these registers are lost during
2764 * the shared render domain reset, we'll define such workarounds
2765 * here and then add them to just a single RCS or CCS engine's
2766 * workaround list (whichever engine has the
 * I915_ENGINE_FIRST_RENDER_COMPUTE flag, as tested by the caller
 * engine_init_workarounds()).
 *
 * @engine: engine whose device (engine->i915) and GT (engine->gt) are
 *          used for the platform / IP-version / stepping checks
 * @wal: workaround list that receives the entries
2767 */
2768static void
2769general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2770{
2771	struct drm_i915_private *i915 = engine->i915;
2772	struct intel_gt *gt = engine->gt;
2773
	/* Unconditional tuning settings first; everything below is gated. */
2774	add_render_compute_tuning_settings(gt, wal);
2775
2776	if (GRAPHICS_VER(i915) >= 11) {
2777		/* This is not a Wa (although referred to as
2778		 * WaSetInidrectStateOverride in places), this allows
2779		 * applications that reference sampler states through
2780		 * the BindlessSamplerStateBaseAddress to have their
2781		 * border color relative to DynamicStateBaseAddress
2782		 * rather than BindlessSamplerStateBaseAddress.
2783		 *
2784		 * Otherwise SAMPLER_STATE border colors have to be
2785		 * copied in multiple heaps (DynamicStateBaseAddress &
2786		 * BindlessSamplerStateBaseAddress)
2787		 *
2788		 * BSpec: 46052
2789		 */
2790		wa_mcr_masked_en(wal,
2791				 GEN10_SAMPLER_MODE,
2792				 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
2793	}
2794
	/* IP 12.70 / 12.71 from stepping B0 onwards, and all of IP 12.74. */
2795	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
2796	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) ||
2797	    IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74))) {
2798		/* Wa_14017856879 */
2799		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH);
2800
2801		/* Wa_14020495402 */
2802		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, XELPG_DISABLE_TDL_SVHS_GATING);
2803	}
2804
	/* The remaining IP 12.70 / 12.71 checks use the STEP_A0..STEP_B0 range. */
2805	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2806	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2807		/*
2808		 * Wa_14017066071
2809		 * Wa_14017654203
2810		 */
2811		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2812				 MTL_DISABLE_SAMPLER_SC_OOO);
2813
2814	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2815		/* Wa_22015279794 */
2816		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2817				 DISABLE_PREFETCH_INTO_IC);
2818
2819	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2820	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2821	    IS_DG2(i915)) {
2822		/* Wa_22013037850 */
2823		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2824				DISABLE_128B_EVICTION_COMMAND_UDW);
2825
2826		/* Wa_18017747507 */
2827		wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2828	}
2829
2830	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2831	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2832	    IS_DG2(i915)) {
2833		/* Wa_22014226127 */
2834		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2835	}
2836
2837	if (IS_DG2(i915)) {
2838		/* Wa_14015227452:dg2,pvc */
2839		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2840
2841		/*
2842		 * Wa_16011620976:dg2_g11
2843		 * Wa_22015475538:dg2
2844		 */
2845		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2846
2847		/* Wa_18028616096 */
2848		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, UGM_FRAGMENT_THRESHOLD_TO_3);
2849	}
2850
2851	if (IS_DG2_G11(i915)) {
2852		/*
2853		 * Wa_22012826095:dg2
2854		 * Wa_22013059131:dg2
2855		 */
2856		wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2857				     MAXREQS_PER_BANK,
2858				     REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2859
2860		/* Wa_22013059131:dg2 */
2861		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2862				FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2863
2864		/*
2865		 * Wa_22012654132
2866		 *
2867		 * Note that register 0xE420 is write-only and cannot be read
2868		 * back for verification on DG2 (due to Wa_14012342262), so
2869		 * we need to explicitly skip the readback.
2870		 */
2871		wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2872			   _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2873			   0 /* write-only, so skip validation */,
2874			   true);
2875	}
2876}
2877
2878static void
2879engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2880{
2881	if (GRAPHICS_VER(engine->i915) < 4)
2882		return;
 
 
2883
2884	engine_fake_wa_init(engine, wal);
 
 
 
2885
2886	/*
2887	 * These are common workarounds that just need to applied
2888	 * to a single RCS/CCS engine's workaround list since
2889	 * they're reset as part of the general render domain reset.
2890	 */
2891	if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) {
2892		general_render_compute_wa_init(engine, wal);
2893		ccs_engine_wa_mode(engine, wal);
2894	}
2895
2896	if (engine->class == COMPUTE_CLASS)
2897		ccs_engine_wa_init(engine, wal);
2898	else if (engine->class == RENDER_CLASS)
2899		rcs_engine_wa_init(engine, wal);
2900	else
2901		xcs_engine_wa_init(engine, wal);
2902}
2903
2904void intel_engine_init_workarounds(struct intel_engine_cs *engine)
2905{
2906	struct i915_wa_list *wal = &engine->wa_list;
 
2907
2908	wa_init_start(wal, engine->gt, "engine", engine->name);
2909	engine_init_workarounds(engine, wal);
2910	wa_init_finish(wal);
2911}
2912
2913void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
2914{
2915	wa_list_apply(&engine->wa_list);
2916}
2917
/*
 * Register offset ranges that mcr_range() consults when the graphics
 * version is at least 8 but below 12. mcr_range() treats both .start and
 * .end as inclusive and stops at the first entry whose .start is zero,
 * so the trailing empty initializer is the terminator.
 */
2918static const struct i915_range mcr_ranges_gen8[] = {
 
 
 
2919	{ .start = 0x5500, .end = 0x55ff },
2920	{ .start = 0x7000, .end = 0x7fff },
2921	{ .start = 0x9400, .end = 0x97ff },
2922	{ .start = 0xb000, .end = 0xb3ff },
2923	{ .start = 0xe000, .end = 0xe7ff },
2924	{},
2925};
2926
/*
 * Register offset ranges that mcr_range() consults when the graphics
 * version is at least 12 but the full IP version is below 12.55.
 * Bounds are inclusive; the trailing empty initializer terminates the
 * table (mcr_range() stops at the first zero .start).
 */
2927static const struct i915_range mcr_ranges_gen12[] = {
2928	{ .start =  0x8150, .end =  0x815f },
2929	{ .start =  0x9520, .end =  0x955f },
2930	{ .start =  0xb100, .end =  0xb3ff },
2931	{ .start =  0xde80, .end =  0xe8ff },
2932	{ .start = 0x24a00, .end = 0x24a7f },
2933	{},
2934};
2935
/*
 * Register offset ranges that mcr_range() consults when the full
 * graphics IP version is 12.55 or later. Bounds are inclusive; the
 * trailing empty initializer terminates the table (mcr_range() stops at
 * the first zero .start).
 */
2936static const struct i915_range mcr_ranges_xehp[] = {
2937	{ .start =  0x4000, .end =  0x4aff },
2938	{ .start =  0x5200, .end =  0x52ff },
2939	{ .start =  0x5400, .end =  0x7fff },
2940	{ .start =  0x8140, .end =  0x815f },
2941	{ .start =  0x8c80, .end =  0x8dff },
2942	{ .start =  0x94d0, .end =  0x955f },
2943	{ .start =  0x9680, .end =  0x96ff },
2944	{ .start =  0xb000, .end =  0xb3ff },
2945	{ .start =  0xc800, .end =  0xcfff },
2946	{ .start =  0xd800, .end =  0xd8ff },
2947	{ .start =  0xdc00, .end =  0xffff },
2948	{ .start = 0x17000, .end = 0x17fff },
2949	{ .start = 0x24a00, .end = 0x24a7f },
2950	{},
2951};
2952
2953static bool mcr_range(struct drm_i915_private *i915, u32 offset)
2954{
2955	const struct i915_range *mcr_ranges;
2956	int i;
2957
2958	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55))
2959		mcr_ranges = mcr_ranges_xehp;
2960	else if (GRAPHICS_VER(i915) >= 12)
2961		mcr_ranges = mcr_ranges_gen12;
2962	else if (GRAPHICS_VER(i915) >= 8)
2963		mcr_ranges = mcr_ranges_gen8;
2964	else
2965		return false;
2966
2967	/*
2968	 * Registers in these ranges are affected by the MCR selector
2969	 * which only controls CPU initiated MMIO. Routing does not
2970	 * work for CS access so we cannot verify them on this path.
2971	 */
2972	for (i = 0; mcr_ranges[i].start; i++)
2973		if (offset >= mcr_ranges[i].start &&
2974		    offset <= mcr_ranges[i].end)
2975			return true;
2976
2977	return false;
2978}
2979
2980static int
2981wa_list_srm(struct i915_request *rq,
2982	    const struct i915_wa_list *wal,
2983	    struct i915_vma *vma)
2984{
2985	struct drm_i915_private *i915 = rq->i915;
2986	unsigned int i, count = 0;
2987	const struct i915_wa *wa;
2988	u32 srm, *cs;
2989
2990	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
2991	if (GRAPHICS_VER(i915) >= 8)
2992		srm++;
2993
2994	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2995		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
2996			count++;
2997	}
2998
2999	cs = intel_ring_begin(rq, 4 * count);
3000	if (IS_ERR(cs))
3001		return PTR_ERR(cs);
3002
3003	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3004		u32 offset = i915_mmio_reg_offset(wa->reg);
3005
3006		if (mcr_range(i915, offset))
3007			continue;
3008
3009		*cs++ = srm;
3010		*cs++ = offset;
3011		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3012		*cs++ = 0;
3013	}
3014	intel_ring_advance(rq, cs);
3015
3016	return 0;
3017}
3018
/*
 * Read back the registers of @wal through a request on @ce and check
 * each value with wa_verify().
 *
 * @ce: context used to create the request
 * @wal: workaround list to verify
 * @from: caller-supplied string passed through to wa_verify()
 *
 * A scratch vma holding one u32 per list entry is created, wa_list_srm()
 * emits the register stores into it, and after the request completes the
 * results are mapped and compared. Entries for which mcr_range() is true
 * are skipped.
 *
 * Returns 0 if the list is empty or every checked entry verifies,
 * -ETIME if the request does not complete within HZ / 5 jiffies,
 * -ENXIO if any wa_verify() call fails, or the error code of whichever
 * setup step failed.
 */
3019static int engine_wa_list_verify(struct intel_context *ce,
3020				 const struct i915_wa_list * const wal,
3021				 const char *from)
3022{
3023	const struct i915_wa *wa;
3024	struct i915_request *rq;
3025	struct i915_vma *vma;
3026	struct i915_gem_ww_ctx ww;
3027	unsigned int i;
3028	u32 *results;
3029	int err;
3030
3031	if (!wal->count)
3032		return 0;
3033
3034	vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3035					   wal->count * sizeof(u32));
3036	if (IS_ERR(vma))
3037		return PTR_ERR(vma);
3038
3039	intel_engine_pm_get(ce->engine);
3040	i915_gem_ww_ctx_init(&ww, false);
	/* Re-entered from err_pm after a successful -EDEADLK backoff. */
3041retry:
3042	err = i915_gem_object_lock(vma->obj, &ww);
3043	if (err == 0)
3044		err = intel_context_pin_ww(ce, &ww);
3045	if (err)
3046		goto err_pm;
3047
3048	err = i915_vma_pin_ww(vma, &ww, 0, 0,
3049			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3050	if (err)
3051		goto err_unpin;
3052
3053	rq = i915_request_create(ce);
3054	if (IS_ERR(rq)) {
3055		err = PTR_ERR(rq);
3056		goto err_vma;
3057	}
3058
3059	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
 
3060	if (err == 0)
3061		err = wa_list_srm(rq, wal, vma);
3062
	/*
	 * The request is added even when an error occurred above; the
	 * error is recorded on it first. The reference taken here is
	 * dropped at err_rq.
	 */
3063	i915_request_get(rq);
3064	if (err)
3065		i915_request_set_error_once(rq, err);
3066	i915_request_add(rq);
3067
 
3068	if (err)
3069		goto err_rq;
3070
 
 
3071	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3072		err = -ETIME;
3073		goto err_rq;
3074	}
3075
3076	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3077	if (IS_ERR(results)) {
3078		err = PTR_ERR(results);
3079		goto err_rq;
3080	}
3081
3082	err = 0;
	/* results[i] corresponds to list entry i (see wa_list_srm()). */
3083	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3084		if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
3085			continue;
3086
		/* Keep going after a failure so every entry is checked. */
3087		if (!wa_verify(wal->gt, wa, results[i], wal->name, from))
3088			err = -ENXIO;
3089	}
3090
3091	i915_gem_object_unpin_map(vma->obj);
3092
	/* Unwind in reverse order of acquisition. */
3093err_rq:
3094	i915_request_put(rq);
3095err_vma:
3096	i915_vma_unpin(vma);
3097err_unpin:
3098	intel_context_unpin(ce);
3099err_pm:
3100	if (err == -EDEADLK) {
3101		err = i915_gem_ww_ctx_backoff(&ww);
3102		if (!err)
3103			goto retry;
3104	}
3105	i915_gem_ww_ctx_fini(&ww);
3106	intel_engine_pm_put(ce->engine);
3107	i915_vma_put(vma);
3108	return err;
3109}
3110
3111int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3112				    const char *from)
3113{
3114	return engine_wa_list_verify(engine->kernel_context,
3115				     &engine->wa_list,
3116				     from);
3117}
3118
3119#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3120#include "selftest_workarounds.c"
3121#endif

 
   1/*
   2 * SPDX-License-Identifier: MIT
   3 *
   4 * Copyright © 2014-2018 Intel Corporation
   5 */
   6
   7#include "i915_drv.h"
 
   8#include "intel_context.h"
   9#include "intel_engine_pm.h"
 
 
  10#include "intel_gt.h"
 
 
 
 
  11#include "intel_ring.h"
  12#include "intel_workarounds.h"
  13
 
 
  14/**
  15 * DOC: Hardware workarounds
  16 *
  17 * This file is intended as a central place to implement most [1]_ of the
  18 * required workarounds for hardware to work as originally intended. They fall
  19 * in five basic categories depending on how/when they are applied:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  20 *
  21 * - Workarounds that touch registers that are saved/restored to/from the HW
  22 *   context image. The list is emitted (via Load Register Immediate commands)
  23 *   every time a new context is created.
  24 * - GT workarounds. The list of these WAs is applied whenever these registers
  25 *   revert to default values (on GPU reset, suspend/resume [2]_, etc..).
  26 * - Display workarounds. The list is applied during display clock-gating
  27 *   initialization.
  28 * - Workarounds that whitelist a privileged register, so that UMDs can manage
  29 *   them directly. This is just a special case of an MMIO workaround (as we
  30 *   write the list of these to-be-whitelisted registers to some special HW
  31 *   registers).
  32 * - Workaround batchbuffers, that get executed automatically by the hardware
  33 *   on every HW context restore.
  34 *
  35 * .. [1] Please notice that there are other WAs that, due to their nature,
  36 *    cannot be applied from a central place. Those are peppered around the rest
  37 *    of the code, as needed.
  38 *
  39 * .. [2] Technically, some registers are powercontext saved & restored, so they
  40 *    survive a suspend/resume. In practice, writing them again is not too
  41 *    costly and simplifies things. We can revisit this in the future.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  42 *
  43 * Layout
  44 * ~~~~~~
 
 
  45 *
  46 * Keep things in this file ordered by WA type, as per the above (context, GT,
  47 * display, register whitelist, batchbuffer). Then, inside each type, keep the
  48 * following order:
  49 *
  50 * - Infrastructure functions and macros
  51 * - WAs per platform in standard gen/chrono order
  52 * - Public functions to init or apply the given workaround type.
  53 */
  54
  55static void wa_init_start(struct i915_wa_list *wal, const char *name, const char *engine_name)
 
  56{
 
  57	wal->name = name;
  58	wal->engine_name = engine_name;
  59}
  60
  61#define WA_LIST_CHUNK (1 << 4)
  62
  63static void wa_init_finish(struct i915_wa_list *wal)
  64{
  65	/* Trim unused entries. */
  66	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
  67		struct i915_wa *list = kmemdup(wal->list,
  68					       wal->count * sizeof(*list),
  69					       GFP_KERNEL);
  70
  71		if (list) {
  72			kfree(wal->list);
  73			wal->list = list;
  74		}
  75	}
  76
  77	if (!wal->count)
  78		return;
  79
  80	DRM_DEBUG_DRIVER("Initialized %u %s workarounds on %s\n",
  81			 wal->wa_count, wal->name, wal->engine_name);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  82}
  83
  84static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
  85{
  86	unsigned int addr = i915_mmio_reg_offset(wa->reg);
 
  87	unsigned int start = 0, end = wal->count;
  88	const unsigned int grow = WA_LIST_CHUNK;
  89	struct i915_wa *wa_;
  90
  91	GEM_BUG_ON(!is_power_of_2(grow));
  92
  93	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
  94		struct i915_wa *list;
  95
  96		list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
  97				     GFP_KERNEL);
  98		if (!list) {
  99			DRM_ERROR("No space for workaround init!\n");
 100			return;
 101		}
 102
 103		if (wal->list)
 104			memcpy(list, wal->list, sizeof(*wa) * wal->count);
 
 
 105
 106		wal->list = list;
 107	}
 108
 109	while (start < end) {
 110		unsigned int mid = start + (end - start) / 2;
 111
 112		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
 113			start = mid + 1;
 114		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
 115			end = mid;
 116		} else {
 117			wa_ = &wal->list[mid];
 118
 119			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
 120				DRM_ERROR("Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
 121					  i915_mmio_reg_offset(wa_->reg),
 122					  wa_->clr, wa_->set);
 
 123
 124				wa_->set &= ~wa->clr;
 125			}
 126
 127			wal->wa_count++;
 128			wa_->set |= wa->set;
 129			wa_->clr |= wa->clr;
 130			wa_->read |= wa->read;
 131			return;
 132		}
 133	}
 134
 135	wal->wa_count++;
 136	wa_ = &wal->list[wal->count++];
 137	*wa_ = *wa;
 138
 139	while (wa_-- > wal->list) {
 140		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
 141			   i915_mmio_reg_offset(wa_[1].reg));
 142		if (i915_mmio_reg_offset(wa_[1].reg) >
 143		    i915_mmio_reg_offset(wa_[0].reg))
 144			break;
 145
 146		swap(wa_[1], wa_[0]);
 147	}
 148}
 149
 150static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
 151		   u32 clear, u32 set, u32 read_mask)
 152{
 153	struct i915_wa wa = {
 154		.reg  = reg,
 155		.clr  = clear,
 156		.set  = set,
 157		.read = read_mask,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 158	};
 159
 160	_wa_add(wal, &wa);
 161}
 162
 163static void
 164wa_write_masked_or(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
 
 
 
 
 
 
 165{
 166	wa_add(wal, reg, clear, set, clear);
 167}
 168
 169static void
 170wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 171{
 172	wa_write_masked_or(wal, reg, ~0, set);
 173}
 174
 175static void
 176wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 177{
 178	wa_write_masked_or(wal, reg, set, set);
 
 
 
 
 
 
 179}
 180
 181static void
 182wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
 183{
 184	wa_write_masked_or(wal, reg, clr, 0);
 185}
 186
 187static void
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 188wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 189{
 190	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val);
 
 
 
 
 
 
 191}
 192
 193static void
 194wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 195{
 196	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val);
 197}
 198
/*
 * Shorthand wrappers around the helpers above. All three expand to a call
 * that names `wal` directly, so they can only be used where a workaround
 * list variable called `wal` is in scope.
 */

/* Masked-enable @mask in register @addr (wa_masked_en()). */
 199#define WA_SET_BIT_MASKED(addr, mask) \
 200	wa_masked_en(wal, (addr), (mask))
 
 
 
 201
/* Masked-disable @mask in register @addr (wa_masked_dis()). */
 202#define WA_CLR_BIT_MASKED(addr, mask) \
 203	wa_masked_dis(wal, (addr), (mask))
 
 
 
 
 204
/*
 * Write _MASKED_FIELD(@mask, @value) to register @addr. The clear
 * argument is 0, and wa_write_masked_or() reuses it as the read mask, so
 * entries added this way carry an empty read mask.
 */
 205#define WA_SET_FIELD_MASKED(addr, mask, value) \
 206	wa_write_masked_or(wal, (addr), 0, _MASKED_FIELD((mask), (value)))
 
 
 
 
 207
 208static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
 209				      struct i915_wa_list *wal)
 210{
 211	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
 212}
 213
 214static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
 215				      struct i915_wa_list *wal)
 216{
 217	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
 218}
 219
 220static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
 221				      struct i915_wa_list *wal)
 222{
 223	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
 224
 225	/* WaDisableAsyncFlipPerfMode:bdw,chv */
 226	WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE);
 227
 228	/* WaDisablePartialInstShootdown:bdw,chv */
 229	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
 230			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 231
 232	/* Use Force Non-Coherent whenever executing a 3D context. This is a
 233	 * workaround for for a possible hang in the unlikely event a TLB
 234	 * invalidation occurs during a PSD flush.
 235	 */
 236	/* WaForceEnableNonCoherent:bdw,chv */
 237	/* WaHdcDisableFetchWhenMasked:bdw,chv */
 238	WA_SET_BIT_MASKED(HDC_CHICKEN0,
 239			  HDC_DONOT_FETCH_MEM_WHEN_MASKED |
 240			  HDC_FORCE_NON_COHERENT);
 241
 242	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
 243	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
 244	 *  polygons in the same 8x4 pixel/sample area to be processed without
 245	 *  stalling waiting for the earlier ones to write to Hierarchical Z
 246	 *  buffer."
 247	 *
 248	 * This optimization is off by default for BDW and CHV; turn it on.
 249	 */
 250	WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
 251
 252	/* Wa4x4STCOptimizationDisable:bdw,chv */
 253	WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
 254
 255	/*
 256	 * BSpec recommends 8x4 when MSAA is used,
 257	 * however in practice 16x4 seems fastest.
 258	 *
 259	 * Note that PS/WM thread counts depend on the WIZ hashing
 260	 * disable bit, which we don't touch here, but it's good
 261	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 262	 */
 263	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
 264			    GEN6_WIZ_HASHING_MASK,
 265			    GEN6_WIZ_HASHING_16x4);
 266}
 267
 268static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
 269				     struct i915_wa_list *wal)
 270{
 271	struct drm_i915_private *i915 = engine->i915;
 272
 273	gen8_ctx_workarounds_init(engine, wal);
 274
 275	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
 276	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 277
 278	/* WaDisableDopClockGating:bdw
 279	 *
 280	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
 281	 * to disable EUTC clock gating.
 282	 */
 283	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
 284			  DOP_CLOCK_GATING_DISABLE);
 285
 286	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
 287			  GEN8_SAMPLER_POWER_BYPASS_DIS);
 288
 289	WA_SET_BIT_MASKED(HDC_CHICKEN0,
 290			  /* WaForceContextSaveRestoreNonCoherent:bdw */
 291			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 292			  /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
 293			  (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
 294}
 295
 296static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
 297				     struct i915_wa_list *wal)
 298{
 299	gen8_ctx_workarounds_init(engine, wal);
 300
 301	/* WaDisableThreadStallDopClockGating:chv */
 302	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 303
 304	/* Improve HiZ throughput on CHV. */
 305	WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
 306}
 307
 308static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
 309				      struct i915_wa_list *wal)
 310{
 311	struct drm_i915_private *i915 = engine->i915;
 312
 313	if (HAS_LLC(i915)) {
 314		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
 315		 *
 316		 * Must match Display Engine. See
 317		 * WaCompressedResourceDisplayNewHashMode.
 318		 */
 319		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 320				  GEN9_PBE_COMPRESSED_HASH_SELECTION);
 321		WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
 322				  GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
 323	}
 324
 325	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
 326	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
 327	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
 328			  FLOW_CONTROL_ENABLE |
 329			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 330
 331	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
 332	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
 333	WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
 334			  GEN9_ENABLE_YV12_BUGFIX |
 335			  GEN9_ENABLE_GPGPU_PREEMPTION);
 336
 337	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
 338	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
 339	WA_SET_BIT_MASKED(CACHE_MODE_1,
 340			  GEN8_4x4_STC_OPTIMIZATION_DISABLE |
 341			  GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
 342
 343	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
 344	WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
 345			  GEN9_CCS_TLB_PREFETCH_ENABLE);
 346
 347	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
 348	WA_SET_BIT_MASKED(HDC_CHICKEN0,
 349			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 350			  HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
 351
 352	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
 353	 * both tied to WaForceContextSaveRestoreNonCoherent
 354	 * in some hsds for skl. We keep the tie for all gen9. The
 355	 * documentation is a bit hazy and so we want to get common behaviour,
 356	 * even though there is no clear evidence we would need both on kbl/bxt.
 357	 * This area has been source of system hangs so we play it safe
 358	 * and mimic the skl regardless of what bspec says.
 359	 *
 360	 * Use Force Non-Coherent whenever executing a 3D context. This
 361	 * is a workaround for a possible hang in the unlikely event
 362	 * a TLB invalidation occurs during a PSD flush.
 363	 */
 364
 365	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
 366	WA_SET_BIT_MASKED(HDC_CHICKEN0,
 367			  HDC_FORCE_NON_COHERENT);
 368
 369	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
 370	if (IS_SKYLAKE(i915) ||
 371	    IS_KABYLAKE(i915) ||
 372	    IS_COFFEELAKE(i915) ||
 373	    IS_COMETLAKE(i915))
 374		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
 375				  GEN8_SAMPLER_POWER_BYPASS_DIS);
 376
 377	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
 378	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
 379
 380	/*
 381	 * Supporting preemption with fine-granularity requires changes in the
 382	 * batch buffer programming. Since we can't break old userspace, we
 383	 * need to set our default preemption level to safe value. Userspace is
 384	 * still able to use more fine-grained preemption levels, since in
 385	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
 386	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
 387	 * not real HW workarounds, but merely a way to start using preemption
 388	 * while maintaining old contract with userspace.
 389	 */
 390
 391	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
 392	WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
 393
 394	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */
 395	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
 396			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 397			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 398
 399	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
 400	if (IS_GEN9_LP(i915))
 401		WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
 402}
 403
 404static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
 405				struct i915_wa_list *wal)
 406{
 407	struct intel_gt *gt = engine->gt;
 408	u8 vals[3] = { 0, 0, 0 };
 409	unsigned int i;
 410
 411	for (i = 0; i < 3; i++) {
 412		u8 ss;
 413
 414		/*
 415		 * Only consider slices where one, and only one, subslice has 7
 416		 * EUs
 417		 */
 418		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
 419			continue;
 420
 421		/*
 422		 * subslice_7eu[i] != 0 (because of the check above) and
 423		 * ss_max == 4 (maximum number of subslices possible per slice)
 424		 *
 425		 * ->    0 <= ss <= 3;
 426		 */
 427		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
 428		vals[i] = 3 - ss;
 429	}
 430
 431	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
 432		return;
 433
 434	/* Tune IZ hashing. See intel_device_info_runtime_init() */
 435	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
 436			    GEN9_IZ_HASHING_MASK(2) |
 437			    GEN9_IZ_HASHING_MASK(1) |
 438			    GEN9_IZ_HASHING_MASK(0),
 439			    GEN9_IZ_HASHING(2, vals[2]) |
 440			    GEN9_IZ_HASHING(1, vals[1]) |
 441			    GEN9_IZ_HASHING(0, vals[0]));
 442}
 443
 444static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
 445				     struct i915_wa_list *wal)
 446{
 447	gen9_ctx_workarounds_init(engine, wal);
 448	skl_tune_iz_hashing(engine, wal);
 449}
 450
 451static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
 452				     struct i915_wa_list *wal)
 453{
 454	gen9_ctx_workarounds_init(engine, wal);
 455
 456	/* WaDisableThreadStallDopClockGating:bxt */
 457	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
 458			  STALL_DOP_GATING_DISABLE);
 459
 460	/* WaToEnableHwFixForPushConstHWBug:bxt */
 461	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 462			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 463}
 464
 465static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
 466				     struct i915_wa_list *wal)
 467{
 468	struct drm_i915_private *i915 = engine->i915;
 469
 470	gen9_ctx_workarounds_init(engine, wal);
 471
 472	/* WaToEnableHwFixForPushConstHWBug:kbl */
 473	if (IS_KBL_REVID(i915, KBL_REVID_C0, REVID_FOREVER))
 474		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 475				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 476
 477	/* WaDisableSbeCacheDispatchPortSharing:kbl */
 478	WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1,
 479			  GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 480}
 481
 482static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
 483				     struct i915_wa_list *wal)
 484{
 485	gen9_ctx_workarounds_init(engine, wal);
 486
 487	/* WaToEnableHwFixForPushConstHWBug:glk */
 488	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 489			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 490}
 491
 492static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
 493				     struct i915_wa_list *wal)
 494{
 495	gen9_ctx_workarounds_init(engine, wal);
 496
 497	/* WaToEnableHwFixForPushConstHWBug:cfl */
 498	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 499			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 500
 501	/* WaDisableSbeCacheDispatchPortSharing:cfl */
 502	WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1,
 503			  GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 504}
 505
 506static void cnl_ctx_workarounds_init(struct intel_engine_cs *engine,
 507				     struct i915_wa_list *wal)
 508{
 509	/* WaForceContextSaveRestoreNonCoherent:cnl */
 510	WA_SET_BIT_MASKED(CNL_HDC_CHICKEN0,
 511			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT);
 512
 513	/* WaDisableReplayBufferBankArbitrationOptimization:cnl */
 514	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 515			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 516
 517	/* WaPushConstantDereferenceHoldDisable:cnl */
 518	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, PUSH_CONSTANT_DEREF_DISABLE);
 519
 520	/* FtrEnableFastAnisoL1BankingFix:cnl */
 521	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, CNL_FAST_ANISO_L1_BANKING_FIX);
 522
 523	/* WaDisable3DMidCmdPreemption:cnl */
 524	WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
 525
 526	/* WaDisableGPGPUMidCmdPreemption:cnl */
 527	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
 528			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 529			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 530
 531	/* WaDisableEarlyEOT:cnl */
 532	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, DISABLE_EARLY_EOT);
 533}
 534
 535static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
 536				     struct i915_wa_list *wal)
 537{
 538	struct drm_i915_private *i915 = engine->i915;
 539
 540	/* WaDisableBankHangMode:icl */
 541	wa_write(wal,
 542		 GEN8_L3CNTLREG,
 543		 intel_uncore_read(engine->uncore, GEN8_L3CNTLREG) |
 544		 GEN8_ERRDETBCTRL);
 545
 546	/* Wa_1604370585:icl (pre-prod)
 547	 * Formerly known as WaPushConstantDereferenceHoldDisable
 548	 */
 549	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
 550		WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
 551				  PUSH_CONSTANT_DEREF_DISABLE);
 552
 553	/* WaForceEnableNonCoherent:icl
 554	 * This is not the same workaround as in early Gen9 platforms, where
 555	 * lacking this could cause system hangs, but coherency performance
 556	 * overhead is high and only a few compute workloads really need it
 557	 * (the register is whitelisted in hardware now, so UMDs can opt in
 558	 * for coherency if they have a good reason).
 559	 */
 560	WA_SET_BIT_MASKED(ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
 561
 562	/* Wa_2006611047:icl (pre-prod)
 563	 * Formerly known as WaDisableImprovedTdlClkGating
 564	 */
 565	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
 566		WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
 567				  GEN11_TDL_CLOCK_GATING_FIX_DISABLE);
 568
 569	/* Wa_2006665173:icl (pre-prod) */
 570	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
 571		WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3,
 572				  GEN11_BLEND_EMB_FIX_DISABLE_IN_RCC);
 573
 574	/* WaEnableFloatBlendOptimization:icl */
 575	wa_write_masked_or(wal,
 576			   GEN10_CACHE_MODE_SS,
 577			   0, /* write-only, so skip validation */
 578			   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE));
 579
 580	/* WaDisableGPGPUMidThreadPreemption:icl */
 581	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
 582			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 583			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 584
 585	/* allow headerless messages for preemptible GPGPU context */
 586	WA_SET_BIT_MASKED(GEN10_SAMPLER_MODE,
 587			  GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
 588
 589	/* Wa_1604278689:icl,ehl */
 590	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
 591	wa_write_masked_or(wal, IVB_FBC_RT_BASE_UPPER,
 592			   0, /* write-only register; skip validation */
 593			   0xFFFFFFFF);
 594
 595	/* Wa_1406306137:icl,ehl */
 596	wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
 597}
 598
 599static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine,
 600				     struct i915_wa_list *wal)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 601{
 
 
 602	/*
 603	 * Wa_1409142259:tgl
 604	 * Wa_1409347922:tgl
 605	 * Wa_1409252684:tgl
 606	 * Wa_1409217633:tgl
 607	 * Wa_1409207793:tgl
 608	 * Wa_1409178076:tgl
 609	 * Wa_1408979724:tgl
 610	 */
 611	WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3,
 612			  GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
 613
 614	/*
 615	 * Wa_1604555607:gen12 and Wa_1608008084:gen12
 616	 * FF_MODE2 register will return the wrong value when read. The default
 617	 * value for this register is zero for all fields and there are no bit
 618	 * masks. So instead of doing a RMW we should just write the GS Timer
 619	 * and TDS timer values for Wa_1604555607 and Wa_16011163337.
 620	 */
 621	wa_add(wal,
 622	       FF_MODE2,
 623	       FF_MODE2_GS_TIMER_MASK | FF_MODE2_TDS_TIMER_MASK,
 624	       FF_MODE2_GS_TIMER_224  | FF_MODE2_TDS_TIMER_128,
 625	       0);
 626
 627	/* WaDisableGPGPUMidThreadPreemption:tgl */
 628	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
 629			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 630			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 631}
 632
 633static void
 634__intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
 635			   struct i915_wa_list *wal,
 636			   const char *name)
 637{
 638	struct drm_i915_private *i915 = engine->i915;
 639
 
 
 
 
 
 
 
 
 
 
 640	if (engine->class != RENDER_CLASS)
 641		return;
 642
 643	wa_init_start(wal, name, engine->name);
 644
 645	if (IS_GEN(i915, 12))
 646		tgl_ctx_workarounds_init(engine, wal);
 647	else if (IS_GEN(i915, 11))
 
 
 
 
 
 
 648		icl_ctx_workarounds_init(engine, wal);
 649	else if (IS_CANNONLAKE(i915))
 650		cnl_ctx_workarounds_init(engine, wal);
 651	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
 652		cfl_ctx_workarounds_init(engine, wal);
 653	else if (IS_GEMINILAKE(i915))
 654		glk_ctx_workarounds_init(engine, wal);
 655	else if (IS_KABYLAKE(i915))
 656		kbl_ctx_workarounds_init(engine, wal);
 657	else if (IS_BROXTON(i915))
 658		bxt_ctx_workarounds_init(engine, wal);
 659	else if (IS_SKYLAKE(i915))
 660		skl_ctx_workarounds_init(engine, wal);
 661	else if (IS_CHERRYVIEW(i915))
 662		chv_ctx_workarounds_init(engine, wal);
 663	else if (IS_BROADWELL(i915))
 664		bdw_ctx_workarounds_init(engine, wal);
 665	else if (IS_GEN(i915, 7))
 666		gen7_ctx_workarounds_init(engine, wal);
 667	else if (IS_GEN(i915, 6))
 668		gen6_ctx_workarounds_init(engine, wal);
 669	else if (INTEL_GEN(i915) < 8)
 670		return;
 671	else
 672		MISSING_CASE(INTEL_GEN(i915));
 673
 
 674	wa_init_finish(wal);
 675}
 676
 677void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
 678{
 679	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
 680}
 681
 682int intel_engine_emit_ctx_wa(struct i915_request *rq)
 683{
 684	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
 
 
 
 685	struct i915_wa *wa;
 686	unsigned int i;
 687	u32 *cs;
 688	int ret;
 689
 690	if (wal->count == 0)
 691		return 0;
 692
 693	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
 694	if (ret)
 695		return ret;
 696
 697	cs = intel_ring_begin(rq, (wal->count * 2 + 2));
 
 
 
 
 
 698	if (IS_ERR(cs))
 699		return PTR_ERR(cs);
 700
 
 
 
 
 
 
 701	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
 702	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 703		*cs++ = i915_mmio_reg_offset(wa->reg);
 704		*cs++ = wa->set;
 705	}
 706	*cs++ = MI_NOOP;
 707
 
 
 
 
 
 
 
 
 
 
 
 
 
 708	intel_ring_advance(rq, cs);
 709
 710	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
 711	if (ret)
 712		return ret;
 713
 714	return 0;
 715}
 716
 717static void
 718gen4_gt_workarounds_init(struct drm_i915_private *i915,
 719			 struct i915_wa_list *wal)
 720{
 721	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
 722	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
 723}
 724
 725static void
 726g4x_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 727{
 728	gen4_gt_workarounds_init(i915, wal);
 729
 730	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
 731	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
 732}
 733
 734static void
 735ilk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 736{
 737	g4x_gt_workarounds_init(i915, wal);
 738
 739	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
 740}
 741
 742static void
 743snb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 744{
 745	/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
 746	wa_masked_en(wal,
 747		     _3D_CHICKEN,
 748		     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
 749
 750	/* WaDisable_RenderCache_OperationalFlush:snb */
 751	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
 752
 753	/*
 754	 * BSpec recommends 8x4 when MSAA is used,
 755	 * however in practice 16x4 seems fastest.
 756	 *
 757	 * Note that PS/WM thread counts depend on the WIZ hashing
 758	 * disable bit, which we don't touch here, but it's good
 759	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 760	 */
 761	wa_add(wal,
 762	       GEN6_GT_MODE, 0,
 763	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
 764	       GEN6_WIZ_HASHING_16x4);
 765
 766	wa_masked_dis(wal, CACHE_MODE_0, CM0_STC_EVICT_DISABLE_LRA_SNB);
 767
 768	wa_masked_en(wal,
 769		     _3D_CHICKEN3,
 770		     /* WaStripsFansDisableFastClipPerformanceFix:snb */
 771		     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
 772		     /*
 773		      * Bspec says:
 774		      * "This bit must be set if 3DSTATE_CLIP clip mode is set
 775		      * to normal and 3DSTATE_SF number of SF output attributes
 776		      * is more than 16."
 777		      */
 778		   _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
 779}
 780
 781static void
 782ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 783{
 784	/* WaDisableEarlyCull:ivb */
 785	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
 786
 787	/* WaDisablePSDDualDispatchEnable:ivb */
 788	if (IS_IVB_GT1(i915))
 789		wa_masked_en(wal,
 790			     GEN7_HALF_SLICE_CHICKEN1,
 791			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
 792
 793	/* WaDisable_RenderCache_OperationalFlush:ivb */
 794	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
 795
 796	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
 797	wa_masked_dis(wal,
 798		      GEN7_COMMON_SLICE_CHICKEN1,
 799		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
 800
 801	/* WaApplyL3ControlAndL3ChickenMode:ivb */
 802	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
 803	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
 804
 805	/* WaForceL3Serialization:ivb */
 806	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
 807
 808	/*
 809	 * WaVSThreadDispatchOverride:ivb,vlv
 810	 *
 811	 * This actually overrides the dispatch
 812	 * mode for all thread types.
 813	 */
 814	wa_write_masked_or(wal, GEN7_FF_THREAD_MODE,
 815			   GEN7_FF_SCHED_MASK,
 816			   GEN7_FF_TS_SCHED_HW |
 817			   GEN7_FF_VS_SCHED_HW |
 818			   GEN7_FF_DS_SCHED_HW);
 819
 820	if (0) { /* causes HiZ corruption on ivb:gt1 */
 821		/* enable HiZ Raw Stall Optimization */
 822		wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
 823	}
 824
 825	/* WaDisable4x2SubspanOptimization:ivb */
 826	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
 827
 828	/*
 829	 * BSpec recommends 8x4 when MSAA is used,
 830	 * however in practice 16x4 seems fastest.
 831	 *
 832	 * Note that PS/WM thread counts depend on the WIZ hashing
 833	 * disable bit, which we don't touch here, but it's good
 834	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 835	 */
 836	wa_add(wal, GEN7_GT_MODE, 0,
 837	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
 838	       GEN6_WIZ_HASHING_16x4);
 839}
 840
 841static void
 842vlv_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 843{
 844	/* WaDisableEarlyCull:vlv */
 845	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
 846
 847	/* WaPsdDispatchEnable:vlv */
 848	/* WaDisablePSDDualDispatchEnable:vlv */
 849	wa_masked_en(wal,
 850		     GEN7_HALF_SLICE_CHICKEN1,
 851		     GEN7_MAX_PS_THREAD_DEP |
 852		     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
 853
 854	/* WaDisable_RenderCache_OperationalFlush:vlv */
 855	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
 856
 857	/* WaForceL3Serialization:vlv */
 858	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
 859
 860	/*
 861	 * WaVSThreadDispatchOverride:ivb,vlv
 862	 *
 863	 * This actually overrides the dispatch
 864	 * mode for all thread types.
 865	 */
 866	wa_write_masked_or(wal,
 867			   GEN7_FF_THREAD_MODE,
 868			   GEN7_FF_SCHED_MASK,
 869			   GEN7_FF_TS_SCHED_HW |
 870			   GEN7_FF_VS_SCHED_HW |
 871			   GEN7_FF_DS_SCHED_HW);
 872
 873	/*
 874	 * BSpec says this must be set, even though
 875	 * WaDisable4x2SubspanOptimization isn't listed for VLV.
 876	 */
 877	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
 878
 879	/*
 880	 * BSpec recommends 8x4 when MSAA is used,
 881	 * however in practice 16x4 seems fastest.
 882	 *
 883	 * Note that PS/WM thread counts depend on the WIZ hashing
 884	 * disable bit, which we don't touch here, but it's good
 885	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 886	 */
 887	wa_add(wal, GEN7_GT_MODE, 0,
 888	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
 889	       GEN6_WIZ_HASHING_16x4);
 890
 891	/*
 892	 * WaIncreaseL3CreditsForVLVB0:vlv
 893	 * This is the hardware default actually.
 894	 */
 895	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
 896}
 897
 898static void
 899hsw_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 900{
 901	/* L3 caching of data atomics doesn't work -- disable it. */
 902	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
 903
 904	wa_add(wal,
 905	       HSW_ROW_CHICKEN3, 0,
 906	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
 907		0 /* XXX does this reg exist? */);
 908
 909	/* WaVSRefCountFullforceMissDisable:hsw */
 910	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
 
 
 
 
 
 
 
 
 911
 912	wa_masked_dis(wal,
 913		      CACHE_MODE_0_GEN7,
 914		      /* WaDisable_RenderCache_OperationalFlush:hsw */
 915		      RC_OP_FLUSH_ENABLE |
 916		      /* enable HiZ Raw Stall Optimization */
 917		      HIZ_RAW_STALL_OPT_DISABLE);
 918
 919	/* WaDisable4x2SubspanOptimization:hsw */
 920	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 921
 922	/*
 923	 * BSpec recommends 8x4 when MSAA is used,
 924	 * however in practice 16x4 seems fastest.
 925	 *
 926	 * Note that PS/WM thread counts depend on the WIZ hashing
 927	 * disable bit, which we don't touch here, but it's good
 928	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 929	 */
 930	wa_add(wal, GEN7_GT_MODE, 0,
 931	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
 932	       GEN6_WIZ_HASHING_16x4);
 
 933
 934	/* WaSampleCChickenBitEnable:hsw */
 935	wa_masked_en(wal, HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
 936}
 937
 938static void
 939gen9_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 940{
 
 
 
 
 
 941	/* WaDisableKillLogic:bxt,skl,kbl */
 942	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
 943		wa_write_or(wal,
 944			    GAM_ECOCHK,
 945			    ECOCHK_DIS_TLB);
 946
 947	if (HAS_LLC(i915)) {
 948		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
 949		 *
 950		 * Must match Display Engine. See
 951		 * WaCompressedResourceDisplayNewHashMode.
 952		 */
 953		wa_write_or(wal,
 954			    MMCD_MISC_CTRL,
 955			    MMCD_PCLA | MMCD_HOTSPOT_EN);
 956	}
 957
 958	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
 959	wa_write_or(wal,
 960		    GAM_ECOCHK,
 961		    BDW_DISABLE_HDC_INVALIDATION);
 962}
 963
 964static void
 965skl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 966{
 967	gen9_gt_workarounds_init(i915, wal);
 968
 969	/* WaDisableGafsUnitClkGating:skl */
 970	wa_write_or(wal,
 971		    GEN7_UCGCTL4,
 972		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
 973
 974	/* WaInPlaceDecompressionHang:skl */
 975	if (IS_SKL_REVID(i915, SKL_REVID_H0, REVID_FOREVER))
 976		wa_write_or(wal,
 977			    GEN9_GAMT_ECO_REG_RW_IA,
 978			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
 979}
 980
 981static void
 982bxt_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 983{
 984	gen9_gt_workarounds_init(i915, wal);
 985
 986	/* WaInPlaceDecompressionHang:bxt */
 987	wa_write_or(wal,
 988		    GEN9_GAMT_ECO_REG_RW_IA,
 989		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
 990}
 991
 992static void
 993kbl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 994{
 995	gen9_gt_workarounds_init(i915, wal);
 996
 997	/* WaDisableDynamicCreditSharing:kbl */
 998	if (IS_KBL_REVID(i915, 0, KBL_REVID_B0))
 999		wa_write_or(wal,
1000			    GAMT_CHKN_BIT_REG,
1001			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1002
1003	/* WaDisableGafsUnitClkGating:kbl */
1004	wa_write_or(wal,
1005		    GEN7_UCGCTL4,
1006		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1007
1008	/* WaInPlaceDecompressionHang:kbl */
1009	wa_write_or(wal,
1010		    GEN9_GAMT_ECO_REG_RW_IA,
1011		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1012}
1013
/* GT workarounds for Geminilake: nothing beyond the gen9 common set. */
static void
glk_gt_workarounds_init(struct drm_i915_private *i915,
			struct i915_wa_list *wal)
{
	gen9_gt_workarounds_init(i915, wal);
}
1019
1020static void
1021cfl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1022{
1023	gen9_gt_workarounds_init(i915, wal);
1024
1025	/* WaDisableGafsUnitClkGating:cfl */
1026	wa_write_or(wal,
1027		    GEN7_UCGCTL4,
1028		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1029
1030	/* WaInPlaceDecompressionHang:cfl */
1031	wa_write_or(wal,
1032		    GEN9_GAMT_ECO_REG_RW_IA,
1033		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1034}
1035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1036static void
1037wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1038{
1039	const struct sseu_dev_info *sseu = &i915->gt.info.sseu;
1040	unsigned int slice, subslice;
1041	u32 l3_en, mcr, mcr_mask;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1042
1043	GEM_BUG_ON(INTEL_GEN(i915) < 10);
 
 
 
 
 
 
1044
1045	/*
1046	 * WaProgramMgsrForL3BankSpecificMmioReads: cnl,icl
1047	 * L3Banks could be fused off in single slice scenario. If that is
1048	 * the case, we might need to program MCR select to a valid L3Bank
1049	 * by default, to make sure we correctly read certain registers
1050	 * later on (in the range 0xB100 - 0xB3FF).
 
 
 
 
 
 
 
 
 
1051	 *
1052	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:cnl,icl
1053	 * Before any MMIO read into slice/subslice specific registers, MCR
1054	 * packet control register needs to be programmed to point to any
1055	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
1056	 * This means each subsequent MMIO read will be forwarded to an
1057	 * specific s/ss combination, but this is OK since these registers
1058	 * are consistent across s/ss in almost all cases. In the rare
1059	 * occasions, such as INSTDONE, where this value is dependent
1060	 * on s/ss combo, the read should be done with read_subslice_reg.
1061	 *
1062	 * Since GEN8_MCR_SELECTOR contains dual-purpose bits which select both
1063	 * to which subslice, or to which L3 bank, the respective mmio reads
1064	 * will go, we have to find a common index which works for both
1065	 * accesses.
1066	 *
1067	 * Case where we cannot find a common index fortunately should not
1068	 * happen in production hardware, so we only emit a warning instead of
1069	 * implementing something more complex that requires checking the range
1070	 * of every MMIO read.
1071	 */
1072
1073	if (INTEL_GEN(i915) >= 10 && is_power_of_2(sseu->slice_mask)) {
1074		u32 l3_fuse =
1075			intel_uncore_read(&i915->uncore, GEN10_MIRROR_FUSE3) &
1076			GEN10_L3BANK_MASK;
1077
1078		drm_dbg(&i915->drm, "L3 fuse = %x\n", l3_fuse);
1079		l3_en = ~(l3_fuse << GEN10_L3BANK_PAIR_COUNT | l3_fuse);
1080	} else {
1081		l3_en = ~0;
1082	}
1083
1084	slice = fls(sseu->slice_mask) - 1;
1085	subslice = fls(l3_en & intel_sseu_get_subslices(sseu, slice));
1086	if (!subslice) {
1087		drm_warn(&i915->drm,
1088			 "No common index found between subslice mask %x and L3 bank mask %x!\n",
1089			 intel_sseu_get_subslices(sseu, slice), l3_en);
1090		subslice = fls(l3_en);
1091		drm_WARN_ON(&i915->drm, !subslice);
1092	}
1093	subslice--;
1094
1095	if (INTEL_GEN(i915) >= 11) {
1096		mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1097		mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1098	} else {
1099		mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1100		mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1101	}
1102
1103	drm_dbg(&i915->drm, "MCR slice/subslice = %x\n", mcr);
 
 
1104
1105	wa_write_masked_or(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1106}
1107
1108static void
1109cnl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1110{
1111	wa_init_mcr(i915, wal);
 
 
 
 
 
 
 
1112
1113	/* WaInPlaceDecompressionHang:cnl */
1114	wa_write_or(wal,
1115		    GEN9_GAMT_ECO_REG_RW_IA,
1116		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
 
 
1117}
1118
1119static void
1120icl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1121{
1122	wa_init_mcr(i915, wal);
1123
1124	/* WaInPlaceDecompressionHang:icl */
1125	wa_write_or(wal,
1126		    GEN9_GAMT_ECO_REG_RW_IA,
1127		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1128
1129	/* WaModifyGamTlbPartitioning:icl */
1130	wa_write_masked_or(wal,
1131			   GEN11_GACB_PERF_CTRL,
1132			   GEN11_HASH_CTRL_MASK,
1133			   GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1134
1135	/* Wa_1405766107:icl
1136	 * Formerly known as WaCL2SFHalfMaxAlloc
1137	 */
1138	wa_write_or(wal,
1139		    GEN11_LSN_UNSLCVC,
1140		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1141		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1142
1143	/* Wa_220166154:icl
1144	 * Formerly known as WaDisCtxReload
1145	 */
1146	wa_write_or(wal,
1147		    GEN8_GAMW_ECO_DEV_RW_IA,
1148		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1149
1150	/* Wa_1405779004:icl (pre-prod) */
1151	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
1152		wa_write_or(wal,
1153			    SLICE_UNIT_LEVEL_CLKGATE,
1154			    MSCUNIT_CLKGATE_DIS);
1155
1156	/* Wa_1406838659:icl (pre-prod) */
1157	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
1158		wa_write_or(wal,
1159			    INF_UNIT_LEVEL_CLKGATE,
1160			    CGPSF_CLKGATE_DIS);
1161
1162	/* Wa_1406463099:icl
1163	 * Formerly known as WaGamTlbPendError
1164	 */
1165	wa_write_or(wal,
1166		    GAMT_CHKN_BIT_REG,
1167		    GAMT_CHKN_DISABLE_L3_COH_PIPE);
1168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1169	/* Wa_1607087056:icl,ehl,jsl */
1170	if (IS_ICELAKE(i915) ||
1171	    IS_EHL_REVID(i915, EHL_REVID_A0, EHL_REVID_A0)) {
 
1172		wa_write_or(wal,
1173			    SLICE_UNIT_LEVEL_CLKGATE,
1174			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1175	}
1176}
1177
1178static void
1179tgl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1180{
1181	wa_init_mcr(i915, wal);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1182
1183	/* Wa_1409420604:tgl */
1184	if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0))
1185		wa_write_or(wal,
1186			    SUBSLICE_UNIT_LEVEL_CLKGATE2,
1187			    CPSSUNIT_CLKGATE_DIS);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1188
1189	/* Wa_1607087056:tgl also know as BUG:1409180338 */
1190	if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0))
1191		wa_write_or(wal,
1192			    SLICE_UNIT_LEVEL_CLKGATE,
1193			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1194}
1195
/*
 * Fill @wal with the GT workarounds of the running platform by dispatching
 * to the matching *_gt_workarounds_init() routine. Platforms up to gen8 with
 * no dedicated routine add nothing; an unrecognised newer platform is
 * reported through MISSING_CASE().
 */
static void
gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal)
{
	/* Newest platforms first; the order of the checks is significant. */
	if (IS_GEN(i915, 12))
		tgl_gt_workarounds_init(i915, wal);
	else if (IS_GEN(i915, 11))
		icl_gt_workarounds_init(i915, wal);
	else if (IS_CANNONLAKE(i915))
		cnl_gt_workarounds_init(i915, wal);
	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
		cfl_gt_workarounds_init(i915, wal);
	else if (IS_GEMINILAKE(i915))
		glk_gt_workarounds_init(i915, wal);
	else if (IS_KABYLAKE(i915))
		kbl_gt_workarounds_init(i915, wal);
	else if (IS_BROXTON(i915))
		bxt_gt_workarounds_init(i915, wal);
	else if (IS_SKYLAKE(i915))
		skl_gt_workarounds_init(i915, wal);
	else if (IS_HASWELL(i915))
		hsw_gt_workarounds_init(i915, wal);
	else if (IS_VALLEYVIEW(i915))
		vlv_gt_workarounds_init(i915, wal);
	else if (IS_IVYBRIDGE(i915))
		ivb_gt_workarounds_init(i915, wal);
	else if (IS_GEN(i915, 6))
		snb_gt_workarounds_init(i915, wal);
	else if (IS_GEN(i915, 5))
		ilk_gt_workarounds_init(i915, wal);
	else if (IS_G4X(i915))
		g4x_gt_workarounds_init(i915, wal);
	else if (IS_GEN(i915, 4))
		gen4_gt_workarounds_init(i915, wal);
	else if (INTEL_GEN(i915) > 8)
		MISSING_CASE(INTEL_GEN(i915));
	/* else: a gen <= 8 platform without GT workarounds, nothing to do */
}
1234
1235void intel_gt_init_workarounds(struct drm_i915_private *i915)
1236{
1237	struct i915_wa_list *wal = &i915->gt_wa_list;
1238
1239	wa_init_start(wal, "GT", "global");
1240	gt_init_workarounds(i915, wal);
1241	wa_init_finish(wal);
1242}
1243
1244static enum forcewake_domains
1245wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
1246{
1247	enum forcewake_domains fw = 0;
1248	struct i915_wa *wa;
1249	unsigned int i;
1250
1251	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1252		fw |= intel_uncore_forcewake_for_reg(uncore,
1253						     wa->reg,
1254						     FW_REG_READ |
1255						     FW_REG_WRITE);
1256
1257	return fw;
1258}
1259
1260static bool
1261wa_verify(const struct i915_wa *wa, u32 cur, const char *name, const char *from)
 
1262{
1263	if ((cur ^ wa->set) & wa->read) {
1264		DRM_ERROR("%s workaround lost on %s! (%x=%x/%x, expected %x)\n",
1265			  name, from, i915_mmio_reg_offset(wa->reg),
1266			  cur, cur & wa->read, wa->set);
 
1267
1268		return false;
1269	}
1270
1271	return true;
1272}
1273
1274static void
1275wa_list_apply(struct intel_uncore *uncore, const struct i915_wa_list *wal)
1276{
 
 
1277	enum forcewake_domains fw;
1278	unsigned long flags;
1279	struct i915_wa *wa;
1280	unsigned int i;
1281
1282	if (!wal->count)
1283		return;
1284
1285	fw = wal_get_fw_for_rmw(uncore, wal);
1286
1287	spin_lock_irqsave(&uncore->lock, flags);
 
1288	intel_uncore_forcewake_get__locked(uncore, fw);
1289
1290	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
 
 
 
1291		if (wa->clr)
1292			intel_uncore_rmw_fw(uncore, wa->reg, wa->clr, wa->set);
1293		else
1294			intel_uncore_write_fw(uncore, wa->reg, wa->set);
1295		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1296			wa_verify(wa,
1297				  intel_uncore_read_fw(uncore, wa->reg),
1298				  wal->name, "application");
 
 
 
 
 
 
 
 
 
 
 
1299	}
1300
1301	intel_uncore_forcewake_put__locked(uncore, fw);
1302	spin_unlock_irqrestore(&uncore->lock, flags);
 
1303}
1304
1305void intel_gt_apply_workarounds(struct intel_gt *gt)
1306{
1307	wa_list_apply(gt->uncore, &gt->i915->gt_wa_list);
1308}
1309
1310static bool wa_list_verify(struct intel_uncore *uncore,
1311			   const struct i915_wa_list *wal,
1312			   const char *from)
1313{
 
1314	struct i915_wa *wa;
 
 
1315	unsigned int i;
1316	bool ok = true;
1317
 
 
 
 
 
 
1318	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1319		ok &= wa_verify(wa,
1320				intel_uncore_read(uncore, wa->reg),
 
1321				wal->name, from);
1322
 
 
 
 
1323	return ok;
1324}
1325
1326bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1327{
1328	return wa_list_verify(gt->uncore, &gt->i915->gt_wa_list, from);
1329}
1330
1331static inline bool is_nonpriv_flags_valid(u32 flags)
 
1332{
1333	/* Check only valid flag bits are set */
1334	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1335		return false;
1336
1337	/* NB: Only 3 out of 4 enum values are valid for access field */
1338	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1339	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1340		return false;
1341
1342	return true;
1343}
1344
1345static void
1346whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1347{
1348	struct i915_wa wa = {
1349		.reg = reg
1350	};
1351
1352	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1353		return;
1354
1355	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1356		return;
1357
1358	wa.reg.reg |= flags;
1359	_wa_add(wal, &wa);
1360}
1361
1362static void
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1363whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1364{
1365	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1366}
1367
 
 
 
 
 
 
1368static void gen9_whitelist_build(struct i915_wa_list *w)
1369{
1370	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1371	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1372
1373	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1374	whitelist_reg(w, GEN8_CS_CHICKEN1);
1375
1376	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1377	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1378
1379	/* WaSendPushConstantsFromMMIO:skl,bxt */
1380	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1381}
1382
1383static void skl_whitelist_build(struct intel_engine_cs *engine)
1384{
1385	struct i915_wa_list *w = &engine->whitelist;
1386
1387	if (engine->class != RENDER_CLASS)
1388		return;
1389
1390	gen9_whitelist_build(w);
1391
1392	/* WaDisableLSQCROPERFforOCL:skl */
1393	whitelist_reg(w, GEN8_L3SQCREG4);
1394}
1395
1396static void bxt_whitelist_build(struct intel_engine_cs *engine)
1397{
1398	if (engine->class != RENDER_CLASS)
1399		return;
1400
1401	gen9_whitelist_build(&engine->whitelist);
1402}
1403
1404static void kbl_whitelist_build(struct intel_engine_cs *engine)
1405{
1406	struct i915_wa_list *w = &engine->whitelist;
1407
1408	if (engine->class != RENDER_CLASS)
1409		return;
1410
1411	gen9_whitelist_build(w);
1412
1413	/* WaDisableLSQCROPERFforOCL:kbl */
1414	whitelist_reg(w, GEN8_L3SQCREG4);
1415}
1416
1417static void glk_whitelist_build(struct intel_engine_cs *engine)
1418{
1419	struct i915_wa_list *w = &engine->whitelist;
1420
1421	if (engine->class != RENDER_CLASS)
1422		return;
1423
1424	gen9_whitelist_build(w);
1425
1426	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1427	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1428}
1429
1430static void cfl_whitelist_build(struct intel_engine_cs *engine)
1431{
1432	struct i915_wa_list *w = &engine->whitelist;
1433
1434	if (engine->class != RENDER_CLASS)
1435		return;
1436
1437	gen9_whitelist_build(w);
1438
1439	/*
1440	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1441	 *
1442	 * This covers 4 register which are next to one another :
1443	 *   - PS_INVOCATION_COUNT
1444	 *   - PS_INVOCATION_COUNT_UDW
1445	 *   - PS_DEPTH_COUNT
1446	 *   - PS_DEPTH_COUNT_UDW
1447	 */
1448	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1449			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1450			  RING_FORCE_TO_NONPRIV_RANGE_4);
1451}
1452
1453static void cml_whitelist_build(struct intel_engine_cs *engine)
1454{
1455	struct i915_wa_list *w = &engine->whitelist;
1456
1457	if (engine->class != RENDER_CLASS)
1458		whitelist_reg_ext(w,
1459				  RING_CTX_TIMESTAMP(engine->mmio_base),
1460				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1461
1462	cfl_whitelist_build(engine);
1463}
1464
1465static void cnl_whitelist_build(struct intel_engine_cs *engine)
1466{
1467	struct i915_wa_list *w = &engine->whitelist;
1468
1469	if (engine->class != RENDER_CLASS)
1470		return;
1471
1472	/* WaEnablePreemptionGranularityControlByUMD:cnl */
1473	whitelist_reg(w, GEN8_CS_CHICKEN1);
1474}
1475
1476static void icl_whitelist_build(struct intel_engine_cs *engine)
1477{
1478	struct i915_wa_list *w = &engine->whitelist;
1479
 
 
1480	switch (engine->class) {
1481	case RENDER_CLASS:
1482		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
1483		whitelist_reg(w, GEN9_HALF_SLICE_CHICKEN7);
1484
1485		/* WaAllowUMDToModifySamplerMode:icl */
1486		whitelist_reg(w, GEN10_SAMPLER_MODE);
1487
1488		/* WaEnableStateCacheRedirectToCS:icl */
1489		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1490
1491		/*
1492		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
1493		 *
1494		 * This covers 4 register which are next to one another :
1495		 *   - PS_INVOCATION_COUNT
1496		 *   - PS_INVOCATION_COUNT_UDW
1497		 *   - PS_DEPTH_COUNT
1498		 *   - PS_DEPTH_COUNT_UDW
1499		 */
1500		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1501				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1502				  RING_FORCE_TO_NONPRIV_RANGE_4);
1503		break;
1504
1505	case VIDEO_DECODE_CLASS:
1506		/* hucStatusRegOffset */
1507		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
1508				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1509		/* hucUKernelHdrInfoRegOffset */
1510		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
1511				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1512		/* hucStatus2RegOffset */
1513		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
1514				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1515		whitelist_reg_ext(w,
1516				  RING_CTX_TIMESTAMP(engine->mmio_base),
1517				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1518		break;
1519
1520	default:
1521		whitelist_reg_ext(w,
1522				  RING_CTX_TIMESTAMP(engine->mmio_base),
1523				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1524		break;
1525	}
1526}
1527
1528static void tgl_whitelist_build(struct intel_engine_cs *engine)
1529{
1530	struct i915_wa_list *w = &engine->whitelist;
1531
 
 
1532	switch (engine->class) {
1533	case RENDER_CLASS:
1534		/*
1535		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
1536		 * Wa_1408556865:tgl
1537		 *
1538		 * This covers 4 registers which are next to one another :
1539		 *   - PS_INVOCATION_COUNT
1540		 *   - PS_INVOCATION_COUNT_UDW
1541		 *   - PS_DEPTH_COUNT
1542		 *   - PS_DEPTH_COUNT_UDW
1543		 */
1544		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1545				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1546				  RING_FORCE_TO_NONPRIV_RANGE_4);
1547
1548		/* Wa_1808121037:tgl */
 
 
 
 
1549		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
1550
1551		/* Wa_1806527549:tgl */
1552		whitelist_reg(w, HIZ_CHICKEN);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1553		break;
1554	default:
1555		whitelist_reg_ext(w,
1556				  RING_CTX_TIMESTAMP(engine->mmio_base),
1557				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1558		break;
1559	}
1560}
1561
1562void intel_engine_init_whitelist(struct intel_engine_cs *engine)
1563{
1564	struct drm_i915_private *i915 = engine->i915;
1565	struct i915_wa_list *w = &engine->whitelist;
1566
1567	wa_init_start(w, "whitelist", engine->name);
1568
1569	if (IS_GEN(i915, 12))
 
 
 
 
 
 
1570		tgl_whitelist_build(engine);
1571	else if (IS_GEN(i915, 11))
1572		icl_whitelist_build(engine);
1573	else if (IS_CANNONLAKE(i915))
1574		cnl_whitelist_build(engine);
1575	else if (IS_COMETLAKE(i915))
1576		cml_whitelist_build(engine);
1577	else if (IS_COFFEELAKE(i915))
1578		cfl_whitelist_build(engine);
1579	else if (IS_GEMINILAKE(i915))
1580		glk_whitelist_build(engine);
1581	else if (IS_KABYLAKE(i915))
1582		kbl_whitelist_build(engine);
1583	else if (IS_BROXTON(i915))
1584		bxt_whitelist_build(engine);
1585	else if (IS_SKYLAKE(i915))
1586		skl_whitelist_build(engine);
1587	else if (INTEL_GEN(i915) <= 8)
1588		return;
1589	else
1590		MISSING_CASE(INTEL_GEN(i915));
1591
1592	wa_init_finish(w);
1593}
1594
1595void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
1596{
1597	const struct i915_wa_list *wal = &engine->whitelist;
1598	struct intel_uncore *uncore = engine->uncore;
1599	const u32 base = engine->mmio_base;
1600	struct i915_wa *wa;
1601	unsigned int i;
1602
1603	if (!wal->count)
1604		return;
1605
1606	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1607		intel_uncore_write(uncore,
1608				   RING_FORCE_TO_NONPRIV(base, i),
1609				   i915_mmio_reg_offset(wa->reg));
1610
1611	/* And clear the rest just in case of garbage */
1612	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
1613		intel_uncore_write(uncore,
1614				   RING_FORCE_TO_NONPRIV(base, i),
1615				   i915_mmio_reg_offset(RING_NOPID(base)));
1616}
1617
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1618static void
1619rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
1620{
1621	struct drm_i915_private *i915 = engine->i915;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1622
1623	if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) {
1624		/*
1625		 * Wa_1607138336:tgl
1626		 * Wa_1607063988:tgl
1627		 */
1628		wa_write_or(wal,
1629			    GEN9_CTX_PREEMPT_REG,
1630			    GEN12_DISABLE_POSH_BUSY_FF_DOP_CG);
 
 
 
 
 
 
 
 
1631
 
 
1632		/*
1633		 * Wa_1607030317:tgl
1634		 * Wa_1607186500:tgl
1635		 * Wa_1607297627:tgl there is 3 entries for this WA on BSpec, 2
1636		 * of then says it is fixed on B0 the other one says it is
1637		 * permanent
1638		 */
1639		wa_masked_en(wal,
1640			     GEN6_RC_SLEEP_PSMI_CONTROL,
1641			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
1642			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
 
 
 
 
 
1643
1644		/*
1645		 * Wa_1606679103:tgl
1646		 * (see also Wa_1606682166:icl)
 
 
1647		 */
1648		wa_write_or(wal,
1649			    GEN7_SARCHKMD,
1650			    GEN7_DISABLE_SAMPLER_PREFETCH);
1651
1652		/* Wa_1408615072:tgl */
1653		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1654			    VSUNIT_CLKGATE_DIS_TGL);
 
1655	}
1656
1657	if (IS_TIGERLAKE(i915)) {
1658		/* Wa_1606931601:tgl */
1659		wa_masked_en(wal, GEN7_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
 
 
1660
1661		/* Wa_1409804808:tgl */
1662		wa_masked_en(wal, GEN7_ROW_CHICKEN2,
1663			     GEN12_PUSH_CONST_DEREF_HOLD_DIS);
1664
1665		/* Wa_1606700617:tgl */
1666		wa_masked_en(wal,
1667			     GEN9_CS_DEBUG_MODE1,
1668			     FF_DOP_CLOCK_GATE_DISABLE);
1669
 
1670		/*
1671		 * Wa_1409085225:tgl
1672		 * Wa_14010229206:tgl
 
 
 
1673		 */
1674		wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
 
 
 
 
1675
 
1676		/*
1677		 * Wa_1407928979:tgl A*
1678		 * Wa_18011464164:tgl B0+
1679		 * Wa_22010931296:tgl B0+
1680		 */
1681		wa_write_or(wal, GEN7_FF_THREAD_MODE,
1682			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
1683	}
1684
1685	if (IS_GEN(i915, 11)) {
1686		/* This is not an Wa. Enable for better image quality */
1687		wa_masked_en(wal,
1688			     _3D_CHICKEN3,
1689			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
1690
1691		/* WaPipelineFlushCoherentLines:icl */
1692		wa_write_or(wal,
1693			    GEN8_L3SQCREG4,
1694			    GEN8_LQSC_FLUSH_COHERENT_LINES);
1695
1696		/*
1697		 * Wa_1405543622:icl
1698		 * Formerly known as WaGAPZPriorityScheme
1699		 */
1700		wa_write_or(wal,
1701			    GEN8_GARBCNTL,
1702			    GEN11_ARBITRATION_PRIO_ORDER_MASK);
1703
1704		/*
1705		 * Wa_1604223664:icl
1706		 * Formerly known as WaL3BankAddressHashing
1707		 */
1708		wa_write_masked_or(wal,
1709				   GEN8_GARBCNTL,
1710				   GEN11_HASH_CTRL_EXCL_MASK,
1711				   GEN11_HASH_CTRL_EXCL_BIT0);
1712		wa_write_masked_or(wal,
1713				   GEN11_GLBLINVL,
1714				   GEN11_BANK_HASH_ADDR_EXCL_MASK,
1715				   GEN11_BANK_HASH_ADDR_EXCL_BIT0);
1716
1717		/*
1718		 * Wa_1405733216:icl
1719		 * Formerly known as WaDisableCleanEvicts
1720		 */
1721		wa_write_or(wal,
1722			    GEN8_L3SQCREG4,
1723			    GEN11_LQSC_CLEAN_EVICT_DISABLE);
1724
1725		/* WaForwardProgressSoftReset:icl */
1726		wa_write_or(wal,
1727			    GEN10_SCRATCH_LNCF2,
1728			    PMFLUSHDONE_LNICRSDROP |
1729			    PMFLUSH_GAPL3UNBLOCK |
1730			    PMFLUSHDONE_LNEBLK);
1731
1732		/* Wa_1406609255:icl (pre-prod) */
1733		if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
1734			wa_write_or(wal,
1735				    GEN7_SARCHKMD,
1736				    GEN7_DISABLE_DEMAND_PREFETCH);
1737
1738		/* Wa_1606682166:icl */
1739		wa_write_or(wal,
1740			    GEN7_SARCHKMD,
1741			    GEN7_DISABLE_SAMPLER_PREFETCH);
1742
1743		/* Wa_1409178092:icl */
1744		wa_write_masked_or(wal,
1745				   GEN11_SCRATCH2,
1746				   GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
1747				   0);
1748
1749		/* WaEnable32PlaneMode:icl */
1750		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
1751			     GEN11_ENABLE_32_PLANE_MODE);
1752
1753		/*
1754		 * Wa_1408615072:icl,ehl  (vsunit)
1755		 * Wa_1407596294:icl,ehl  (hsunit)
1756		 */
1757		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1758			    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1759
1760		/* Wa_1407352427:icl,ehl */
1761		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1762			    PSDUNIT_CLKGATE_DIS);
1763
1764		/* Wa_1406680159:icl,ehl */
1765		wa_write_or(wal,
1766			    SUBSLICE_UNIT_LEVEL_CLKGATE,
1767			    GWUNIT_CLKGATE_DIS);
1768
1769		/*
1770		 * Wa_1408767742:icl[a2..forever],ehl[all]
1771		 * Wa_1605460711:icl[a0..c0]
1772		 */
1773		wa_write_or(wal,
1774			    GEN7_FF_THREAD_MODE,
1775			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
1776
1777		/* Wa_22010271021:ehl */
1778		if (IS_ELKHARTLAKE(i915))
1779			wa_masked_en(wal,
1780				     GEN9_CS_DEBUG_MODE1,
1781				     FF_DOP_CLOCK_GATE_DISABLE);
1782	}
1783
1784	if (IS_GEN_RANGE(i915, 9, 12)) {
1785		/* FtrPerCtxtPreemptionGranularityControl:skl,bxt,kbl,cfl,cnl,icl,tgl */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1786		wa_masked_en(wal,
1787			     GEN7_FF_SLICE_CS_CHICKEN1,
1788			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);
1789	}
1790
1791	if (IS_SKYLAKE(i915) ||
1792	    IS_KABYLAKE(i915) ||
1793	    IS_COFFEELAKE(i915) ||
1794	    IS_COMETLAKE(i915)) {
1795		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
1796		wa_write_or(wal,
1797			    GEN8_GARBCNTL,
1798			    GEN9_GAPS_TSV_CREDIT_DISABLE);
1799	}
1800
1801	if (IS_BROXTON(i915)) {
1802		/* WaDisablePooledEuLoadBalancingFix:bxt */
1803		wa_masked_en(wal,
1804			     FF_SLICE_CS_CHICKEN2,
1805			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
1806	}
1807
1808	if (IS_GEN(i915, 9)) {
1809		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
1810		wa_masked_en(wal,
1811			     GEN9_CSFE_CHICKEN1_RCS,
1812			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
1813
1814		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
1815		wa_write_or(wal,
1816			    BDW_SCRATCH1,
1817			    GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
1818
1819		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
1820		if (IS_GEN9_LP(i915))
1821			wa_write_masked_or(wal,
1822					   GEN8_L3SQCREG1,
1823					   L3_PRIO_CREDITS_MASK,
1824					   L3_GENERAL_PRIO_CREDITS(62) |
1825					   L3_HIGH_PRIO_CREDITS(2));
1826
1827		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
1828		wa_write_or(wal,
1829			    GEN8_L3SQCREG4,
1830			    GEN8_LQSC_FLUSH_COHERENT_LINES);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1831	}
1832
1833	if (IS_GEN(i915, 7))
1834		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
1835		wa_masked_en(wal,
1836			     GFX_MODE_GEN7,
1837			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
1838
1839	if (IS_GEN_RANGE(i915, 6, 7))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1840		/*
1841		 * We need to disable the AsyncFlip performance optimisations in
1842		 * order to use MI_WAIT_FOR_EVENT within the CS. It should
1843		 * already be programmed to '1' on all products.
1844		 *
1845		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
1846		 */
1847		wa_masked_en(wal,
1848			     MI_MODE,
1849			     ASYNC_FLIP_PERF_DISABLE);
1850
1851	if (IS_GEN(i915, 6)) {
1852		/*
1853		 * Required for the hardware to program scanline values for
1854		 * waiting
1855		 * WaEnableFlushTlbInvalidationMode:snb
1856		 */
1857		wa_masked_en(wal,
1858			     GFX_MODE,
1859			     GFX_TLB_INVALIDATE_EXPLICIT);
1860
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1861		/*
1862		 * From the Sandybridge PRM, volume 1 part 3, page 24:
1863		 * "If this bit is set, STCunit will have LRA as replacement
1864		 *  policy. [...] This bit must be reset. LRA replacement
1865		 *  policy is not supported."
1866		 */
1867		wa_masked_dis(wal,
1868			      CACHE_MODE_0,
1869			      CM0_STC_EVICT_DISABLE_LRA_SNB);
1870	}
1871
1872	if (IS_GEN_RANGE(i915, 4, 6))
1873		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
1874		wa_add(wal, MI_MODE,
1875		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
1876		       /* XXX bit doesn't stick on Broadwater */
1877		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH);
1878
1879	if (IS_GEN(i915, 4))
1880		/*
1881		 * Disable CONSTANT_BUFFER before it is loaded from the context
1882		 * image. For as it is loaded, it is executed and the stored
1883		 * address may no longer be valid, leading to a GPU hang.
1884		 *
1885		 * This imposes the requirement that userspace reload their
1886		 * CONSTANT_BUFFER on every batch, fortunately a requirement
1887		 * they are already accustomed to from before contexts were
1888		 * enabled.
1889		 */
1890		wa_add(wal, ECOSKPD,
1891		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
1892		       0 /* XXX bit doesn't stick on Broadwater */);
 
1893}
1894
1895static void
1896xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
1897{
1898	struct drm_i915_private *i915 = engine->i915;
1899
1900	/* WaKBLVECSSemaphoreWaitPoll:kbl */
1901	if (IS_KBL_REVID(i915, KBL_REVID_A0, KBL_REVID_E0)) {
1902		wa_write(wal,
1903			 RING_SEMA_WAIT_POLL(engine->mmio_base),
1904			 1);
1905	}
 
 
 
 
 
1906}
1907
1908static void
1909engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1910{
1911	if (I915_SELFTEST_ONLY(INTEL_GEN(engine->i915) < 4))
1912		return;
 
 
 
 
 
 
 
 
 
 
 
1913
1914	if (engine->class == RENDER_CLASS)
1915		rcs_engine_wa_init(engine, wal);
1916	else
1917		xcs_engine_wa_init(engine, wal);
1918}
1919
1920void intel_engine_init_workarounds(struct intel_engine_cs *engine)
1921{
1922	struct i915_wa_list *wal = &engine->wa_list;
 
1923
1924	if (INTEL_GEN(engine->i915) < 4)
1925		return;
1926
1927	wa_init_start(wal, "engine", engine->name);
1928	engine_init_workarounds(engine, wal);
1929	wa_init_finish(wal);
 
 
 
 
 
 
 
 
 
 
 
1930}
1931
1932void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
 
 
 
 
 
 
 
 
 
 
1933{
1934	wa_list_apply(engine->uncore, &engine->wa_list);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1935}
1936
1937static struct i915_vma *
1938create_scratch(struct i915_address_space *vm, int count)
1939{
1940	struct drm_i915_gem_object *obj;
1941	struct i915_vma *vma;
1942	unsigned int size;
1943	int err;
1944
1945	size = round_up(count * sizeof(u32), PAGE_SIZE);
1946	obj = i915_gem_object_create_internal(vm->i915, size);
1947	if (IS_ERR(obj))
1948		return ERR_CAST(obj);
1949
1950	i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC);
 
 
 
 
 
 
 
 
1951
1952	vma = i915_vma_instance(obj, vm, NULL);
1953	if (IS_ERR(vma)) {
1954		err = PTR_ERR(vma);
1955		goto err_obj;
1956	}
 
 
1957
1958	err = i915_vma_pin(vma, 0, 0,
1959			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
1960	if (err)
1961		goto err_obj;
1962
1963	return vma;
 
 
 
1964
1965err_obj:
1966	i915_gem_object_put(obj);
1967	return ERR_PTR(err);
1968}
1969
1970static const struct {
1971	u32 start;
1972	u32 end;
1973} mcr_ranges_gen8[] = {
1974	{ .start = 0x5500, .end = 0x55ff },
1975	{ .start = 0x7000, .end = 0x7fff },
1976	{ .start = 0x9400, .end = 0x97ff },
1977	{ .start = 0xb000, .end = 0xb3ff },
1978	{ .start = 0xe000, .end = 0xe7ff },
1979	{},
1980};
1981
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1982static bool mcr_range(struct drm_i915_private *i915, u32 offset)
1983{
 
1984	int i;
1985
1986	if (INTEL_GEN(i915) < 8)
 
 
 
 
 
 
1987		return false;
1988
1989	/*
1990	 * Registers in these ranges are affected by the MCR selector
1991	 * which only controls CPU initiated MMIO. Routing does not
1992	 * work for CS access so we cannot verify them on this path.
1993	 */
1994	for (i = 0; mcr_ranges_gen8[i].start; i++)
1995		if (offset >= mcr_ranges_gen8[i].start &&
1996		    offset <= mcr_ranges_gen8[i].end)
1997			return true;
1998
1999	return false;
2000}
2001
2002static int
2003wa_list_srm(struct i915_request *rq,
2004	    const struct i915_wa_list *wal,
2005	    struct i915_vma *vma)
2006{
2007	struct drm_i915_private *i915 = rq->engine->i915;
2008	unsigned int i, count = 0;
2009	const struct i915_wa *wa;
2010	u32 srm, *cs;
2011
2012	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
2013	if (INTEL_GEN(i915) >= 8)
2014		srm++;
2015
2016	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2017		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
2018			count++;
2019	}
2020
2021	cs = intel_ring_begin(rq, 4 * count);
2022	if (IS_ERR(cs))
2023		return PTR_ERR(cs);
2024
2025	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2026		u32 offset = i915_mmio_reg_offset(wa->reg);
2027
2028		if (mcr_range(i915, offset))
2029			continue;
2030
2031		*cs++ = srm;
2032		*cs++ = offset;
2033		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
2034		*cs++ = 0;
2035	}
2036	intel_ring_advance(rq, cs);
2037
2038	return 0;
2039}
2040
2041static int engine_wa_list_verify(struct intel_context *ce,
2042				 const struct i915_wa_list * const wal,
2043				 const char *from)
2044{
2045	const struct i915_wa *wa;
2046	struct i915_request *rq;
2047	struct i915_vma *vma;
 
2048	unsigned int i;
2049	u32 *results;
2050	int err;
2051
2052	if (!wal->count)
2053		return 0;
2054
2055	vma = create_scratch(&ce->engine->gt->ggtt->vm, wal->count);
 
2056	if (IS_ERR(vma))
2057		return PTR_ERR(vma);
2058
2059	intel_engine_pm_get(ce->engine);
2060	rq = intel_context_create_request(ce);
2061	intel_engine_pm_put(ce->engine);
 
 
 
 
 
 
 
 
 
 
 
 
2062	if (IS_ERR(rq)) {
2063		err = PTR_ERR(rq);
2064		goto err_vma;
2065	}
2066
2067	i915_vma_lock(vma);
2068	err = i915_request_await_object(rq, vma->obj, true);
2069	if (err == 0)
2070		err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
2071	i915_vma_unlock(vma);
2072	if (err) {
2073		i915_request_add(rq);
2074		goto err_vma;
2075	}
2076
2077	err = wa_list_srm(rq, wal, vma);
2078	if (err)
2079		goto err_vma;
2080
2081	i915_request_get(rq);
2082	i915_request_add(rq);
2083	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
2084		err = -ETIME;
2085		goto err_rq;
2086	}
2087
2088	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
2089	if (IS_ERR(results)) {
2090		err = PTR_ERR(results);
2091		goto err_rq;
2092	}
2093
2094	err = 0;
2095	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2096		if (mcr_range(rq->engine->i915, i915_mmio_reg_offset(wa->reg)))
2097			continue;
2098
2099		if (!wa_verify(wa, results[i], wal->name, from))
2100			err = -ENXIO;
2101	}
2102
2103	i915_gem_object_unpin_map(vma->obj);
2104
2105err_rq:
2106	i915_request_put(rq);
2107err_vma:
2108	i915_vma_unpin(vma);
 
 
 
 
 
 
 
 
 
 
2109	i915_vma_put(vma);
2110	return err;
2111}
2112
2113int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
2114				    const char *from)
2115{
2116	return engine_wa_list_verify(engine->kernel_context,
2117				     &engine->wa_list,
2118				     from);
2119}
2120
2121#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2122#include "selftest_workarounds.c"
2123#endif