   1/*
   2 * SPDX-License-Identifier: MIT
   3 *
   4 * Copyright © 2014-2018 Intel Corporation
   5 */
   6
   7#include "i915_drv.h"
   8#include "intel_context.h"
   9#include "intel_engine_pm.h"
  10#include "intel_gt.h"
  11#include "intel_ring.h"
  12#include "intel_workarounds.h"
  13
  14/**
  15 * DOC: Hardware workarounds
  16 *
  17 * This file is intended as a central place to implement most [1]_ of the
  18 * required workarounds for hardware to work as originally intended. They fall
  19 * in five basic categories depending on how/when they are applied:
  20 *
  21 * - Workarounds that touch registers that are saved/restored to/from the HW
  22 *   context image. The list is emitted (via Load Register Immediate commands)
   23 *   every time a new context is created.
  24 * - GT workarounds. The list of these WAs is applied whenever these registers
  25 *   revert to default values (on GPU reset, suspend/resume [2]_, etc..).
  26 * - Display workarounds. The list is applied during display clock-gating
  27 *   initialization.
  28 * - Workarounds that whitelist a privileged register, so that UMDs can manage
   29 *   them directly. This is just a special case of an MMIO workaround (as we
   30 *   write the list of these to-be-whitelisted registers to some special HW
  31 *   registers).
  32 * - Workaround batchbuffers, that get executed automatically by the hardware
  33 *   on every HW context restore.
  34 *
   35 * .. [1] Please note that there are other WAs that, due to their nature,
  36 *    cannot be applied from a central place. Those are peppered around the rest
  37 *    of the code, as needed.
  38 *
   39 * .. [2] Technically, some registers are power-context saved & restored, so they
  40 *    survive a suspend/resume. In practice, writing them again is not too
  41 *    costly and simplifies things. We can revisit this in the future.
  42 *
  43 * Layout
  44 * ~~~~~~
  45 *
  46 * Keep things in this file ordered by WA type, as per the above (context, GT,
  47 * display, register whitelist, batchbuffer). Then, inside each type, keep the
  48 * following order:
  49 *
  50 * - Infrastructure functions and macros
  51 * - WAs per platform in standard gen/chrono order
  52 * - Public functions to init or apply the given workaround type.
  53 */
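/*
 * A minimal usage sketch (illustrative only; the specific register and
 * bit below are borrowed from later in this file, the flow itself is
 * the point): a list is built once and then replayed whenever needed.
 *
 *	wa_init_start(wal, "context", engine->name);
 *	wa_masked_en(wal, GEN7_ROW_CHICKEN2, DOP_CLOCK_GATING_DISABLE);
 *	wa_init_finish(wal);
 *
 * Context lists are later emitted into the ring by
 * intel_engine_emit_ctx_wa(), GT lists are written via MMIO by
 * wa_list_apply(), and whitelists are programmed by
 * intel_engine_apply_whitelist().
 */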
  54
  55static void wa_init_start(struct i915_wa_list *wal, const char *name, const char *engine_name)
  56{
  57	wal->name = name;
  58	wal->engine_name = engine_name;
  59}
  60
  61#define WA_LIST_CHUNK (1 << 4)
  62
  63static void wa_init_finish(struct i915_wa_list *wal)
  64{
  65	/* Trim unused entries. */
  66	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
  67		struct i915_wa *list = kmemdup(wal->list,
  68					       wal->count * sizeof(*list),
  69					       GFP_KERNEL);
  70
  71		if (list) {
  72			kfree(wal->list);
  73			wal->list = list;
  74		}
  75	}
  76
  77	if (!wal->count)
  78		return;
  79
  80	DRM_DEBUG_DRIVER("Initialized %u %s workarounds on %s\n",
  81			 wal->wa_count, wal->name, wal->engine_name);
  82}
  83
  84static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
  85{
  86	unsigned int addr = i915_mmio_reg_offset(wa->reg);
  87	unsigned int start = 0, end = wal->count;
  88	const unsigned int grow = WA_LIST_CHUNK;
  89	struct i915_wa *wa_;
  90
  91	GEM_BUG_ON(!is_power_of_2(grow));
  92
  93	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
  94		struct i915_wa *list;
  95
  96		list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
  97				     GFP_KERNEL);
  98		if (!list) {
  99			DRM_ERROR("No space for workaround init!\n");
 100			return;
 101		}
 102
 103		if (wal->list)
 104			memcpy(list, wal->list, sizeof(*wa) * wal->count);
 105
 106		wal->list = list;
 107	}
 108
 109	while (start < end) {
 110		unsigned int mid = start + (end - start) / 2;
 111
 112		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
 113			start = mid + 1;
 114		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
 115			end = mid;
 116		} else {
 117			wa_ = &wal->list[mid];
 118
 119			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
 120				DRM_ERROR("Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
 121					  i915_mmio_reg_offset(wa_->reg),
 122					  wa_->clr, wa_->set);
 123
 124				wa_->set &= ~wa->clr;
 125			}
 126
 127			wal->wa_count++;
 128			wa_->set |= wa->set;
 129			wa_->clr |= wa->clr;
 130			wa_->read |= wa->read;
 131			return;
 132		}
 133	}
 134
 135	wal->wa_count++;
 136	wa_ = &wal->list[wal->count++];
 137	*wa_ = *wa;
 138
 139	while (wa_-- > wal->list) {
 140		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
 141			   i915_mmio_reg_offset(wa_[1].reg));
 142		if (i915_mmio_reg_offset(wa_[1].reg) >
 143		    i915_mmio_reg_offset(wa_[0].reg))
 144			break;
 145
 146		swap(wa_[1], wa_[0]);
 147	}
 148}
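/*
 * To illustrate the invariant maintained above (offsets are
 * hypothetical): the list is kept sorted by mmio offset, so adding
 * 0x7300, then 0x7004, then 0x7300 again yields two entries ordered as
 *
 *	{ .reg = 0x7004, ... }, { .reg = 0x7300, ... }
 *
 * with the duplicate 0x7300 merged into its existing entry (set, clr
 * and read masks OR'ed together) rather than appended. The binary
 * search finds an existing entry; the trailing swap loop bubbles a
 * newly appended entry back into its sorted position.
 */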
 149
 150static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
 151		   u32 clear, u32 set, u32 read_mask)
 152{
 153	struct i915_wa wa = {
 154		.reg  = reg,
 155		.clr  = clear,
 156		.set  = set,
 157		.read = read_mask,
 158	};
 159
 160	_wa_add(wal, &wa);
 161}
 162
 163static void
 164wa_write_masked_or(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
 165{
 166	wa_add(wal, reg, clear, set, clear);
 167}
 168
 169static void
 170wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 171{
 172	wa_write_masked_or(wal, reg, ~0, set);
 173}
 174
 175static void
 176wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 177{
 178	wa_write_masked_or(wal, reg, set, set);
 179}
 180
 181static void
 182wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
 183{
 184	wa_write_masked_or(wal, reg, clr, 0);
 185}
 186
 187static void
 188wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 189{
 190	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val);
 191}
 192
 193static void
 194wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 195{
 196	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val);
 197}
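/*
 * The wa_masked_*() helpers are for "masked" registers, where the
 * upper 16 bits of a write select which of the lower 16 bits actually
 * change. As a sketch (macros from i915_reg.h):
 *
 *	wa_masked_en(wal, reg, BIT(0));
 *
 * records a value of _MASKED_BIT_ENABLE(BIT(0)) == 0x00010001, i.e.
 * "update bit 0, set it to 1", while wa_masked_dis() records
 * _MASKED_BIT_DISABLE(BIT(0)) == 0x00010000. Only the low 16 bits read
 * back, which is why the read mask handed to wa_add() is the plain,
 * unshifted value.
 */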
 198
 199#define WA_SET_BIT_MASKED(addr, mask) \
 200	wa_masked_en(wal, (addr), (mask))
 201
 202#define WA_CLR_BIT_MASKED(addr, mask) \
 203	wa_masked_dis(wal, (addr), (mask))
 204
 205#define WA_SET_FIELD_MASKED(addr, mask, value) \
 206	wa_write_masked_or(wal, (addr), 0, _MASKED_FIELD((mask), (value)))
 207
 208static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
 209				      struct i915_wa_list *wal)
 210{
 211	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
 212}
 213
 214static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
 215				      struct i915_wa_list *wal)
 216{
 217	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
 218}
 219
 220static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
 221				      struct i915_wa_list *wal)
 222{
 223	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
 224
 225	/* WaDisableAsyncFlipPerfMode:bdw,chv */
 226	WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE);
 227
 228	/* WaDisablePartialInstShootdown:bdw,chv */
 229	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
 230			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 231
 232	/* Use Force Non-Coherent whenever executing a 3D context. This is a
  233	 * workaround for a possible hang in the unlikely event a TLB
 234	 * invalidation occurs during a PSD flush.
 235	 */
 236	/* WaForceEnableNonCoherent:bdw,chv */
 237	/* WaHdcDisableFetchWhenMasked:bdw,chv */
 238	WA_SET_BIT_MASKED(HDC_CHICKEN0,
 239			  HDC_DONOT_FETCH_MEM_WHEN_MASKED |
 240			  HDC_FORCE_NON_COHERENT);
 241
 242	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
 243	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
 244	 *  polygons in the same 8x4 pixel/sample area to be processed without
 245	 *  stalling waiting for the earlier ones to write to Hierarchical Z
 246	 *  buffer."
 247	 *
 248	 * This optimization is off by default for BDW and CHV; turn it on.
 249	 */
 250	WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
 251
 252	/* Wa4x4STCOptimizationDisable:bdw,chv */
 253	WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
 254
 255	/*
 256	 * BSpec recommends 8x4 when MSAA is used,
 257	 * however in practice 16x4 seems fastest.
 258	 *
 259	 * Note that PS/WM thread counts depend on the WIZ hashing
 260	 * disable bit, which we don't touch here, but it's good
 261	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 262	 */
 263	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
 264			    GEN6_WIZ_HASHING_MASK,
 265			    GEN6_WIZ_HASHING_16x4);
 266}
 267
 268static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
 269				     struct i915_wa_list *wal)
 270{
 271	struct drm_i915_private *i915 = engine->i915;
 272
 273	gen8_ctx_workarounds_init(engine, wal);
 274
 275	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
 276	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 277
 278	/* WaDisableDopClockGating:bdw
 279	 *
 280	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
 281	 * to disable EUTC clock gating.
 282	 */
 283	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
 284			  DOP_CLOCK_GATING_DISABLE);
 285
 286	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
 287			  GEN8_SAMPLER_POWER_BYPASS_DIS);
 288
 289	WA_SET_BIT_MASKED(HDC_CHICKEN0,
 290			  /* WaForceContextSaveRestoreNonCoherent:bdw */
 291			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 292			  /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
 293			  (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
 294}
 295
 296static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
 297				     struct i915_wa_list *wal)
 298{
 299	gen8_ctx_workarounds_init(engine, wal);
 300
 301	/* WaDisableThreadStallDopClockGating:chv */
 302	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 303
 304	/* Improve HiZ throughput on CHV. */
 305	WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
 306}
 307
 308static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
 309				      struct i915_wa_list *wal)
 310{
 311	struct drm_i915_private *i915 = engine->i915;
 312
 313	if (HAS_LLC(i915)) {
 314		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
 315		 *
 316		 * Must match Display Engine. See
 317		 * WaCompressedResourceDisplayNewHashMode.
 318		 */
 319		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 320				  GEN9_PBE_COMPRESSED_HASH_SELECTION);
 321		WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
 322				  GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
 323	}
 324
 325	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
 326	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
 327	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
 328			  FLOW_CONTROL_ENABLE |
 329			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 330
 331	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
 332	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
 333	WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
 334			  GEN9_ENABLE_YV12_BUGFIX |
 335			  GEN9_ENABLE_GPGPU_PREEMPTION);
 336
 337	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
 338	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
 339	WA_SET_BIT_MASKED(CACHE_MODE_1,
 340			  GEN8_4x4_STC_OPTIMIZATION_DISABLE |
 341			  GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
 342
 343	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
 344	WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
 345			  GEN9_CCS_TLB_PREFETCH_ENABLE);
 346
 347	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
 348	WA_SET_BIT_MASKED(HDC_CHICKEN0,
 349			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 350			  HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
 351
 352	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
 353	 * both tied to WaForceContextSaveRestoreNonCoherent
 354	 * in some hsds for skl. We keep the tie for all gen9. The
 355	 * documentation is a bit hazy and so we want to get common behaviour,
 356	 * even though there is no clear evidence we would need both on kbl/bxt.
 357	 * This area has been source of system hangs so we play it safe
 358	 * and mimic the skl regardless of what bspec says.
 359	 *
 360	 * Use Force Non-Coherent whenever executing a 3D context. This
 361	 * is a workaround for a possible hang in the unlikely event
 362	 * a TLB invalidation occurs during a PSD flush.
 363	 */
 364
 365	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
 366	WA_SET_BIT_MASKED(HDC_CHICKEN0,
 367			  HDC_FORCE_NON_COHERENT);
 368
 369	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
 370	if (IS_SKYLAKE(i915) ||
 371	    IS_KABYLAKE(i915) ||
 372	    IS_COFFEELAKE(i915) ||
 373	    IS_COMETLAKE(i915))
 374		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
 375				  GEN8_SAMPLER_POWER_BYPASS_DIS);
 376
 377	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
 378	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
 379
 380	/*
 381	 * Supporting preemption with fine-granularity requires changes in the
 382	 * batch buffer programming. Since we can't break old userspace, we
 383	 * need to set our default preemption level to safe value. Userspace is
 384	 * still able to use more fine-grained preemption levels, since in
 385	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
 386	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
 387	 * not real HW workarounds, but merely a way to start using preemption
 388	 * while maintaining old contract with userspace.
 389	 */
 390
 391	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
 392	WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
 393
  394	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
 395	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
 396			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 397			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 398
 399	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
 400	if (IS_GEN9_LP(i915))
 401		WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
 402}
 403
 404static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
 405				struct i915_wa_list *wal)
 406{
 407	struct intel_gt *gt = engine->gt;
 408	u8 vals[3] = { 0, 0, 0 };
 409	unsigned int i;
 410
 411	for (i = 0; i < 3; i++) {
 412		u8 ss;
 413
 414		/*
 415		 * Only consider slices where one, and only one, subslice has 7
 416		 * EUs
 417		 */
 418		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
 419			continue;
 420
 421		/*
 422		 * subslice_7eu[i] != 0 (because of the check above) and
 423		 * ss_max == 4 (maximum number of subslices possible per slice)
 424		 *
 425		 * ->    0 <= ss <= 3;
 426		 */
 427		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
 428		vals[i] = 3 - ss;
 429	}
 430
 431	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
 432		return;
 433
 434	/* Tune IZ hashing. See intel_device_info_runtime_init() */
 435	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
 436			    GEN9_IZ_HASHING_MASK(2) |
 437			    GEN9_IZ_HASHING_MASK(1) |
 438			    GEN9_IZ_HASHING_MASK(0),
 439			    GEN9_IZ_HASHING(2, vals[2]) |
 440			    GEN9_IZ_HASHING(1, vals[1]) |
 441			    GEN9_IZ_HASHING(0, vals[0]));
 442}
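/*
 * Worked example for the tuning above (fuse values are hypothetical):
 * with subslice_7eu[0] == 0b0100, only subslice 2 of slice 0 has 7 EUs,
 * so ss = ffs(0b0100) - 1 = 2, vals[0] = 3 - ss = 1 and the
 * GEN9_IZ_HASHING(0, 1) field is programmed accordingly. Slices where
 * zero or more than one subslice has 7 EUs fail the is_power_of_2()
 * check and keep the default hashing.
 */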
 443
 444static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
 445				     struct i915_wa_list *wal)
 446{
 447	gen9_ctx_workarounds_init(engine, wal);
 448	skl_tune_iz_hashing(engine, wal);
 449}
 450
 451static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
 452				     struct i915_wa_list *wal)
 453{
 454	gen9_ctx_workarounds_init(engine, wal);
 455
 456	/* WaDisableThreadStallDopClockGating:bxt */
 457	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
 458			  STALL_DOP_GATING_DISABLE);
 459
 460	/* WaToEnableHwFixForPushConstHWBug:bxt */
 461	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 462			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 463}
 464
 465static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
 466				     struct i915_wa_list *wal)
 467{
 468	struct drm_i915_private *i915 = engine->i915;
 469
 470	gen9_ctx_workarounds_init(engine, wal);
 471
 472	/* WaToEnableHwFixForPushConstHWBug:kbl */
 473	if (IS_KBL_REVID(i915, KBL_REVID_C0, REVID_FOREVER))
 474		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 475				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 476
 477	/* WaDisableSbeCacheDispatchPortSharing:kbl */
 478	WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1,
 479			  GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 480}
 481
 482static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
 483				     struct i915_wa_list *wal)
 484{
 485	gen9_ctx_workarounds_init(engine, wal);
 486
 487	/* WaToEnableHwFixForPushConstHWBug:glk */
 488	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 489			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 490}
 491
 492static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
 493				     struct i915_wa_list *wal)
 494{
 495	gen9_ctx_workarounds_init(engine, wal);
 496
 497	/* WaToEnableHwFixForPushConstHWBug:cfl */
 498	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 499			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 500
 501	/* WaDisableSbeCacheDispatchPortSharing:cfl */
 502	WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1,
 503			  GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 504}
 505
 506static void cnl_ctx_workarounds_init(struct intel_engine_cs *engine,
 507				     struct i915_wa_list *wal)
 508{
 509	/* WaForceContextSaveRestoreNonCoherent:cnl */
 510	WA_SET_BIT_MASKED(CNL_HDC_CHICKEN0,
 511			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT);
 512
 513	/* WaDisableReplayBufferBankArbitrationOptimization:cnl */
 514	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 515			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
 516
 517	/* WaPushConstantDereferenceHoldDisable:cnl */
 518	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, PUSH_CONSTANT_DEREF_DISABLE);
 519
 520	/* FtrEnableFastAnisoL1BankingFix:cnl */
 521	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, CNL_FAST_ANISO_L1_BANKING_FIX);
 522
 523	/* WaDisable3DMidCmdPreemption:cnl */
 524	WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
 525
 526	/* WaDisableGPGPUMidCmdPreemption:cnl */
 527	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
 528			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 529			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 530
 531	/* WaDisableEarlyEOT:cnl */
 532	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, DISABLE_EARLY_EOT);
 533}
 534
 535static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
 536				     struct i915_wa_list *wal)
 537{
 538	struct drm_i915_private *i915 = engine->i915;
 539
 540	/* WaDisableBankHangMode:icl */
 541	wa_write(wal,
 542		 GEN8_L3CNTLREG,
 543		 intel_uncore_read(engine->uncore, GEN8_L3CNTLREG) |
 544		 GEN8_ERRDETBCTRL);
 545
 546	/* Wa_1604370585:icl (pre-prod)
 547	 * Formerly known as WaPushConstantDereferenceHoldDisable
 548	 */
 549	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
 550		WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
 551				  PUSH_CONSTANT_DEREF_DISABLE);
 552
 553	/* WaForceEnableNonCoherent:icl
 554	 * This is not the same workaround as in early Gen9 platforms, where
 555	 * lacking this could cause system hangs, but coherency performance
 556	 * overhead is high and only a few compute workloads really need it
 557	 * (the register is whitelisted in hardware now, so UMDs can opt in
 558	 * for coherency if they have a good reason).
 559	 */
 560	WA_SET_BIT_MASKED(ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
 561
 562	/* Wa_2006611047:icl (pre-prod)
 563	 * Formerly known as WaDisableImprovedTdlClkGating
 564	 */
 565	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
 566		WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
 567				  GEN11_TDL_CLOCK_GATING_FIX_DISABLE);
 568
 569	/* Wa_2006665173:icl (pre-prod) */
 570	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
 571		WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3,
 572				  GEN11_BLEND_EMB_FIX_DISABLE_IN_RCC);
 573
 574	/* WaEnableFloatBlendOptimization:icl */
 575	wa_write_masked_or(wal,
 576			   GEN10_CACHE_MODE_SS,
 577			   0, /* write-only, so skip validation */
 578			   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE));
 579
 580	/* WaDisableGPGPUMidThreadPreemption:icl */
 581	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
 582			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 583			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 584
 585	/* allow headerless messages for preemptible GPGPU context */
 586	WA_SET_BIT_MASKED(GEN10_SAMPLER_MODE,
 587			  GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
 588
 589	/* Wa_1604278689:icl,ehl */
 590	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
 591	wa_write_masked_or(wal, IVB_FBC_RT_BASE_UPPER,
 592			   0, /* write-only register; skip validation */
 593			   0xFFFFFFFF);
 594
 595	/* Wa_1406306137:icl,ehl */
 596	wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
 597}
 598
 599static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine,
 600				     struct i915_wa_list *wal)
 601{
 602	/*
 603	 * Wa_1409142259:tgl
 604	 * Wa_1409347922:tgl
 605	 * Wa_1409252684:tgl
 606	 * Wa_1409217633:tgl
 607	 * Wa_1409207793:tgl
 608	 * Wa_1409178076:tgl
 609	 * Wa_1408979724:tgl
 610	 */
 611	WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3,
 612			  GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
 613
 614	/*
 615	 * Wa_1604555607:gen12 and Wa_1608008084:gen12
 616	 * FF_MODE2 register will return the wrong value when read. The default
 617	 * value for this register is zero for all fields and there are no bit
 618	 * masks. So instead of doing a RMW we should just write the GS Timer
 619	 * and TDS timer values for Wa_1604555607 and Wa_16011163337.
 620	 */
 621	wa_add(wal,
 622	       FF_MODE2,
 623	       FF_MODE2_GS_TIMER_MASK | FF_MODE2_TDS_TIMER_MASK,
 624	       FF_MODE2_GS_TIMER_224  | FF_MODE2_TDS_TIMER_128,
 625	       0);
 626
 627	/* WaDisableGPGPUMidThreadPreemption:tgl */
 628	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
 629			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
 630			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
 631}
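/*
 * Note that the wa_add() above passes a zero read mask, so wa_verify()
 * will never flag FF_MODE2 as lost; that is intentional, since the
 * register does not read back the value written. The same pattern is
 * used elsewhere in this file for write-only registers.
 */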
 632
 633static void
 634__intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
 635			   struct i915_wa_list *wal,
 636			   const char *name)
 637{
 638	struct drm_i915_private *i915 = engine->i915;
 639
 640	if (engine->class != RENDER_CLASS)
 641		return;
 642
 643	wa_init_start(wal, name, engine->name);
 644
 645	if (IS_GEN(i915, 12))
 646		tgl_ctx_workarounds_init(engine, wal);
 647	else if (IS_GEN(i915, 11))
 648		icl_ctx_workarounds_init(engine, wal);
 649	else if (IS_CANNONLAKE(i915))
 650		cnl_ctx_workarounds_init(engine, wal);
 651	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
 652		cfl_ctx_workarounds_init(engine, wal);
 653	else if (IS_GEMINILAKE(i915))
 654		glk_ctx_workarounds_init(engine, wal);
 655	else if (IS_KABYLAKE(i915))
 656		kbl_ctx_workarounds_init(engine, wal);
 657	else if (IS_BROXTON(i915))
 658		bxt_ctx_workarounds_init(engine, wal);
 659	else if (IS_SKYLAKE(i915))
 660		skl_ctx_workarounds_init(engine, wal);
 661	else if (IS_CHERRYVIEW(i915))
 662		chv_ctx_workarounds_init(engine, wal);
 663	else if (IS_BROADWELL(i915))
 664		bdw_ctx_workarounds_init(engine, wal);
 665	else if (IS_GEN(i915, 7))
 666		gen7_ctx_workarounds_init(engine, wal);
 667	else if (IS_GEN(i915, 6))
 668		gen6_ctx_workarounds_init(engine, wal);
 669	else if (INTEL_GEN(i915) < 8)
 670		return;
 671	else
 672		MISSING_CASE(INTEL_GEN(i915));
 673
 674	wa_init_finish(wal);
 675}
 676
 677void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
 678{
 679	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
 680}
 681
 682int intel_engine_emit_ctx_wa(struct i915_request *rq)
 683{
 684	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
 685	struct i915_wa *wa;
 686	unsigned int i;
 687	u32 *cs;
 688	int ret;
 689
 690	if (wal->count == 0)
 691		return 0;
 692
 693	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
 694	if (ret)
 695		return ret;
 696
 697	cs = intel_ring_begin(rq, (wal->count * 2 + 2));
 698	if (IS_ERR(cs))
 699		return PTR_ERR(cs);
 700
 701	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
 702	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
 703		*cs++ = i915_mmio_reg_offset(wa->reg);
 704		*cs++ = wa->set;
 705	}
 706	*cs++ = MI_NOOP;
 707
 708	intel_ring_advance(rq, cs);
 709
 710	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
 711	if (ret)
 712		return ret;
 713
 714	return 0;
 715}
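/*
 * The emission above produces a single Load Register Immediate packet,
 * roughly laid out as (contents illustrative):
 *
 *	MI_LOAD_REGISTER_IMM(count)
 *	<offset[0]>  <value[0]>
 *	...
 *	<offset[count - 1]>  <value[count - 1]>
 *	MI_NOOP
 *
 * hence the count * 2 + 2 dwords reserved via intel_ring_begin(): two
 * dwords per workaround, plus the LRI header and a trailing NOOP to
 * keep the emitted length even.
 */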
 716
 717static void
 718gen4_gt_workarounds_init(struct drm_i915_private *i915,
 719			 struct i915_wa_list *wal)
 720{
 721	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
 722	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
 723}
 724
 725static void
 726g4x_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 727{
 728	gen4_gt_workarounds_init(i915, wal);
 729
 730	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
 731	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
 732}
 733
 734static void
 735ilk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 736{
 737	g4x_gt_workarounds_init(i915, wal);
 738
 739	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
 740}
 741
 742static void
 743snb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 744{
 745	/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
 746	wa_masked_en(wal,
 747		     _3D_CHICKEN,
 748		     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
 749
 750	/* WaDisable_RenderCache_OperationalFlush:snb */
 751	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
 752
 753	/*
 754	 * BSpec recommends 8x4 when MSAA is used,
 755	 * however in practice 16x4 seems fastest.
 756	 *
 757	 * Note that PS/WM thread counts depend on the WIZ hashing
 758	 * disable bit, which we don't touch here, but it's good
 759	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 760	 */
 761	wa_add(wal,
 762	       GEN6_GT_MODE, 0,
 763	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
 764	       GEN6_WIZ_HASHING_16x4);
 765
 766	wa_masked_dis(wal, CACHE_MODE_0, CM0_STC_EVICT_DISABLE_LRA_SNB);
 767
 768	wa_masked_en(wal,
 769		     _3D_CHICKEN3,
 770		     /* WaStripsFansDisableFastClipPerformanceFix:snb */
 771		     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
 772		     /*
 773		      * Bspec says:
 774		      * "This bit must be set if 3DSTATE_CLIP clip mode is set
 775		      * to normal and 3DSTATE_SF number of SF output attributes
 776		      * is more than 16."
 777		      */
 778		   _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
 779}
 780
 781static void
 782ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 783{
 784	/* WaDisableEarlyCull:ivb */
 785	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
 786
 787	/* WaDisablePSDDualDispatchEnable:ivb */
 788	if (IS_IVB_GT1(i915))
 789		wa_masked_en(wal,
 790			     GEN7_HALF_SLICE_CHICKEN1,
 791			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
 792
 793	/* WaDisable_RenderCache_OperationalFlush:ivb */
 794	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
 795
 796	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
 797	wa_masked_dis(wal,
 798		      GEN7_COMMON_SLICE_CHICKEN1,
 799		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
 800
 801	/* WaApplyL3ControlAndL3ChickenMode:ivb */
 802	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
 803	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
 804
 805	/* WaForceL3Serialization:ivb */
 806	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
 807
 808	/*
 809	 * WaVSThreadDispatchOverride:ivb,vlv
 810	 *
 811	 * This actually overrides the dispatch
 812	 * mode for all thread types.
 813	 */
 814	wa_write_masked_or(wal, GEN7_FF_THREAD_MODE,
 815			   GEN7_FF_SCHED_MASK,
 816			   GEN7_FF_TS_SCHED_HW |
 817			   GEN7_FF_VS_SCHED_HW |
 818			   GEN7_FF_DS_SCHED_HW);
 819
 820	if (0) { /* causes HiZ corruption on ivb:gt1 */
 821		/* enable HiZ Raw Stall Optimization */
 822		wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
 823	}
 824
 825	/* WaDisable4x2SubspanOptimization:ivb */
 826	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
 827
 828	/*
 829	 * BSpec recommends 8x4 when MSAA is used,
 830	 * however in practice 16x4 seems fastest.
 831	 *
 832	 * Note that PS/WM thread counts depend on the WIZ hashing
 833	 * disable bit, which we don't touch here, but it's good
 834	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 835	 */
 836	wa_add(wal, GEN7_GT_MODE, 0,
 837	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
 838	       GEN6_WIZ_HASHING_16x4);
 839}
 840
 841static void
 842vlv_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 843{
 844	/* WaDisableEarlyCull:vlv */
 845	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
 846
 847	/* WaPsdDispatchEnable:vlv */
 848	/* WaDisablePSDDualDispatchEnable:vlv */
 849	wa_masked_en(wal,
 850		     GEN7_HALF_SLICE_CHICKEN1,
 851		     GEN7_MAX_PS_THREAD_DEP |
 852		     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
 853
 854	/* WaDisable_RenderCache_OperationalFlush:vlv */
 855	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
 856
 857	/* WaForceL3Serialization:vlv */
 858	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
 859
 860	/*
 861	 * WaVSThreadDispatchOverride:ivb,vlv
 862	 *
 863	 * This actually overrides the dispatch
 864	 * mode for all thread types.
 865	 */
 866	wa_write_masked_or(wal,
 867			   GEN7_FF_THREAD_MODE,
 868			   GEN7_FF_SCHED_MASK,
 869			   GEN7_FF_TS_SCHED_HW |
 870			   GEN7_FF_VS_SCHED_HW |
 871			   GEN7_FF_DS_SCHED_HW);
 872
 873	/*
 874	 * BSpec says this must be set, even though
 875	 * WaDisable4x2SubspanOptimization isn't listed for VLV.
 876	 */
 877	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
 878
 879	/*
 880	 * BSpec recommends 8x4 when MSAA is used,
 881	 * however in practice 16x4 seems fastest.
 882	 *
 883	 * Note that PS/WM thread counts depend on the WIZ hashing
 884	 * disable bit, which we don't touch here, but it's good
 885	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 886	 */
 887	wa_add(wal, GEN7_GT_MODE, 0,
 888	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
 889	       GEN6_WIZ_HASHING_16x4);
 890
 891	/*
 892	 * WaIncreaseL3CreditsForVLVB0:vlv
 893	 * This is the hardware default actually.
 894	 */
 895	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
 896}
 897
 898static void
 899hsw_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 900{
 901	/* L3 caching of data atomics doesn't work -- disable it. */
 902	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
 903
 904	wa_add(wal,
 905	       HSW_ROW_CHICKEN3, 0,
 906	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
 907		0 /* XXX does this reg exist? */);
 908
 909	/* WaVSRefCountFullforceMissDisable:hsw */
 910	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
 911
 912	wa_masked_dis(wal,
 913		      CACHE_MODE_0_GEN7,
 914		      /* WaDisable_RenderCache_OperationalFlush:hsw */
 915		      RC_OP_FLUSH_ENABLE |
 916		      /* enable HiZ Raw Stall Optimization */
 917		      HIZ_RAW_STALL_OPT_DISABLE);
 918
 919	/* WaDisable4x2SubspanOptimization:hsw */
 920	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
 921
 922	/*
 923	 * BSpec recommends 8x4 when MSAA is used,
 924	 * however in practice 16x4 seems fastest.
 925	 *
 926	 * Note that PS/WM thread counts depend on the WIZ hashing
 927	 * disable bit, which we don't touch here, but it's good
 928	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 929	 */
 930	wa_add(wal, GEN7_GT_MODE, 0,
 931	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
 932	       GEN6_WIZ_HASHING_16x4);
 933
 934	/* WaSampleCChickenBitEnable:hsw */
 935	wa_masked_en(wal, HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
 936}
 937
 938static void
 939gen9_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 940{
 941	/* WaDisableKillLogic:bxt,skl,kbl */
 942	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
 943		wa_write_or(wal,
 944			    GAM_ECOCHK,
 945			    ECOCHK_DIS_TLB);
 946
 947	if (HAS_LLC(i915)) {
 948		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
 949		 *
 950		 * Must match Display Engine. See
 951		 * WaCompressedResourceDisplayNewHashMode.
 952		 */
 953		wa_write_or(wal,
 954			    MMCD_MISC_CTRL,
 955			    MMCD_PCLA | MMCD_HOTSPOT_EN);
 956	}
 957
 958	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
 959	wa_write_or(wal,
 960		    GAM_ECOCHK,
 961		    BDW_DISABLE_HDC_INVALIDATION);
 962}
 963
 964static void
 965skl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 966{
 967	gen9_gt_workarounds_init(i915, wal);
 968
 969	/* WaDisableGafsUnitClkGating:skl */
 970	wa_write_or(wal,
 971		    GEN7_UCGCTL4,
 972		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
 973
 974	/* WaInPlaceDecompressionHang:skl */
 975	if (IS_SKL_REVID(i915, SKL_REVID_H0, REVID_FOREVER))
 976		wa_write_or(wal,
 977			    GEN9_GAMT_ECO_REG_RW_IA,
 978			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
 979}
 980
 981static void
 982bxt_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 983{
 984	gen9_gt_workarounds_init(i915, wal);
 985
 986	/* WaInPlaceDecompressionHang:bxt */
 987	wa_write_or(wal,
 988		    GEN9_GAMT_ECO_REG_RW_IA,
 989		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
 990}
 991
 992static void
 993kbl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 994{
 995	gen9_gt_workarounds_init(i915, wal);
 996
 997	/* WaDisableDynamicCreditSharing:kbl */
 998	if (IS_KBL_REVID(i915, 0, KBL_REVID_B0))
 999		wa_write_or(wal,
1000			    GAMT_CHKN_BIT_REG,
1001			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1002
1003	/* WaDisableGafsUnitClkGating:kbl */
1004	wa_write_or(wal,
1005		    GEN7_UCGCTL4,
1006		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1007
1008	/* WaInPlaceDecompressionHang:kbl */
1009	wa_write_or(wal,
1010		    GEN9_GAMT_ECO_REG_RW_IA,
1011		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1012}
1013
1014static void
1015glk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1016{
1017	gen9_gt_workarounds_init(i915, wal);
1018}
1019
1020static void
1021cfl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1022{
1023	gen9_gt_workarounds_init(i915, wal);
1024
1025	/* WaDisableGafsUnitClkGating:cfl */
1026	wa_write_or(wal,
1027		    GEN7_UCGCTL4,
1028		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1029
1030	/* WaInPlaceDecompressionHang:cfl */
1031	wa_write_or(wal,
1032		    GEN9_GAMT_ECO_REG_RW_IA,
1033		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1034}
1035
1036static void
1037wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1038{
1039	const struct sseu_dev_info *sseu = &i915->gt.info.sseu;
1040	unsigned int slice, subslice;
1041	u32 l3_en, mcr, mcr_mask;
1042
1043	GEM_BUG_ON(INTEL_GEN(i915) < 10);
1044
1045	/*
1046	 * WaProgramMgsrForL3BankSpecificMmioReads: cnl,icl
1047	 * L3Banks could be fused off in single slice scenario. If that is
1048	 * the case, we might need to program MCR select to a valid L3Bank
1049	 * by default, to make sure we correctly read certain registers
1050	 * later on (in the range 0xB100 - 0xB3FF).
1051	 *
1052	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:cnl,icl
1053	 * Before any MMIO read into slice/subslice specific registers, MCR
1054	 * packet control register needs to be programmed to point to any
1055	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
 1056	 * This means each subsequent MMIO read will be forwarded to a
1057	 * specific s/ss combination, but this is OK since these registers
1058	 * are consistent across s/ss in almost all cases. In the rare
1059	 * occasions, such as INSTDONE, where this value is dependent
1060	 * on s/ss combo, the read should be done with read_subslice_reg.
1061	 *
1062	 * Since GEN8_MCR_SELECTOR contains dual-purpose bits which select both
1063	 * to which subslice, or to which L3 bank, the respective mmio reads
1064	 * will go, we have to find a common index which works for both
1065	 * accesses.
1066	 *
1067	 * Case where we cannot find a common index fortunately should not
1068	 * happen in production hardware, so we only emit a warning instead of
1069	 * implementing something more complex that requires checking the range
1070	 * of every MMIO read.
1071	 */
1072
1073	if (INTEL_GEN(i915) >= 10 && is_power_of_2(sseu->slice_mask)) {
1074		u32 l3_fuse =
1075			intel_uncore_read(&i915->uncore, GEN10_MIRROR_FUSE3) &
1076			GEN10_L3BANK_MASK;
1077
1078		drm_dbg(&i915->drm, "L3 fuse = %x\n", l3_fuse);
1079		l3_en = ~(l3_fuse << GEN10_L3BANK_PAIR_COUNT | l3_fuse);
1080	} else {
1081		l3_en = ~0;
1082	}
1083
1084	slice = fls(sseu->slice_mask) - 1;
1085	subslice = fls(l3_en & intel_sseu_get_subslices(sseu, slice));
1086	if (!subslice) {
1087		drm_warn(&i915->drm,
1088			 "No common index found between subslice mask %x and L3 bank mask %x!\n",
1089			 intel_sseu_get_subslices(sseu, slice), l3_en);
1090		subslice = fls(l3_en);
1091		drm_WARN_ON(&i915->drm, !subslice);
1092	}
1093	subslice--;
1094
1095	if (INTEL_GEN(i915) >= 11) {
1096		mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1097		mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1098	} else {
1099		mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1100		mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1101	}
1102
1103	drm_dbg(&i915->drm, "MCR slice/subslice = %x\n", mcr);
1104
1105	wa_write_masked_or(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1106}
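/*
 * Example of the index selection above (sseu/fuse values are
 * hypothetical): with slice_mask == 0x1 and l3_en & subslices ==
 * 0b0110, slice = fls(0x1) - 1 = 0 and subslice = fls(0b0110) - 1 = 2,
 * so on gen11+ the list ends up containing
 *
 *	wa_write_masked_or(wal, GEN8_MCR_SELECTOR,
 *			   GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK,
 *			   GEN11_MCR_SLICE(0) | GEN11_MCR_SUBSLICE(2));
 *
 * steering subsequent multicast register reads at an enabled
 * slice/subslice (and L3 bank) combination.
 */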
1107
1108static void
1109cnl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1110{
1111	wa_init_mcr(i915, wal);
1112
1113	/* WaInPlaceDecompressionHang:cnl */
1114	wa_write_or(wal,
1115		    GEN9_GAMT_ECO_REG_RW_IA,
1116		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1117}
1118
1119static void
1120icl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1121{
1122	wa_init_mcr(i915, wal);
1123
1124	/* WaInPlaceDecompressionHang:icl */
1125	wa_write_or(wal,
1126		    GEN9_GAMT_ECO_REG_RW_IA,
1127		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1128
1129	/* WaModifyGamTlbPartitioning:icl */
1130	wa_write_masked_or(wal,
1131			   GEN11_GACB_PERF_CTRL,
1132			   GEN11_HASH_CTRL_MASK,
1133			   GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1134
1135	/* Wa_1405766107:icl
1136	 * Formerly known as WaCL2SFHalfMaxAlloc
1137	 */
1138	wa_write_or(wal,
1139		    GEN11_LSN_UNSLCVC,
1140		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1141		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1142
1143	/* Wa_220166154:icl
1144	 * Formerly known as WaDisCtxReload
1145	 */
1146	wa_write_or(wal,
1147		    GEN8_GAMW_ECO_DEV_RW_IA,
1148		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1149
1150	/* Wa_1405779004:icl (pre-prod) */
1151	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
1152		wa_write_or(wal,
1153			    SLICE_UNIT_LEVEL_CLKGATE,
1154			    MSCUNIT_CLKGATE_DIS);
1155
1156	/* Wa_1406838659:icl (pre-prod) */
1157	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
1158		wa_write_or(wal,
1159			    INF_UNIT_LEVEL_CLKGATE,
1160			    CGPSF_CLKGATE_DIS);
1161
1162	/* Wa_1406463099:icl
1163	 * Formerly known as WaGamTlbPendError
1164	 */
1165	wa_write_or(wal,
1166		    GAMT_CHKN_BIT_REG,
1167		    GAMT_CHKN_DISABLE_L3_COH_PIPE);
1168
1169	/* Wa_1607087056:icl,ehl,jsl */
1170	if (IS_ICELAKE(i915) ||
1171	    IS_EHL_REVID(i915, EHL_REVID_A0, EHL_REVID_A0)) {
1172		wa_write_or(wal,
1173			    SLICE_UNIT_LEVEL_CLKGATE,
1174			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1175	}
1176}
1177
1178static void
1179tgl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
1180{
1181	wa_init_mcr(i915, wal);
1182
1183	/* Wa_1409420604:tgl */
1184	if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0))
1185		wa_write_or(wal,
1186			    SUBSLICE_UNIT_LEVEL_CLKGATE2,
1187			    CPSSUNIT_CLKGATE_DIS);
1188
 1189	/* Wa_1607087056:tgl also known as BUG:1409180338 */
1190	if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0))
1191		wa_write_or(wal,
1192			    SLICE_UNIT_LEVEL_CLKGATE,
1193			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1194}
1195
1196static void
1197gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal)
1198{
1199	if (IS_GEN(i915, 12))
1200		tgl_gt_workarounds_init(i915, wal);
1201	else if (IS_GEN(i915, 11))
1202		icl_gt_workarounds_init(i915, wal);
1203	else if (IS_CANNONLAKE(i915))
1204		cnl_gt_workarounds_init(i915, wal);
1205	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1206		cfl_gt_workarounds_init(i915, wal);
1207	else if (IS_GEMINILAKE(i915))
1208		glk_gt_workarounds_init(i915, wal);
1209	else if (IS_KABYLAKE(i915))
1210		kbl_gt_workarounds_init(i915, wal);
1211	else if (IS_BROXTON(i915))
1212		bxt_gt_workarounds_init(i915, wal);
1213	else if (IS_SKYLAKE(i915))
1214		skl_gt_workarounds_init(i915, wal);
1215	else if (IS_HASWELL(i915))
1216		hsw_gt_workarounds_init(i915, wal);
1217	else if (IS_VALLEYVIEW(i915))
1218		vlv_gt_workarounds_init(i915, wal);
1219	else if (IS_IVYBRIDGE(i915))
1220		ivb_gt_workarounds_init(i915, wal);
1221	else if (IS_GEN(i915, 6))
1222		snb_gt_workarounds_init(i915, wal);
1223	else if (IS_GEN(i915, 5))
1224		ilk_gt_workarounds_init(i915, wal);
1225	else if (IS_G4X(i915))
1226		g4x_gt_workarounds_init(i915, wal);
1227	else if (IS_GEN(i915, 4))
1228		gen4_gt_workarounds_init(i915, wal);
1229	else if (INTEL_GEN(i915) <= 8)
1230		return;
1231	else
1232		MISSING_CASE(INTEL_GEN(i915));
1233}
1234
1235void intel_gt_init_workarounds(struct drm_i915_private *i915)
1236{
1237	struct i915_wa_list *wal = &i915->gt_wa_list;
1238
1239	wa_init_start(wal, "GT", "global");
1240	gt_init_workarounds(i915, wal);
1241	wa_init_finish(wal);
1242}
1243
1244static enum forcewake_domains
1245wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
1246{
1247	enum forcewake_domains fw = 0;
1248	struct i915_wa *wa;
1249	unsigned int i;
1250
1251	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1252		fw |= intel_uncore_forcewake_for_reg(uncore,
1253						     wa->reg,
1254						     FW_REG_READ |
1255						     FW_REG_WRITE);
1256
1257	return fw;
1258}
1259
1260static bool
1261wa_verify(const struct i915_wa *wa, u32 cur, const char *name, const char *from)
1262{
1263	if ((cur ^ wa->set) & wa->read) {
1264		DRM_ERROR("%s workaround lost on %s! (%x=%x/%x, expected %x)\n",
1265			  name, from, i915_mmio_reg_offset(wa->reg),
1266			  cur, cur & wa->read, wa->set);
1267
1268		return false;
1269	}
1270
1271	return true;
1272}
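/*
 * For instance (values hypothetical): with wa->set == 0x00010001,
 * wa->read == 0x1 and a readback of cur == 0x0, the expression
 * (cur ^ wa->set) & wa->read evaluates to 0x1 and the workaround is
 * reported as lost. Bits outside wa->read, such as the write-mask half
 * of a masked register, can never produce a false positive.
 */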
1273
1274static void
1275wa_list_apply(struct intel_uncore *uncore, const struct i915_wa_list *wal)
1276{
1277	enum forcewake_domains fw;
1278	unsigned long flags;
1279	struct i915_wa *wa;
1280	unsigned int i;
1281
1282	if (!wal->count)
1283		return;
1284
1285	fw = wal_get_fw_for_rmw(uncore, wal);
1286
1287	spin_lock_irqsave(&uncore->lock, flags);
1288	intel_uncore_forcewake_get__locked(uncore, fw);
1289
1290	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1291		if (wa->clr)
1292			intel_uncore_rmw_fw(uncore, wa->reg, wa->clr, wa->set);
1293		else
1294			intel_uncore_write_fw(uncore, wa->reg, wa->set);
1295		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1296			wa_verify(wa,
1297				  intel_uncore_read_fw(uncore, wa->reg),
1298				  wal->name, "application");
1299	}
1300
1301	intel_uncore_forcewake_put__locked(uncore, fw);
1302	spin_unlock_irqrestore(&uncore->lock, flags);
1303}
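/*
 * Two write paths are used above: entries with a non-zero clr mask are
 * applied as a read-modify-write (clear clr, then OR in set), while
 * clr == 0 entries, e.g. masked-register writes whose mask already
 * travels in the upper 16 bits of set, are written out directly. A
 * single forcewake grab and the uncore spinlock cover the whole list.
 */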
1304
1305void intel_gt_apply_workarounds(struct intel_gt *gt)
1306{
1307	wa_list_apply(gt->uncore, &gt->i915->gt_wa_list);
1308}
1309
1310static bool wa_list_verify(struct intel_uncore *uncore,
1311			   const struct i915_wa_list *wal,
1312			   const char *from)
1313{
1314	struct i915_wa *wa;
1315	unsigned int i;
1316	bool ok = true;
1317
1318	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1319		ok &= wa_verify(wa,
1320				intel_uncore_read(uncore, wa->reg),
1321				wal->name, from);
1322
1323	return ok;
1324}
1325
1326bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1327{
1328	return wa_list_verify(gt->uncore, &gt->i915->gt_wa_list, from);
1329}
1330
1331static inline bool is_nonpriv_flags_valid(u32 flags)
1332{
1333	/* Check only valid flag bits are set */
1334	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1335		return false;
1336
1337	/* NB: Only 3 out of 4 enum values are valid for access field */
1338	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1339	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1340		return false;
1341
1342	return true;
1343}
1344
1345static void
1346whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1347{
1348	struct i915_wa wa = {
1349		.reg = reg
1350	};
1351
1352	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1353		return;
1354
1355	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1356		return;
1357
1358	wa.reg.reg |= flags;
1359	_wa_add(wal, &wa);
1360}
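/*
 * A whitelist entry is just the register offset with the access flags
 * folded into bits the offset itself never uses (the wa.reg.reg |=
 * flags above), so one dword both names the register and describes the
 * permitted access. For example (taken from the builders below):
 *
 *	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
 *			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
 *			  RING_FORCE_TO_NONPRIV_RANGE_4);
 *
 * fills one of the RING_MAX_NONPRIV_SLOTS entries that
 * intel_engine_apply_whitelist() later writes into the engine's
 * RING_FORCE_TO_NONPRIV registers.
 */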
1361
1362static void
1363whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1364{
1365	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1366}
1367
1368static void gen9_whitelist_build(struct i915_wa_list *w)
1369{
1370	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1371	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1372
1373	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1374	whitelist_reg(w, GEN8_CS_CHICKEN1);
1375
1376	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1377	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1378
1379	/* WaSendPushConstantsFromMMIO:skl,bxt */
1380	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1381}
1382
1383static void skl_whitelist_build(struct intel_engine_cs *engine)
1384{
1385	struct i915_wa_list *w = &engine->whitelist;
1386
1387	if (engine->class != RENDER_CLASS)
1388		return;
1389
1390	gen9_whitelist_build(w);
1391
1392	/* WaDisableLSQCROPERFforOCL:skl */
1393	whitelist_reg(w, GEN8_L3SQCREG4);
1394}
1395
1396static void bxt_whitelist_build(struct intel_engine_cs *engine)
1397{
1398	if (engine->class != RENDER_CLASS)
1399		return;
1400
1401	gen9_whitelist_build(&engine->whitelist);
1402}
1403
1404static void kbl_whitelist_build(struct intel_engine_cs *engine)
1405{
1406	struct i915_wa_list *w = &engine->whitelist;
1407
1408	if (engine->class != RENDER_CLASS)
1409		return;
1410
1411	gen9_whitelist_build(w);
1412
1413	/* WaDisableLSQCROPERFforOCL:kbl */
1414	whitelist_reg(w, GEN8_L3SQCREG4);
1415}
1416
1417static void glk_whitelist_build(struct intel_engine_cs *engine)
1418{
1419	struct i915_wa_list *w = &engine->whitelist;
1420
1421	if (engine->class != RENDER_CLASS)
1422		return;
1423
1424	gen9_whitelist_build(w);
1425
1426	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
1427	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1428}
1429
1430static void cfl_whitelist_build(struct intel_engine_cs *engine)
1431{
1432	struct i915_wa_list *w = &engine->whitelist;
1433
1434	if (engine->class != RENDER_CLASS)
1435		return;
1436
1437	gen9_whitelist_build(w);
1438
1439	/*
1440	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
1441	 *
 1442	 * This covers 4 registers which are next to one another:
1443	 *   - PS_INVOCATION_COUNT
1444	 *   - PS_INVOCATION_COUNT_UDW
1445	 *   - PS_DEPTH_COUNT
1446	 *   - PS_DEPTH_COUNT_UDW
1447	 */
1448	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1449			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1450			  RING_FORCE_TO_NONPRIV_RANGE_4);
1451}
1452
1453static void cml_whitelist_build(struct intel_engine_cs *engine)
1454{
1455	struct i915_wa_list *w = &engine->whitelist;
1456
1457	if (engine->class != RENDER_CLASS)
1458		whitelist_reg_ext(w,
1459				  RING_CTX_TIMESTAMP(engine->mmio_base),
1460				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1461
1462	cfl_whitelist_build(engine);
1463}
1464
1465static void cnl_whitelist_build(struct intel_engine_cs *engine)
1466{
1467	struct i915_wa_list *w = &engine->whitelist;
1468
1469	if (engine->class != RENDER_CLASS)
1470		return;
1471
1472	/* WaEnablePreemptionGranularityControlByUMD:cnl */
1473	whitelist_reg(w, GEN8_CS_CHICKEN1);
1474}
1475
1476static void icl_whitelist_build(struct intel_engine_cs *engine)
1477{
1478	struct i915_wa_list *w = &engine->whitelist;
1479
1480	switch (engine->class) {
1481	case RENDER_CLASS:
1482		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
1483		whitelist_reg(w, GEN9_HALF_SLICE_CHICKEN7);
1484
1485		/* WaAllowUMDToModifySamplerMode:icl */
1486		whitelist_reg(w, GEN10_SAMPLER_MODE);
1487
1488		/* WaEnableStateCacheRedirectToCS:icl */
1489		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
1490
1491		/*
1492		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
1493		 *
 1494		 * This covers 4 registers which are next to one another:
1495		 *   - PS_INVOCATION_COUNT
1496		 *   - PS_INVOCATION_COUNT_UDW
1497		 *   - PS_DEPTH_COUNT
1498		 *   - PS_DEPTH_COUNT_UDW
1499		 */
1500		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1501				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1502				  RING_FORCE_TO_NONPRIV_RANGE_4);
1503		break;
1504
1505	case VIDEO_DECODE_CLASS:
1506		/* hucStatusRegOffset */
1507		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
1508				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1509		/* hucUKernelHdrInfoRegOffset */
1510		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
1511				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1512		/* hucStatus2RegOffset */
1513		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
1514				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1515		whitelist_reg_ext(w,
1516				  RING_CTX_TIMESTAMP(engine->mmio_base),
1517				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1518		break;
1519
1520	default:
1521		whitelist_reg_ext(w,
1522				  RING_CTX_TIMESTAMP(engine->mmio_base),
1523				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1524		break;
1525	}
1526}
1527
1528static void tgl_whitelist_build(struct intel_engine_cs *engine)
1529{
1530	struct i915_wa_list *w = &engine->whitelist;
1531
1532	switch (engine->class) {
1533	case RENDER_CLASS:
1534		/*
1535		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
1536		 * Wa_1408556865:tgl
1537		 *
 1538		 * This covers 4 registers which are next to one another:
1539		 *   - PS_INVOCATION_COUNT
1540		 *   - PS_INVOCATION_COUNT_UDW
1541		 *   - PS_DEPTH_COUNT
1542		 *   - PS_DEPTH_COUNT_UDW
1543		 */
1544		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
1545				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
1546				  RING_FORCE_TO_NONPRIV_RANGE_4);
1547
1548		/* Wa_1808121037:tgl */
1549		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
1550
1551		/* Wa_1806527549:tgl */
1552		whitelist_reg(w, HIZ_CHICKEN);
1553		break;
1554	default:
1555		whitelist_reg_ext(w,
1556				  RING_CTX_TIMESTAMP(engine->mmio_base),
1557				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
1558		break;
1559	}
1560}
1561
1562void intel_engine_init_whitelist(struct intel_engine_cs *engine)
1563{
1564	struct drm_i915_private *i915 = engine->i915;
1565	struct i915_wa_list *w = &engine->whitelist;
1566
1567	wa_init_start(w, "whitelist", engine->name);
1568
1569	if (IS_GEN(i915, 12))
1570		tgl_whitelist_build(engine);
1571	else if (IS_GEN(i915, 11))
1572		icl_whitelist_build(engine);
1573	else if (IS_CANNONLAKE(i915))
1574		cnl_whitelist_build(engine);
1575	else if (IS_COMETLAKE(i915))
1576		cml_whitelist_build(engine);
1577	else if (IS_COFFEELAKE(i915))
1578		cfl_whitelist_build(engine);
1579	else if (IS_GEMINILAKE(i915))
1580		glk_whitelist_build(engine);
1581	else if (IS_KABYLAKE(i915))
1582		kbl_whitelist_build(engine);
1583	else if (IS_BROXTON(i915))
1584		bxt_whitelist_build(engine);
1585	else if (IS_SKYLAKE(i915))
1586		skl_whitelist_build(engine);
1587	else if (INTEL_GEN(i915) <= 8)
1588		return;
1589	else
1590		MISSING_CASE(INTEL_GEN(i915));
1591
1592	wa_init_finish(w);
1593}
1594
1595void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
1596{
1597	const struct i915_wa_list *wal = &engine->whitelist;
1598	struct intel_uncore *uncore = engine->uncore;
1599	const u32 base = engine->mmio_base;
1600	struct i915_wa *wa;
1601	unsigned int i;
1602
1603	if (!wal->count)
1604		return;
1605
1606	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1607		intel_uncore_write(uncore,
1608				   RING_FORCE_TO_NONPRIV(base, i),
1609				   i915_mmio_reg_offset(wa->reg));
1610
1611	/* And clear the rest just in case of garbage */
1612	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
1613		intel_uncore_write(uncore,
1614				   RING_FORCE_TO_NONPRIV(base, i),
1615				   i915_mmio_reg_offset(RING_NOPID(base)));
1616}
1617
1618static void
1619rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
1620{
1621	struct drm_i915_private *i915 = engine->i915;
1622
1623	if (IS_TGL_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) {
1624		/*
1625		 * Wa_1607138336:tgl
1626		 * Wa_1607063988:tgl
1627		 */
1628		wa_write_or(wal,
1629			    GEN9_CTX_PREEMPT_REG,
1630			    GEN12_DISABLE_POSH_BUSY_FF_DOP_CG);
1631
1632		/*
1633		 * Wa_1607030317:tgl
1634		 * Wa_1607186500:tgl
 1635		 * Wa_1607297627:tgl there are 3 entries for this WA in the BSpec,
 1636		 * 2 of them say it is fixed on B0, the other one says it is
 1637		 * permanent
1638		 */
1639		wa_masked_en(wal,
1640			     GEN6_RC_SLEEP_PSMI_CONTROL,
1641			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
1642			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
1643
1644		/*
1645		 * Wa_1606679103:tgl
1646		 * (see also Wa_1606682166:icl)
1647		 */
1648		wa_write_or(wal,
1649			    GEN7_SARCHKMD,
1650			    GEN7_DISABLE_SAMPLER_PREFETCH);
1651
1652		/* Wa_1408615072:tgl */
1653		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1654			    VSUNIT_CLKGATE_DIS_TGL);
1655	}
1656
1657	if (IS_TIGERLAKE(i915)) {
1658		/* Wa_1606931601:tgl */
1659		wa_masked_en(wal, GEN7_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
1660
1661		/* Wa_1409804808:tgl */
1662		wa_masked_en(wal, GEN7_ROW_CHICKEN2,
1663			     GEN12_PUSH_CONST_DEREF_HOLD_DIS);
1664
1665		/* Wa_1606700617:tgl */
1666		wa_masked_en(wal,
1667			     GEN9_CS_DEBUG_MODE1,
1668			     FF_DOP_CLOCK_GATE_DISABLE);
1669
1670		/*
1671		 * Wa_1409085225:tgl
1672		 * Wa_14010229206:tgl
1673		 */
1674		wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
1675
1676		/*
1677		 * Wa_1407928979:tgl A*
1678		 * Wa_18011464164:tgl B0+
1679		 * Wa_22010931296:tgl B0+
1680		 */
1681		wa_write_or(wal, GEN7_FF_THREAD_MODE,
1682			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
1683	}
1684
1685	if (IS_GEN(i915, 11)) {
 1686		/* This is not a Wa. Enable for better image quality */
1687		wa_masked_en(wal,
1688			     _3D_CHICKEN3,
1689			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
1690
1691		/* WaPipelineFlushCoherentLines:icl */
1692		wa_write_or(wal,
1693			    GEN8_L3SQCREG4,
1694			    GEN8_LQSC_FLUSH_COHERENT_LINES);
1695
1696		/*
1697		 * Wa_1405543622:icl
1698		 * Formerly known as WaGAPZPriorityScheme
1699		 */
1700		wa_write_or(wal,
1701			    GEN8_GARBCNTL,
1702			    GEN11_ARBITRATION_PRIO_ORDER_MASK);
1703
1704		/*
1705		 * Wa_1604223664:icl
1706		 * Formerly known as WaL3BankAddressHashing
1707		 */
1708		wa_write_masked_or(wal,
1709				   GEN8_GARBCNTL,
1710				   GEN11_HASH_CTRL_EXCL_MASK,
1711				   GEN11_HASH_CTRL_EXCL_BIT0);
1712		wa_write_masked_or(wal,
1713				   GEN11_GLBLINVL,
1714				   GEN11_BANK_HASH_ADDR_EXCL_MASK,
1715				   GEN11_BANK_HASH_ADDR_EXCL_BIT0);
1716
1717		/*
1718		 * Wa_1405733216:icl
1719		 * Formerly known as WaDisableCleanEvicts
1720		 */
1721		wa_write_or(wal,
1722			    GEN8_L3SQCREG4,
1723			    GEN11_LQSC_CLEAN_EVICT_DISABLE);
1724
1725		/* WaForwardProgressSoftReset:icl */
1726		wa_write_or(wal,
1727			    GEN10_SCRATCH_LNCF2,
1728			    PMFLUSHDONE_LNICRSDROP |
1729			    PMFLUSH_GAPL3UNBLOCK |
1730			    PMFLUSHDONE_LNEBLK);
1731
1732		/* Wa_1406609255:icl (pre-prod) */
1733		if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
1734			wa_write_or(wal,
1735				    GEN7_SARCHKMD,
1736				    GEN7_DISABLE_DEMAND_PREFETCH);
1737
1738		/* Wa_1606682166:icl */
1739		wa_write_or(wal,
1740			    GEN7_SARCHKMD,
1741			    GEN7_DISABLE_SAMPLER_PREFETCH);
1742
1743		/* Wa_1409178092:icl */
1744		wa_write_masked_or(wal,
1745				   GEN11_SCRATCH2,
1746				   GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
1747				   0);
1748
1749		/* WaEnable32PlaneMode:icl */
1750		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
1751			     GEN11_ENABLE_32_PLANE_MODE);
1752
1753		/*
1754		 * Wa_1408615072:icl,ehl  (vsunit)
1755		 * Wa_1407596294:icl,ehl  (hsunit)
1756		 */
1757		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1758			    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1759
1760		/* Wa_1407352427:icl,ehl */
1761		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1762			    PSDUNIT_CLKGATE_DIS);
1763
1764		/* Wa_1406680159:icl,ehl */
1765		wa_write_or(wal,
1766			    SUBSLICE_UNIT_LEVEL_CLKGATE,
1767			    GWUNIT_CLKGATE_DIS);
1768
1769		/*
1770		 * Wa_1408767742:icl[a2..forever],ehl[all]
1771		 * Wa_1605460711:icl[a0..c0]
1772		 */
1773		wa_write_or(wal,
1774			    GEN7_FF_THREAD_MODE,
1775			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
1776
1777		/* Wa_22010271021:ehl */
1778		if (IS_ELKHARTLAKE(i915))
1779			wa_masked_en(wal,
1780				     GEN9_CS_DEBUG_MODE1,
1781				     FF_DOP_CLOCK_GATE_DISABLE);
1782	}
1783
1784	if (IS_GEN_RANGE(i915, 9, 12)) {
1785		/* FtrPerCtxtPreemptionGranularityControl:skl,bxt,kbl,cfl,cnl,icl,tgl */
1786		wa_masked_en(wal,
1787			     GEN7_FF_SLICE_CS_CHICKEN1,
1788			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);
1789	}
1790
1791	if (IS_SKYLAKE(i915) ||
1792	    IS_KABYLAKE(i915) ||
1793	    IS_COFFEELAKE(i915) ||
1794	    IS_COMETLAKE(i915)) {
1795		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
1796		wa_write_or(wal,
1797			    GEN8_GARBCNTL,
1798			    GEN9_GAPS_TSV_CREDIT_DISABLE);
1799	}
1800
1801	if (IS_BROXTON(i915)) {
1802		/* WaDisablePooledEuLoadBalancingFix:bxt */
1803		wa_masked_en(wal,
1804			     FF_SLICE_CS_CHICKEN2,
1805			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
1806	}
1807
1808	if (IS_GEN(i915, 9)) {
1809		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
1810		wa_masked_en(wal,
1811			     GEN9_CSFE_CHICKEN1_RCS,
1812			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
1813
1814		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
1815		wa_write_or(wal,
1816			    BDW_SCRATCH1,
1817			    GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
1818
1819		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
1820		if (IS_GEN9_LP(i915))
1821			wa_write_masked_or(wal,
1822					   GEN8_L3SQCREG1,
1823					   L3_PRIO_CREDITS_MASK,
1824					   L3_GENERAL_PRIO_CREDITS(62) |
1825					   L3_HIGH_PRIO_CREDITS(2));
1826
1827		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
1828		wa_write_or(wal,
1829			    GEN8_L3SQCREG4,
1830			    GEN8_LQSC_FLUSH_COHERENT_LINES);
1831	}
1832
1833	if (IS_GEN(i915, 7))
1834		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
1835		wa_masked_en(wal,
1836			     GFX_MODE_GEN7,
1837			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
1838
1839	if (IS_GEN_RANGE(i915, 6, 7))
1840		/*
1841		 * We need to disable the AsyncFlip performance optimisations in
1842		 * order to use MI_WAIT_FOR_EVENT within the CS. This bit should
1843		 * already be programmed to '1' on all products.
1844		 *
1845		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
1846		 */
1847		wa_masked_en(wal,
1848			     MI_MODE,
1849			     ASYNC_FLIP_PERF_DISABLE);
1850
1851	if (IS_GEN(i915, 6)) {
1852		/*
1853		 * Required for the hardware to program scanline values for
1854		 * waiting.
1855		 * WaEnableFlushTlbInvalidationMode:snb
1856		 */
1857		wa_masked_en(wal,
1858			     GFX_MODE,
1859			     GFX_TLB_INVALIDATE_EXPLICIT);
1860
1861		/*
1862		 * From the Sandybridge PRM, volume 1 part 3, page 24:
1863		 * "If this bit is set, STCunit will have LRA as replacement
1864		 *  policy. [...] This bit must be reset. LRA replacement
1865		 *  policy is not supported."
1866		 */
1867		wa_masked_dis(wal,
1868			      CACHE_MODE_0,
1869			      CM0_STC_EVICT_DISABLE_LRA_SNB);
1870	}
1871
1872	if (IS_GEN_RANGE(i915, 4, 6))
1873		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
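		/*
		 * The final wa_add() argument is the mask of bits checked when
		 * the list is verified on readback; it is cleared on Broadwater
		 * so that verification does not complain about the bit that
		 * fails to stick there.
		 */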
1874		wa_add(wal, MI_MODE,
1875		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
1876		       /* XXX bit doesn't stick on Broadwater */
1877		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH);
1878
1879	if (IS_GEN(i915, 4))
1880		/*
1881		 * Disable CONSTANT_BUFFER before it is loaded from the context
1882		 * image. As soon as it is loaded, it is executed and the stored
1883		 * address may no longer be valid, leading to a GPU hang.
1884		 *
1885		 * This imposes the requirement that userspace reload their
1886		 * CONSTANT_BUFFER on every batch, fortunately a requirement
1887		 * they are already accustomed to from before contexts were
1888		 * enabled.
1889		 */
1890		wa_add(wal, ECOSKPD,
1891		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
1892		       0 /* XXX bit doesn't stick on Broadwater */);
1893}
1894
1895static void
1896xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
1897{
1898	struct drm_i915_private *i915 = engine->i915;
1899
1900	/* WaKBLVECSSemaphoreWaitPoll:kbl */
1901	if (IS_KBL_REVID(i915, KBL_REVID_A0, KBL_REVID_E0)) {
1902		wa_write(wal,
1903			 RING_SEMA_WAIT_POLL(engine->mmio_base),
1904			 1);
1905	}
1906}
1907
1908static void
1909engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
1910{
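	/*
	 * Only selftests can reach this point with gen < 4: mock engines used
	 * by the selftests report an artificially low gen, while the regular
	 * caller below already filters such platforms out (and
	 * I915_SELFTEST_ONLY() compiles to 0 without CONFIG_DRM_I915_SELFTEST).
	 */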
1911	if (I915_SELFTEST_ONLY(INTEL_GEN(engine->i915) < 4))
1912		return;
1913
1914	if (engine->class == RENDER_CLASS)
1915		rcs_engine_wa_init(engine, wal);
1916	else
1917		xcs_engine_wa_init(engine, wal);
1918}
1919
1920void intel_engine_init_workarounds(struct intel_engine_cs *engine)
1921{
1922	struct i915_wa_list *wal = &engine->wa_list;
1923
1924	if (INTEL_GEN(engine->i915) < 4)
1925		return;
1926
1927	wa_init_start(wal, "engine", engine->name);
1928	engine_init_workarounds(engine, wal);
1929	wa_init_finish(wal);
1930}
1931
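/*
 * Write the engine workaround list to the hardware: each entry is applied as
 * an MMIO write through the engine's uncore, using the mask and value stored
 * when the list was built.
 */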
1932void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
1933{
1934	wa_list_apply(engine->uncore, &engine->wa_list);
1935}
1936
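/*
 * Allocate, map into @vm and pin a scratch buffer big enough to hold one u32
 * result slot per workaround register; it serves as the destination for the
 * MI_STORE_REGISTER_MEM readbacks emitted below.
 */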
1937static struct i915_vma *
1938create_scratch(struct i915_address_space *vm, int count)
1939{
1940	struct drm_i915_gem_object *obj;
1941	struct i915_vma *vma;
1942	unsigned int size;
1943	int err;
1944
1945	size = round_up(count * sizeof(u32), PAGE_SIZE);
1946	obj = i915_gem_object_create_internal(vm->i915, size);
1947	if (IS_ERR(obj))
1948		return ERR_CAST(obj);
1949
1950	i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC);
1951
1952	vma = i915_vma_instance(obj, vm, NULL);
1953	if (IS_ERR(vma)) {
1954		err = PTR_ERR(vma);
1955		goto err_obj;
1956	}
1957
1958	err = i915_vma_pin(vma, 0, 0,
1959			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
1960	if (err)
1961		goto err_obj;
1962
1963	return vma;
1964
1965err_obj:
1966	i915_gem_object_put(obj);
1967	return ERR_PTR(err);
1968}
1969
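/*
 * MMIO ranges that sit behind the gen8+ multicast/steering (MCR) selector.
 * The table is terminated by a zeroed sentinel entry, which the loop in
 * mcr_range() below relies on.
 */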
1970static const struct {
1971	u32 start;
1972	u32 end;
1973} mcr_ranges_gen8[] = {
1974	{ .start = 0x5500, .end = 0x55ff },
1975	{ .start = 0x7000, .end = 0x7fff },
1976	{ .start = 0x9400, .end = 0x97ff },
1977	{ .start = 0xb000, .end = 0xb3ff },
1978	{ .start = 0xe000, .end = 0xe7ff },
1979	{},
1980};
1981
1982static bool mcr_range(struct drm_i915_private *i915, u32 offset)
1983{
1984	int i;
1985
1986	if (INTEL_GEN(i915) < 8)
1987		return false;
1988
1989	/*
1990	 * Registers in these ranges are affected by the MCR selector
1991	 * which only controls CPU-initiated MMIO. Routing does not
1992	 * work for CS access so we cannot verify them on this path.
1993	 */
1994	for (i = 0; mcr_ranges_gen8[i].start; i++)
1995		if (offset >= mcr_ranges_gen8[i].start &&
1996		    offset <= mcr_ranges_gen8[i].end)
1997			return true;
1998
1999	return false;
2000}
2001
2002static int
2003wa_list_srm(struct i915_request *rq,
2004	    const struct i915_wa_list *wal,
2005	    struct i915_vma *vma)
2006{
2007	struct drm_i915_private *i915 = rq->engine->i915;
2008	unsigned int i, count = 0;
2009	const struct i915_wa *wa;
2010	u32 srm, *cs;
2011
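	/*
	 * Store each register with MI_STORE_REGISTER_MEM using a global GTT
	 * address; on gen8+ the address is 64 bits wide, so the command is one
	 * dword longer and the opcode's length field is bumped accordingly.
	 */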
2012	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
2013	if (INTEL_GEN(i915) >= 8)
2014		srm++;
2015
2016	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2017		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
2018			count++;
2019	}
2020
2021	cs = intel_ring_begin(rq, 4 * count);
2022	if (IS_ERR(cs))
2023		return PTR_ERR(cs);
2024
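	/*
	 * Emit one 4-dword SRM per verifiable register. The destination slot
	 * is indexed by the register's position in the full list, so entries
	 * skipped here (MCR ranges) simply leave their slots untouched and
	 * are skipped again by the verifier.
	 */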
2025	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2026		u32 offset = i915_mmio_reg_offset(wa->reg);
2027
2028		if (mcr_range(i915, offset))
2029			continue;
2030
2031		*cs++ = srm;
2032		*cs++ = offset;
2033		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
2034		*cs++ = 0;
2035	}
2036	intel_ring_advance(rq, cs);
2037
2038	return 0;
2039}
2040
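/*
 * Read the workaround registers back from the CS: allocate a scratch buffer,
 * submit a request on @ce that stores every verifiable register into it, wait
 * briefly for completion and compare the results against the expected values
 * with wa_verify(). Registers behind the MCR selector are skipped, as noted
 * above.
 */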
2041static int engine_wa_list_verify(struct intel_context *ce,
2042				 const struct i915_wa_list * const wal,
2043				 const char *from)
2044{
2045	const struct i915_wa *wa;
2046	struct i915_request *rq;
2047	struct i915_vma *vma;
2048	unsigned int i;
2049	u32 *results;
2050	int err;
2051
2052	if (!wal->count)
2053		return 0;
2054
2055	vma = create_scratch(&ce->engine->gt->ggtt->vm, wal->count);
2056	if (IS_ERR(vma))
2057		return PTR_ERR(vma);
2058
2059	intel_engine_pm_get(ce->engine);
2060	rq = intel_context_create_request(ce);
2061	intel_engine_pm_put(ce->engine);
2062	if (IS_ERR(rq)) {
2063		err = PTR_ERR(rq);
2064		goto err_vma;
2065	}
2066
2067	i915_vma_lock(vma);
2068	err = i915_request_await_object(rq, vma->obj, true);
2069	if (err == 0)
2070		err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
2071	i915_vma_unlock(vma);
2072	if (err) {
2073		i915_request_add(rq);
2074		goto err_vma;
2075	}
2076
2077	err = wa_list_srm(rq, wal, vma);
2078	if (err)
2079		goto err_vma;
2080
2081	i915_request_get(rq);
2082	i915_request_add(rq);
2083	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
2084		err = -ETIME;
2085		goto err_rq;
2086	}
2087
2088	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
2089	if (IS_ERR(results)) {
2090		err = PTR_ERR(results);
2091		goto err_rq;
2092	}
2093
2094	err = 0;
2095	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
2096		if (mcr_range(rq->engine->i915, i915_mmio_reg_offset(wa->reg)))
2097			continue;
2098
2099		if (!wa_verify(wa, results[i], wal->name, from))
2100			err = -ENXIO;
2101	}
2102
2103	i915_gem_object_unpin_map(vma->obj);
2104
2105err_rq:
2106	i915_request_put(rq);
2107err_vma:
2108	i915_vma_unpin(vma);
2109	i915_vma_put(vma);
2110	return err;
2111}
2112
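/*
 * Minimal usage sketch (hypothetical caller, not part of this file): a
 * selftest or resume path could walk the engines and check that the lists
 * still hold, e.g.:
 *
 *	for_each_engine(engine, gt, id)
 *		err = intel_engine_verify_workarounds(engine, "resume");
 */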
2113int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
2114				    const char *from)
2115{
2116	return engine_wa_list_verify(engine->kernel_context,
2117				     &engine->wa_list,
2118				     from);
2119}
2120
2121#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2122#include "selftest_workarounds.c"
2123#endif