// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

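/*
 * Full PIPE_CONTROL flush/invalidate for the gen6 render ring: the
 * post-sync-nonzero workaround above is emitted first, then a single
 * PIPE_CONTROL carrying the flush/invalidate bits selected by @mode.
 */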
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact. And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

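/*
 * Write the request's seqno and raise MI_USER_INTERRUPT for the gen6
 * render ring. The two leading PIPE_CONTROLs re-apply the
 * post-sync-nonzero workaround before the flushing PIPE_CONTROL that
 * carries the breadcrumb write.
 */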
u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_timeline(rq)->hwsp_offset |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

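/*
 * Emit an MI_FLUSH_DW with a post-sync dword write to the scratch slot
 * of the hardware status page, acting as a flush plus command barrier
 * for the non-render rings.
 */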
static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

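/*
 * Ring-specific MI_FLUSH_DW wrappers: the invalidation bits are applied
 * only when the request asks for EMIT_INVALIDATE.
 */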
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

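/*
 * Start a batch buffer as non-secure by default; the non-secure bit is
 * cleared when the caller requests a secure dispatch.
 */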
int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

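/*
 * Haswell variant of the batch buffer start: uses the HSW-specific
 * PPGTT and non-secure bits, again dropped for secure dispatches.
 */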
int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

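/* Emit a stand-alone CS-stall PIPE_CONTROL (stall at pixel scoreboard). */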
static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

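/*
 * Full PIPE_CONTROL flush/invalidate for the gen7 render ring; a
 * CS stall and post-sync write are always included (see the workaround
 * notes below).
 */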
int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

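/*
 * Gen7 render breadcrumb: a single flushing PIPE_CONTROL that writes
 * the request's seqno to the timeline's HWSP slot, followed by
 * MI_USER_INTERRUPT.
 */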
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_timeline(rq)->hwsp_offset;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

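/*
 * Gen6 breadcrumb for the non-render rings: MI_FLUSH_DW with a
 * post-sync store of the seqno into the status page, then the user
 * interrupt.
 */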
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

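/*
 * Gen7 xcs breadcrumb. As a workaround the seqno store is repeated
 * GEN7_XCS_WA times and followed by a final MI_FLUSH_DW before the
 * user interrupt is raised.
 */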
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA

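/*
 * Engine interrupt enable/disable: unmask (or mask) the ring's IMR and
 * then propagate the change to the GT-level interrupt mask.
 */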
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

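/*
 * Haswell VECS interrupts are masked/unmasked via the GT PM interrupt
 * mask (gen6_gt_pm_*_irq) rather than the ordinary GT IMR path.
 */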
void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}