// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_ring_ops.h"

#include <generated/xe_wa_oob.h>

#include "instructions/xe_gpu_commands.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_gt_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_lrc.h"
#include "xe_macros.h"
#include "xe_sched_job.h"
#include "xe_sriov.h"
#include "xe_vm_types.h"
#include "xe_vm.h"
#include "xe_wa.h"

/*
 * 3D-related flags that can't be set on _engines_ that lack access to the 3D
 * pipeline (i.e., CCS engines).
 */
#define PIPE_CONTROL_3D_ENGINE_FLAGS ( \
		PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | \
		PIPE_CONTROL_DEPTH_CACHE_FLUSH | \
		PIPE_CONTROL_TILE_CACHE_FLUSH | \
		PIPE_CONTROL_DEPTH_STALL | \
		PIPE_CONTROL_STALL_AT_SCOREBOARD | \
		PIPE_CONTROL_PSD_SYNC | \
		PIPE_CONTROL_AMFS_FLUSH | \
		PIPE_CONTROL_VF_CACHE_INVALIDATE | \
		PIPE_CONTROL_GLOBAL_SNAPSHOT_RESET)

/* 3D-related flags that can't be set on _platforms_ that lack a 3D pipeline */
#define PIPE_CONTROL_3D_ARCH_FLAGS ( \
		PIPE_CONTROL_3D_ENGINE_FLAGS | \
		PIPE_CONTROL_INDIRECT_STATE_DISABLE | \
		PIPE_CONTROL_FLUSH_ENABLE | \
		PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \
		PIPE_CONTROL_DC_FLUSH_ENABLE)

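/*
 * MI_ARB_CHECK doubles as the pre-parser control on Gen12+: bit 8 masks in
 * bit 0, which carries the desired state (1 = disable pre-fetching of
 * subsequent commands).
 */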
static u32 preparser_disable(bool state)
{
	return MI_ARB_CHECK | BIT(8) | state;
}

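/* Invalidate an engine's AUX table via an MMIO-remapped LRI of AUX_INV */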
static int emit_aux_table_inv(struct xe_gt *gt, struct xe_reg reg,
			      u32 *dw, int i)
{
	dw[i++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1) | MI_LRI_MMIO_REMAP_EN;
	dw[i++] = reg.addr + gt->mmio.adj_offset;
	dw[i++] = AUX_INV;
	dw[i++] = MI_NOOP;

	return i;
}

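/* Signal completion to the CPU and re-enable command arbitration */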
static int emit_user_interrupt(u32 *dw, int i)
{
	dw[i++] = MI_USER_INTERRUPT;
	dw[i++] = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	dw[i++] = MI_ARB_CHECK;

	return i;
}

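/* Post an immediate dword write to a GGTT address */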
static int emit_store_imm_ggtt(u32 addr, u32 value, u32 *dw, int i)
{
	dw[i++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	dw[i++] = addr;
	dw[i++] = 0;
	dw[i++] = value;

	return i;
}

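/* MI_FLUSH_DW with no post-sync op; the trailing dwords are the unused address/data fields */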
static int emit_flush_dw(u32 *dw, int i)
{
	dw[i++] = MI_FLUSH_DW | MI_FLUSH_IMM_DW;
	dw[i++] = 0;
	dw[i++] = 0;
	dw[i++] = 0;

	return i;
}

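/* Flush, optionally invalidating the TLB, with a posted dword write to a GGTT address */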
static int emit_flush_imm_ggtt(u32 addr, u32 value, bool invalidate_tlb,
			       u32 *dw, int i)
{
	dw[i++] = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW |
		(invalidate_tlb ? MI_INVALIDATE_TLB : 0);
	dw[i++] = addr | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = value;

	return i;
}

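/* Chain into the batch buffer; ppgtt_flag selects PPGTT vs. GGTT addressing */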
static int emit_bb_start(u64 batch_addr, u32 ppgtt_flag, u32 *dw, int i)
{
	dw[i++] = MI_BATCH_BUFFER_START | ppgtt_flag | XE_INSTR_NUM_DW(3);
	dw[i++] = lower_32_bits(batch_addr);
	dw[i++] = upper_32_bits(batch_addr);

	return i;
}

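/* Flush and invalidate the TLB, posting a throwaway write to the PPHWSP scratch slot */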
static int emit_flush_invalidate(u32 flag, u32 *dw, int i)
{
	dw[i] = MI_FLUSH_DW;
	dw[i] |= flag;
	dw[i++] |= MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW |
		MI_FLUSH_DW_STORE_INDEX;

	dw[i++] = LRC_PPHWSP_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = ~0U;

	return i;
}

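/* Emit a 6-dword PIPE_CONTROL; bit_group_0/1 fill the command's two flag dwords */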
static int
emit_pipe_control(u32 *dw, int i, u32 bit_group_0, u32 bit_group_1, u32 offset, u32 value)
{
	dw[i++] = GFX_OP_PIPE_CONTROL(6) | bit_group_0;
	dw[i++] = bit_group_1;
	dw[i++] = offset;
	dw[i++] = 0;
	dw[i++] = value;
	dw[i++] = 0;

	return i;
}

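/*
 * Invalidate caches (and optionally the TLB) ahead of a batch, with
 * mask_flags removing anything the engine or platform can't accept.
 */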
static int emit_pipe_invalidate(u32 mask_flags, bool invalidate_tlb, u32 *dw,
				int i)
{
	u32 flags = PIPE_CONTROL_CS_STALL |
		PIPE_CONTROL_COMMAND_CACHE_INVALIDATE |
		PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
		PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
		PIPE_CONTROL_VF_CACHE_INVALIDATE |
		PIPE_CONTROL_CONST_CACHE_INVALIDATE |
		PIPE_CONTROL_STATE_CACHE_INVALIDATE |
		PIPE_CONTROL_QW_WRITE |
		PIPE_CONTROL_STORE_DATA_INDEX;

	if (invalidate_tlb)
		flags |= PIPE_CONTROL_TLB_INVALIDATE;

	flags &= ~mask_flags;

	return emit_pipe_control(dw, i, 0, flags, LRC_PPHWSP_SCRATCH_ADDR, 0);
}

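/* Posted qword write through the job's PPGTT, used to signal user fences */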
static int emit_store_imm_ppgtt_posted(u64 addr, u64 value,
				       u32 *dw, int i)
{
	dw[i++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(1);
	dw[i++] = lower_32_bits(addr);
	dw[i++] = upper_32_bits(addr);
	dw[i++] = lower_32_bits(value);
	dw[i++] = upper_32_bits(value);

	return i;
}

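/* Flush the render caches behind a batch, trimmed to what the engine/platform supports */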
static int emit_render_cache_flush(struct xe_sched_job *job, u32 *dw, int i)
{
	struct xe_gt *gt = job->q->gt;
	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
	u32 flags;

	flags = (PIPE_CONTROL_CS_STALL |
		 PIPE_CONTROL_TILE_CACHE_FLUSH |
		 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE);

	if (XE_WA(gt, 1409600907))
		flags |= PIPE_CONTROL_DEPTH_STALL;

	if (lacks_render)
		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (job->q->class == XE_ENGINE_CLASS_COMPUTE)
		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

	return emit_pipe_control(dw, i, PIPE_CONTROL0_HDC_PIPELINE_FLUSH, flags, 0, 0);
}

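/* Wa_16020292621: the render ring must end in a PIPE_CONTROL with an LRI post-sync to RING_NOPID */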
static int emit_pipe_control_to_ring_end(struct xe_hw_engine *hwe, u32 *dw, int i)
{
	if (hwe->class != XE_ENGINE_CLASS_RENDER)
		return i;

	if (XE_WA(hwe->gt, 16020292621))
		i = emit_pipe_control(dw, i, 0, PIPE_CONTROL_LRI_POST_SYNC,
				      RING_NOPID(hwe->mmio_base).addr, 0);

	return i;
}

static int emit_pipe_imm_ggtt(u32 addr, u32 value, bool stall_only, u32 *dw,
			      int i)
{
	u32 flags = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_GLOBAL_GTT_IVB |
		    PIPE_CONTROL_QW_WRITE;

	if (!stall_only)
		flags |= PIPE_CONTROL_FLUSH_ENABLE;

	return emit_pipe_control(dw, i, 0, flags, addr, value);
}

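/* BIT(8) in MI_BATCH_BUFFER_START selects PPGTT addressing for VM-bound, non-GGTT jobs */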
static u32 get_ppgtt_flag(struct xe_sched_job *job)
{
	if (job->q->vm && !job->ggtt)
		return BIT(8);

	return 0;
}

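/* Copy the LRC's context timestamp into its job timestamp slot (both GGTT-mapped) */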
static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i)
{
	dw[i++] = MI_COPY_MEM_MEM | MI_COPY_MEM_MEM_SRC_GGTT |
		MI_COPY_MEM_MEM_DST_GGTT;
	dw[i++] = xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
	dw[i++] = 0;
	dw[i++] = xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	dw[i++] = 0;
	dw[i++] = MI_NOOP;

	return i;
}

/* for engines that don't require any special HW handling (no EUs, no aux inval, etc) */
static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc,
				    u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_gt *gt = job->q->gt;

	i = emit_copy_timestamp(lrc, dw, i);

	if (job->ring_ops_flush_tlb) {
		dw[i++] = preparser_disable(true);
		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, true, dw, i);
		dw[i++] = preparser_disable(false);
	} else {
		i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, dw, i);
	}

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	if (job->user_fence.used) {
		i = emit_flush_dw(dw, i);
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);
	}

	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i);

	i = emit_user_interrupt(dw, i);

	xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static bool has_aux_ccs(struct xe_device *xe)
{
	/*
	 * PVC is a special case that has no compression of either type
	 * (FlatCCS or AuxCCS).  Also, AuxCCS is no longer used from Xe2
	 * onward, so any future platforms with no FlatCCS will not have
	 * AuxCCS either.
	 */
	if (GRAPHICS_VER(xe) >= 20 || xe->info.platform == XE_PVC)
		return false;

	return !xe->info.has_flat_ccs;
}

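/* Like the simple path, but with AUX table invalidation on AuxCCS platforms */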
static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
				   u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_gt *gt = job->q->gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;

	i = emit_copy_timestamp(lrc, dw, i);

	dw[i++] = preparser_disable(true);

	/* hsdes: 1809175790 */
	if (has_aux_ccs(xe)) {
		if (decode)
			i = emit_aux_table_inv(gt, VD0_AUX_INV, dw, i);
		else
			i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
	}

	if (job->ring_ops_flush_tlb)
		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, true, dw, i);

	dw[i++] = preparser_disable(false);

	if (!job->ring_ops_flush_tlb)
		i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	if (job->user_fence.used) {
		i = emit_flush_dw(dw, i);
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);
	}

	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i);

	i = emit_user_interrupt(dw, i);

	xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

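/* Render/compute path: PIPE_CONTROL-based invalidation in front of the batch, cache flushing behind it */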
static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
					    struct xe_lrc *lrc,
					    u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_gt *gt = job->q->gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
	u32 mask_flags = 0;

	i = emit_copy_timestamp(lrc, dw, i);

	dw[i++] = preparser_disable(true);
	if (lacks_render)
		mask_flags = PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (job->q->class == XE_ENGINE_CLASS_COMPUTE)
		mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;

	/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
	i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i);

	/* hsdes: 1809175790 */
	if (has_aux_ccs(xe))
		i = emit_aux_table_inv(gt, CCS_AUX_INV, dw, i);

	dw[i++] = preparser_disable(false);

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	i = emit_render_cache_flush(job, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_pipe_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, lacks_render, dw, i);

	i = emit_user_interrupt(dw, i);

	i = emit_pipe_control_to_ring_end(job->q->hwe, dw, i);

	xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

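/* Migration jobs chain two batches, with a flush/invalidate between them */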
static void emit_migration_job_gen12(struct xe_sched_job *job,
				     struct xe_lrc *lrc, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;

	i = emit_copy_timestamp(lrc, dw, i);

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* Enabled again below */

	i = emit_bb_start(job->ptrs[0].batch_addr, BIT(8), dw, i);

	if (!IS_SRIOV_VF(gt_to_xe(job->q->gt))) {
		/* XXX: Do we need this? Leaving for now. */
		dw[i++] = preparser_disable(true);
		i = emit_flush_invalidate(0, dw, i);
		dw[i++] = preparser_disable(false);
	}

	i = emit_bb_start(job->ptrs[1].batch_addr, BIT(8), dw, i);

	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | job->migrate_flush_flags |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW;
	dw[i++] = xe_lrc_seqno_ggtt_addr(lrc) | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = seqno; /* value */

	i = emit_user_interrupt(dw, i);

	xe_gt_assert(job->q->gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static void emit_job_gen12_gsc(struct xe_sched_job *job)
{
	struct xe_gt *gt = job->q->gt;

	xe_gt_assert(gt, job->q->width <= 1); /* no parallel submission for GSCCS */

	__emit_job_gen12_simple(job, job->q->lrc[0],
				job->ptrs[0].batch_addr,
				xe_sched_job_lrc_seqno(job));
}

static void emit_job_gen12_copy(struct xe_sched_job *job)
{
	int i;

	if (xe_sched_job_is_migration(job->q)) {
		emit_migration_job_gen12(job, job->q->lrc[0],
					 xe_sched_job_lrc_seqno(job));
		return;
	}

	for (i = 0; i < job->q->width; ++i)
		__emit_job_gen12_simple(job, job->q->lrc[i],
					job->ptrs[i].batch_addr,
					xe_sched_job_lrc_seqno(job));
}

static void emit_job_gen12_video(struct xe_sched_job *job)
{
	int i;

	/* FIXME: Not doing parallel handshake for now */
	for (i = 0; i < job->q->width; ++i)
		__emit_job_gen12_video(job, job->q->lrc[i],
				       job->ptrs[i].batch_addr,
				       xe_sched_job_lrc_seqno(job));
}

static void emit_job_gen12_render_compute(struct xe_sched_job *job)
{
	int i;

	for (i = 0; i < job->q->width; ++i)
		__emit_job_gen12_render_compute(job, job->q->lrc[i],
						job->ptrs[i].batch_addr,
						xe_sched_job_lrc_seqno(job));
}

static const struct xe_ring_ops ring_ops_gen12_gsc = {
	.emit_job = emit_job_gen12_gsc,
};

static const struct xe_ring_ops ring_ops_gen12_copy = {
	.emit_job = emit_job_gen12_copy,
};

static const struct xe_ring_ops ring_ops_gen12_video = {
	.emit_job = emit_job_gen12_video,
};

static const struct xe_ring_ops ring_ops_gen12_render_compute = {
	.emit_job = emit_job_gen12_render_compute,
};

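/**
 * xe_ring_ops_get() - Ring operations for an engine class
 * @gt: GT the engine belongs to (unused here)
 * @class: hardware engine class
 *
 * Return: the &struct xe_ring_ops used to emit jobs for @class,
 * or NULL if the class is unsupported.
 */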
const struct xe_ring_ops *
xe_ring_ops_get(struct xe_gt *gt, enum xe_engine_class class)
{
	switch (class) {
	case XE_ENGINE_CLASS_OTHER:
		return &ring_ops_gen12_gsc;
	case XE_ENGINE_CLASS_COPY:
		return &ring_ops_gen12_copy;
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
		return &ring_ops_gen12_video;
	case XE_ENGINE_CLASS_RENDER:
	case XE_ENGINE_CLASS_COMPUTE:
		return &ring_ops_gen12_render_compute;
	default:
		return NULL;
	}
}