// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_ring_ops.h"

#include "generated/xe_wa_oob.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_gpu_commands.h"
#include "regs/xe_gt_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_lrc.h"
#include "xe_macros.h"
#include "xe_sched_job.h"
#include "xe_vm_types.h"
#include "xe_vm.h"
#include "xe_wa.h"

/*
 * 3D-related flags that can't be set on _engines_ that lack access to the 3D
 * pipeline (i.e., CCS engines).
 */
#define PIPE_CONTROL_3D_ENGINE_FLAGS (\
		PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | \
		PIPE_CONTROL_DEPTH_CACHE_FLUSH | \
		PIPE_CONTROL_TILE_CACHE_FLUSH | \
		PIPE_CONTROL_DEPTH_STALL | \
		PIPE_CONTROL_STALL_AT_SCOREBOARD | \
		PIPE_CONTROL_PSD_SYNC | \
		PIPE_CONTROL_AMFS_FLUSH | \
		PIPE_CONTROL_VF_CACHE_INVALIDATE | \
		PIPE_CONTROL_GLOBAL_SNAPSHOT_RESET)

/* 3D-related flags that can't be set on _platforms_ that lack a 3D pipeline */
#define PIPE_CONTROL_3D_ARCH_FLAGS ( \
		PIPE_CONTROL_3D_ENGINE_FLAGS | \
		PIPE_CONTROL_INDIRECT_STATE_DISABLE | \
		PIPE_CONTROL_FLUSH_ENABLE | \
		PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \
		PIPE_CONTROL_DC_FLUSH_ENABLE)

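/*
 * Toggle the command streamer pre-parser (instruction pre-fetch) around
 * invalidations. Bit 8 of MI_ARB_CHECK appears to act as the mask/write-enable
 * for the pre-fetch disable value carried in bit 0, so the same opcode both
 * disables and re-enables pre-fetching depending on @state.
 */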
static u32 preparser_disable(bool state)
{
	return MI_ARB_CHECK | BIT(8) | state;
}

static int emit_aux_table_inv(struct xe_gt *gt, struct xe_reg reg,
			      u32 *dw, int i)
{
	dw[i++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1) | MI_LRI_MMIO_REMAP_EN;
	dw[i++] = reg.addr + gt->mmio.adj_offset;
	dw[i++] = AUX_INV;
	dw[i++] = MI_NOOP;

	return i;
}

static int emit_user_interrupt(u32 *dw, int i)
{
	dw[i++] = MI_USER_INTERRUPT;
	dw[i++] = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	dw[i++] = MI_ARB_CHECK;

	return i;
}

static int emit_store_imm_ggtt(u32 addr, u32 value, u32 *dw, int i)
{
	dw[i++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	dw[i++] = addr;
	dw[i++] = 0;
	dw[i++] = value;

	return i;
}

static int emit_flush_imm_ggtt(u32 addr, u32 value, bool invalidate_tlb,
			       u32 *dw, int i)
{
	dw[i++] = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW |
		(invalidate_tlb ? MI_INVALIDATE_TLB : 0);
	dw[i++] = addr | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = value;

	return i;
}

static int emit_bb_start(u64 batch_addr, u32 ppgtt_flag, u32 *dw, int i)
{
	dw[i++] = MI_BATCH_BUFFER_START | ppgtt_flag | XE_INSTR_NUM_DW(3);
	dw[i++] = lower_32_bits(batch_addr);
	dw[i++] = upper_32_bits(batch_addr);

	return i;
}

static int emit_flush_invalidate(u32 flag, u32 *dw, int i)
{
	dw[i] = MI_FLUSH_DW;
	dw[i] |= flag;
	dw[i++] |= MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW |
		MI_FLUSH_DW_STORE_INDEX;

	dw[i++] = LRC_PPHWSP_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = ~0U;

	return i;
}

static int emit_pipe_invalidate(u32 mask_flags, bool invalidate_tlb, u32 *dw,
				int i)
{
	u32 flags = PIPE_CONTROL_CS_STALL |
		PIPE_CONTROL_COMMAND_CACHE_INVALIDATE |
		PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
		PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
		PIPE_CONTROL_VF_CACHE_INVALIDATE |
		PIPE_CONTROL_CONST_CACHE_INVALIDATE |
		PIPE_CONTROL_STATE_CACHE_INVALIDATE |
		PIPE_CONTROL_QW_WRITE |
		PIPE_CONTROL_STORE_DATA_INDEX;

	if (invalidate_tlb)
		flags |= PIPE_CONTROL_TLB_INVALIDATE;

	flags &= ~mask_flags;

	dw[i++] = GFX_OP_PIPE_CONTROL(6);
	dw[i++] = flags;
	dw[i++] = LRC_PPHWSP_SCRATCH_ADDR;
	dw[i++] = 0;
	dw[i++] = 0;
	dw[i++] = 0;

	return i;
}

static int emit_store_imm_ppgtt_posted(u64 addr, u64 value,
				       u32 *dw, int i)
{
	dw[i++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(1);
	dw[i++] = lower_32_bits(addr);
	dw[i++] = upper_32_bits(addr);
	dw[i++] = lower_32_bits(value);
	dw[i++] = upper_32_bits(value);

	return i;
}

static int emit_render_cache_flush(struct xe_sched_job *job, u32 *dw, int i)
{
	struct xe_gt *gt = job->q->gt;
	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
	u32 flags;

	flags = (PIPE_CONTROL_CS_STALL |
		 PIPE_CONTROL_TILE_CACHE_FLUSH |
		 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE);

	if (XE_WA(gt, 1409600907))
		flags |= PIPE_CONTROL_DEPTH_STALL;

	if (lacks_render)
		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (job->q->class == XE_ENGINE_CLASS_COMPUTE)
		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

	dw[i++] = GFX_OP_PIPE_CONTROL(6) | PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
	dw[i++] = flags;
	dw[i++] = 0;
	dw[i++] = 0;
	dw[i++] = 0;
	dw[i++] = 0;

	return i;
}

static int emit_pipe_control_to_ring_end(struct xe_hw_engine *hwe, u32 *dw, int i)
{
	if (hwe->class != XE_ENGINE_CLASS_RENDER)
		return i;

	if (XE_WA(hwe->gt, 16020292621)) {
		dw[i++] = GFX_OP_PIPE_CONTROL(6);
		dw[i++] = PIPE_CONTROL_LRI_POST_SYNC;
		dw[i++] = RING_NOPID(hwe->mmio_base).addr;
		dw[i++] = 0;
		dw[i++] = 0;
		dw[i++] = 0;
	}

	return i;
}

static int emit_pipe_imm_ggtt(u32 addr, u32 value, bool stall_only, u32 *dw,
			      int i)
{
	dw[i++] = GFX_OP_PIPE_CONTROL(6);
	dw[i++] = (stall_only ? PIPE_CONTROL_CS_STALL :
		   PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL) |
		PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_QW_WRITE;
	dw[i++] = addr;
	dw[i++] = 0;
	dw[i++] = value;
	dw[i++] = 0; /* We're thrashing one extra dword. */

	return i;
}

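/*
 * BIT(8) in MI_BATCH_BUFFER_START is the address-space select: set it when the
 * queue has a VM so the batch is fetched through the PPGTT, leave it clear for
 * GGTT-only (kernel) submissions.
 */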
static u32 get_ppgtt_flag(struct xe_sched_job *job)
{
	return job->q->vm ? BIT(8) : 0;
}

/* for engines that don't require any special HW handling (no EUs, no aux inval, etc) */
static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc,
				    u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_vm *vm = job->q->vm;
	struct xe_gt *gt = job->q->gt;

	if (vm && vm->batch_invalidate_tlb) {
		dw[i++] = preparser_disable(true);
		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, true, dw, i);
		dw[i++] = preparser_disable(false);
	} else {
		i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, dw, i);
	}

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i);

	i = emit_user_interrupt(dw, i);

	xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static bool has_aux_ccs(struct xe_device *xe)
{
	/*
	 * PVC is a special case that has no compression of either type
	 * (FlatCCS or AuxCCS).  Also, AuxCCS is no longer used from Xe2
	 * onward, so any future platforms with no FlatCCS will not have
	 * AuxCCS either.
	 */
	if (GRAPHICS_VER(xe) >= 20 || xe->info.platform == XE_PVC)
		return false;

	return !xe->info.has_flat_ccs;
}

static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
				   u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_gt *gt = job->q->gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
	struct xe_vm *vm = job->q->vm;

	dw[i++] = preparser_disable(true);

	/* hsdes: 1809175790 */
	if (has_aux_ccs(xe)) {
		if (decode)
			i = emit_aux_table_inv(gt, VD0_AUX_INV, dw, i);
		else
			i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
	}

	if (vm && vm->batch_invalidate_tlb)
		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, true, dw, i);

	dw[i++] = preparser_disable(false);

	if (!vm || !vm->batch_invalidate_tlb)
		i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i);

	i = emit_user_interrupt(dw, i);

	xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
					    struct xe_lrc *lrc,
					    u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_gt *gt = job->q->gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
	struct xe_vm *vm = job->q->vm;
	u32 mask_flags = 0;

	dw[i++] = preparser_disable(true);
	if (lacks_render)
		mask_flags = PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (job->q->class == XE_ENGINE_CLASS_COMPUTE)
		mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;

	/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
	i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i);

	/* hsdes: 1809175790 */
	if (has_aux_ccs(xe))
		i = emit_aux_table_inv(gt, CCS_AUX_INV, dw, i);

	dw[i++] = preparser_disable(false);

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	i = emit_render_cache_flush(job, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_pipe_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, lacks_render, dw, i);

	i = emit_user_interrupt(dw, i);

	i = emit_pipe_control_to_ring_end(job->q->hwe, dw, i);

	xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static void emit_migration_job_gen12(struct xe_sched_job *job,
				     struct xe_lrc *lrc, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* Enabled again below */

	i = emit_bb_start(job->batch_addr[0], BIT(8), dw, i);

	/* XXX: Do we need this? Leaving for now. */
	dw[i++] = preparser_disable(true);
	i = emit_flush_invalidate(0, dw, i);
	dw[i++] = preparser_disable(false);

	i = emit_bb_start(job->batch_addr[1], BIT(8), dw, i);

	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | job->migrate_flush_flags |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW;
	dw[i++] = xe_lrc_seqno_ggtt_addr(lrc) | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = seqno; /* value */

	i = emit_user_interrupt(dw, i);

	xe_gt_assert(job->q->gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static void emit_job_gen12_gsc(struct xe_sched_job *job)
{
	struct xe_gt *gt = job->q->gt;

	xe_gt_assert(gt, job->q->width <= 1); /* no parallel submission for GSCCS */

	__emit_job_gen12_simple(job, job->q->lrc,
				job->batch_addr[0],
				xe_sched_job_seqno(job));
}

static void emit_job_gen12_copy(struct xe_sched_job *job)
{
	int i;

	if (xe_sched_job_is_migration(job->q)) {
		emit_migration_job_gen12(job, job->q->lrc,
					 xe_sched_job_seqno(job));
		return;
	}

	for (i = 0; i < job->q->width; ++i)
		__emit_job_gen12_simple(job, job->q->lrc + i,
					job->batch_addr[i],
					xe_sched_job_seqno(job));
}

static void emit_job_gen12_video(struct xe_sched_job *job)
{
	int i;

	/* FIXME: Not doing parallel handshake for now */
	for (i = 0; i < job->q->width; ++i)
		__emit_job_gen12_video(job, job->q->lrc + i,
				       job->batch_addr[i],
				       xe_sched_job_seqno(job));
}

static void emit_job_gen12_render_compute(struct xe_sched_job *job)
{
	int i;

	for (i = 0; i < job->q->width; ++i)
		__emit_job_gen12_render_compute(job, job->q->lrc + i,
						job->batch_addr[i],
						xe_sched_job_seqno(job));
}

static const struct xe_ring_ops ring_ops_gen12_gsc = {
	.emit_job = emit_job_gen12_gsc,
};

static const struct xe_ring_ops ring_ops_gen12_copy = {
	.emit_job = emit_job_gen12_copy,
};

static const struct xe_ring_ops ring_ops_gen12_video = {
	.emit_job = emit_job_gen12_video,
};

static const struct xe_ring_ops ring_ops_gen12_render_compute = {
	.emit_job = emit_job_gen12_render_compute,
};

const struct xe_ring_ops *
xe_ring_ops_get(struct xe_gt *gt, enum xe_engine_class class)
{
	switch (class) {
	case XE_ENGINE_CLASS_OTHER:
		return &ring_ops_gen12_gsc;
	case XE_ENGINE_CLASS_COPY:
		return &ring_ops_gen12_copy;
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
		return &ring_ops_gen12_video;
	case XE_ENGINE_CLASS_RENDER:
	case XE_ENGINE_CLASS_COMPUTE:
		return &ring_ops_gen12_render_compute;
	default:
		return NULL;
	}
}