// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_ring_ops.h"

#include "generated/xe_wa_oob.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_gpu_commands.h"
#include "regs/xe_gt_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_lrc.h"
#include "xe_macros.h"
#include "xe_sched_job.h"
#include "xe_vm_types.h"
#include "xe_vm.h"
#include "xe_wa.h"

/*
 * 3D-related flags that can't be set on _engines_ that lack access to the 3D
 * pipeline (i.e., CCS engines).
 */
#define PIPE_CONTROL_3D_ENGINE_FLAGS ( \
		PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | \
		PIPE_CONTROL_DEPTH_CACHE_FLUSH | \
		PIPE_CONTROL_TILE_CACHE_FLUSH | \
		PIPE_CONTROL_DEPTH_STALL | \
		PIPE_CONTROL_STALL_AT_SCOREBOARD | \
		PIPE_CONTROL_PSD_SYNC | \
		PIPE_CONTROL_AMFS_FLUSH | \
		PIPE_CONTROL_VF_CACHE_INVALIDATE | \
		PIPE_CONTROL_GLOBAL_SNAPSHOT_RESET)

/* 3D-related flags that can't be set on _platforms_ that lack a 3D pipeline */
#define PIPE_CONTROL_3D_ARCH_FLAGS ( \
		PIPE_CONTROL_3D_ENGINE_FLAGS | \
		PIPE_CONTROL_INDIRECT_STATE_DISABLE | \
		PIPE_CONTROL_FLUSH_ENABLE | \
		PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \
		PIPE_CONTROL_DC_FLUSH_ENABLE)

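/*
 * On these platforms MI_ARB_CHECK also carries a pre-fetch parser control:
 * bit 8 unmasks the pre-parser disable field and 'state' sets (true) or
 * clears (false) it.
 */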
static u32 preparser_disable(bool state)
{
	return MI_ARB_CHECK | BIT(8) | state;
}

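/*
 * Emit an MI_LOAD_REGISTER_IMM that writes AUX_INV to the engine's AUX table
 * invalidation register (offset adjusted by the GT's MMIO remap), padded to an
 * even number of dwords with a NOOP.
 */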
static int emit_aux_table_inv(struct xe_gt *gt, struct xe_reg reg,
			      u32 *dw, int i)
{
	dw[i++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1) | MI_LRI_MMIO_REMAP_EN;
	dw[i++] = reg.addr + gt->mmio.adj_offset;
	dw[i++] = AUX_INV;
	dw[i++] = MI_NOOP;

	return i;
}

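/* Tail of every job: raise MI_USER_INTERRUPT and re-enable arbitration. */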
static int emit_user_interrupt(u32 *dw, int i)
{
	dw[i++] = MI_USER_INTERRUPT;
	dw[i++] = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	dw[i++] = MI_ARB_CHECK;

	return i;
}

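/* Store a single immediate dword to a GGTT address. */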
static int emit_store_imm_ggtt(u32 addr, u32 value, u32 *dw, int i)
{
	dw[i++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	dw[i++] = addr;
	dw[i++] = 0;
	dw[i++] = value;

	return i;
}

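/*
 * MI_FLUSH_DW with a post-sync immediate dword write to a GGTT address,
 * optionally invalidating the TLB as well.
 */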
static int emit_flush_imm_ggtt(u32 addr, u32 value, bool invalidate_tlb,
			       u32 *dw, int i)
{
	dw[i++] = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW |
		(invalidate_tlb ? MI_INVALIDATE_TLB : 0);
	dw[i++] = addr | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = value;

	return i;
}

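/*
 * Chain into the job's batch buffer. ppgtt_flag (BIT(8) of
 * MI_BATCH_BUFFER_START) selects PPGTT rather than GGTT addressing.
 */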
static int emit_bb_start(u64 batch_addr, u32 ppgtt_flag, u32 *dw, int i)
{
	dw[i++] = MI_BATCH_BUFFER_START | ppgtt_flag | XE_INSTR_NUM_DW(3);
	dw[i++] = lower_32_bits(batch_addr);
	dw[i++] = upper_32_bits(batch_addr);

	return i;
}

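/*
 * MI_FLUSH_DW that invalidates the TLB and posts ~0 to the per-context PPHWSP
 * scratch slot (store-index mode); used between migration batch buffers.
 */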
static int emit_flush_invalidate(u32 flag, u32 *dw, int i)
{
	dw[i] = MI_FLUSH_DW;
	dw[i] |= flag;
	dw[i++] |= MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW |
		MI_FLUSH_DW_STORE_INDEX;

	dw[i++] = LRC_PPHWSP_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = ~0U;

	return i;
}

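/*
 * PIPE_CONTROL invalidating the main caches (and optionally the TLB) with a
 * posted qword write to the PPHWSP scratch address; mask_flags strips bits
 * that are invalid for the target engine or platform.
 */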
static int emit_pipe_invalidate(u32 mask_flags, bool invalidate_tlb, u32 *dw,
				int i)
{
	u32 flags = PIPE_CONTROL_CS_STALL |
		PIPE_CONTROL_COMMAND_CACHE_INVALIDATE |
		PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
		PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
		PIPE_CONTROL_VF_CACHE_INVALIDATE |
		PIPE_CONTROL_CONST_CACHE_INVALIDATE |
		PIPE_CONTROL_STATE_CACHE_INVALIDATE |
		PIPE_CONTROL_QW_WRITE |
		PIPE_CONTROL_STORE_DATA_INDEX;

	if (invalidate_tlb)
		flags |= PIPE_CONTROL_TLB_INVALIDATE;

	flags &= ~mask_flags;

	dw[i++] = GFX_OP_PIPE_CONTROL(6);
	dw[i++] = flags;
	dw[i++] = LRC_PPHWSP_SCRATCH_ADDR;
	dw[i++] = 0;
	dw[i++] = 0;
	dw[i++] = 0;

	return i;
}

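/* Posted qword store to a PPGTT address, used to signal user fences. */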
static int emit_store_imm_ppgtt_posted(u64 addr, u64 value,
				       u32 *dw, int i)
{
	dw[i++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(1);
	dw[i++] = lower_32_bits(addr);
	dw[i++] = upper_32_bits(addr);
	dw[i++] = lower_32_bits(value);
	dw[i++] = upper_32_bits(value);

	return i;
}

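/*
 * Post-batch PIPE_CONTROL flushing the render caches; 3D-only bits are
 * dropped on platforms without a render engine and on compute queues.
 */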
static int emit_render_cache_flush(struct xe_sched_job *job, u32 *dw, int i)
{
	struct xe_gt *gt = job->q->gt;
	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
	u32 flags;

	flags = (PIPE_CONTROL_CS_STALL |
		 PIPE_CONTROL_TILE_CACHE_FLUSH |
		 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE);

	if (XE_WA(gt, 1409600907))
		flags |= PIPE_CONTROL_DEPTH_STALL;

	if (lacks_render)
		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (job->q->class == XE_ENGINE_CLASS_COMPUTE)
		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

	dw[i++] = GFX_OP_PIPE_CONTROL(6) | PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
	dw[i++] = flags;
	dw[i++] = 0;
	dw[i++] = 0;
	dw[i++] = 0;
	dw[i++] = 0;

	return i;
}

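/*
 * Workaround 16020292621: on render engines, terminate the ring with a
 * PIPE_CONTROL whose LRI post-sync operation targets RING_NOPID.
 */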
static int emit_pipe_control_to_ring_end(struct xe_hw_engine *hwe, u32 *dw, int i)
{
	if (hwe->class != XE_ENGINE_CLASS_RENDER)
		return i;

	if (XE_WA(hwe->gt, 16020292621)) {
		dw[i++] = GFX_OP_PIPE_CONTROL(6);
		dw[i++] = PIPE_CONTROL_LRI_POST_SYNC;
		dw[i++] = RING_NOPID(hwe->mmio_base).addr;
		dw[i++] = 0;
		dw[i++] = 0;
		dw[i++] = 0;
	}

	return i;
}

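/*
 * PIPE_CONTROL with a posted qword write of the seqno to a GGTT address;
 * stall_only emits only a CS stall instead of a full flush.
 */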
static int emit_pipe_imm_ggtt(u32 addr, u32 value, bool stall_only, u32 *dw,
			      int i)
{
	dw[i++] = GFX_OP_PIPE_CONTROL(6);
	dw[i++] = (stall_only ? PIPE_CONTROL_CS_STALL :
		   PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL) |
		PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_QW_WRITE;
	dw[i++] = addr;
	dw[i++] = 0;
	dw[i++] = value;
	dw[i++] = 0; /* We're thrashing one extra dword. */

	return i;
}

static u32 get_ppgtt_flag(struct xe_sched_job *job)
{
	return job->q->vm ? BIT(8) : 0;
}

/* for engines that don't require any special HW handling (no EUs, no aux inval, etc) */
static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc,
				    u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_vm *vm = job->q->vm;
	struct xe_gt *gt = job->q->gt;

	if (vm && vm->batch_invalidate_tlb) {
		dw[i++] = preparser_disable(true);
		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, true, dw, i);
		dw[i++] = preparser_disable(false);
	} else {
		i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, dw, i);
	}

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i);

	i = emit_user_interrupt(dw, i);

	xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static bool has_aux_ccs(struct xe_device *xe)
{
	/*
	 * PVC is a special case that has no compression of either type
	 * (FlatCCS or AuxCCS). Also, AuxCCS is no longer used from Xe2
	 * onward, so any future platforms with no FlatCCS will not have
	 * AuxCCS either.
	 */
	if (GRAPHICS_VER(xe) >= 20 || xe->info.platform == XE_PVC)
		return false;

	return !xe->info.has_flat_ccs;
}

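/*
 * Video decode/enhance path: with the pre-parser disabled, invalidate the
 * AuxCCS table (if applicable) and optionally the TLB, then start the batch,
 * write any user fence, flush with the seqno and raise the interrupt.
 */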
static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
				   u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_gt *gt = job->q->gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
	struct xe_vm *vm = job->q->vm;

	dw[i++] = preparser_disable(true);

	/* hsdes: 1809175790 */
	if (has_aux_ccs(xe)) {
		if (decode)
			i = emit_aux_table_inv(gt, VD0_AUX_INV, dw, i);
		else
			i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
	}

	if (vm && vm->batch_invalidate_tlb)
		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, true, dw, i);

	dw[i++] = preparser_disable(false);

	if (!vm || !vm->batch_invalidate_tlb)
		i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
					seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i);

	i = emit_user_interrupt(dw, i);

	xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

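/*
 * Render/compute path: invalidate the caches (and optionally the TLB) and the
 * AuxCCS table under pre-parser disable, run the batch, flush the render
 * caches, write any user fence, then post the seqno via PIPE_CONTROL and
 * raise the interrupt.
 */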
static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
					    struct xe_lrc *lrc,
					    u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_gt *gt = job->q->gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
	struct xe_vm *vm = job->q->vm;
	u32 mask_flags = 0;

	dw[i++] = preparser_disable(true);
	if (lacks_render)
		mask_flags = PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (job->q->class == XE_ENGINE_CLASS_COMPUTE)
		mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;

	/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
	i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i);

	/* hsdes: 1809175790 */
	if (has_aux_ccs(xe))
		i = emit_aux_table_inv(gt, CCS_AUX_INV, dw, i);

	dw[i++] = preparser_disable(false);

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	i = emit_render_cache_flush(job, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_pipe_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, lacks_render, dw, i);

	i = emit_user_interrupt(dw, i);

	i = emit_pipe_control_to_ring_end(job->q->hwe, dw, i);

	xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

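/*
 * Migration jobs submit two batch buffers back to back with arbitration
 * disabled, separated by a flush/TLB invalidation, and finish with a flushing
 * seqno write.
 */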
static void emit_migration_job_gen12(struct xe_sched_job *job,
				     struct xe_lrc *lrc, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* Enabled again below */

	i = emit_bb_start(job->batch_addr[0], BIT(8), dw, i);

	/* XXX: Do we need this? Leaving for now. */
	dw[i++] = preparser_disable(true);
	i = emit_flush_invalidate(0, dw, i);
	dw[i++] = preparser_disable(false);

	i = emit_bb_start(job->batch_addr[1], BIT(8), dw, i);

	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | job->migrate_flush_flags |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW;
	dw[i++] = xe_lrc_seqno_ggtt_addr(lrc) | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = seqno; /* value */

	i = emit_user_interrupt(dw, i);

	xe_gt_assert(job->q->gt, i <= MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static void emit_job_gen12_gsc(struct xe_sched_job *job)
{
	struct xe_gt *gt = job->q->gt;

	xe_gt_assert(gt, job->q->width <= 1); /* no parallel submission for GSCCS */

	__emit_job_gen12_simple(job, job->q->lrc,
				job->batch_addr[0],
				xe_sched_job_seqno(job));
}

static void emit_job_gen12_copy(struct xe_sched_job *job)
{
	int i;

	if (xe_sched_job_is_migration(job->q)) {
		emit_migration_job_gen12(job, job->q->lrc,
					 xe_sched_job_seqno(job));
		return;
	}

	for (i = 0; i < job->q->width; ++i)
		__emit_job_gen12_simple(job, job->q->lrc + i,
					job->batch_addr[i],
					xe_sched_job_seqno(job));
}

static void emit_job_gen12_video(struct xe_sched_job *job)
{
	int i;

	/* FIXME: Not doing parallel handshake for now */
	for (i = 0; i < job->q->width; ++i)
		__emit_job_gen12_video(job, job->q->lrc + i,
				       job->batch_addr[i],
				       xe_sched_job_seqno(job));
}

static void emit_job_gen12_render_compute(struct xe_sched_job *job)
{
	int i;

	for (i = 0; i < job->q->width; ++i)
		__emit_job_gen12_render_compute(job, job->q->lrc + i,
						job->batch_addr[i],
						xe_sched_job_seqno(job));
}

static const struct xe_ring_ops ring_ops_gen12_gsc = {
	.emit_job = emit_job_gen12_gsc,
};

static const struct xe_ring_ops ring_ops_gen12_copy = {
	.emit_job = emit_job_gen12_copy,
};

static const struct xe_ring_ops ring_ops_gen12_video = {
	.emit_job = emit_job_gen12_video,
};

static const struct xe_ring_ops ring_ops_gen12_render_compute = {
	.emit_job = emit_job_gen12_render_compute,
};

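/*
 * Return the ring ops for an engine class, or NULL if the class is not
 * handled.
 */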
const struct xe_ring_ops *
xe_ring_ops_get(struct xe_gt *gt, enum xe_engine_class class)
{
	switch (class) {
	case XE_ENGINE_CLASS_OTHER:
		return &ring_ops_gen12_gsc;
	case XE_ENGINE_CLASS_COPY:
		return &ring_ops_gen12_copy;
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
		return &ring_ops_gen12_video;
	case XE_ENGINE_CLASS_RENDER:
	case XE_ENGINE_CLASS_COMPUTE:
		return &ring_ops_gen12_render_compute;
	default:
		return NULL;
	}
}