1/*
2 * Copyright 2023 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include "kfd_debug.h"
24#include "kfd_device_queue_manager.h"
25#include "kfd_topology.h"
26#include <linux/file.h>
27#include <uapi/linux/kfd_ioctl.h>
28
29#define MAX_WATCH_ADDRESSES 4
30
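/*
 * Return the next pending, subscribed exception for the debugged process.
 * Queue exceptions are reported first, then device exceptions, then process
 * exceptions. Bits in exception_clear_mask are cleared from the reported
 * source. Returns -EAGAIN when nothing is pending.
 */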
31int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
32 unsigned int *queue_id,
33 unsigned int *gpu_id,
34 uint64_t exception_clear_mask,
35 uint64_t *event_status)
36{
37 struct process_queue_manager *pqm;
38 struct process_queue_node *pqn;
39 int i;
40
41 if (!(process && process->debug_trap_enabled))
42 return -ENODATA;
43
44 mutex_lock(&process->event_mutex);
45 *event_status = 0;
46 *queue_id = 0;
47 *gpu_id = 0;
48
49 /* find and report queue events */
50 pqm = &process->pqm;
51 list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
52 uint64_t tmp = process->exception_enable_mask;
53
54 if (!pqn->q)
55 continue;
56
57 tmp &= pqn->q->properties.exception_status;
58
59 if (!tmp)
60 continue;
61
62 *event_status = pqn->q->properties.exception_status;
63 *queue_id = pqn->q->properties.queue_id;
64 *gpu_id = pqn->q->device->id;
65 pqn->q->properties.exception_status &= ~exception_clear_mask;
66 goto out;
67 }
68
69 /* find and report device events */
70 for (i = 0; i < process->n_pdds; i++) {
71 struct kfd_process_device *pdd = process->pdds[i];
72 uint64_t tmp = process->exception_enable_mask
73 & pdd->exception_status;
74
75 if (!tmp)
76 continue;
77
78 *event_status = pdd->exception_status;
79 *gpu_id = pdd->dev->id;
80 pdd->exception_status &= ~exception_clear_mask;
81 goto out;
82 }
83
84 /* report process events */
85 if (process->exception_enable_mask & process->exception_status) {
86 *event_status = process->exception_status;
87 process->exception_status &= ~exception_clear_mask;
88 }
89
90out:
91 mutex_unlock(&process->event_mutex);
92 return *event_status ? 0 : -EAGAIN;
93}
94
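/*
 * Work handler that signals the debugger by writing a single byte to the
 * debug event file descriptor; used when kfd_dbg_ev_raise() is asked to
 * defer the write to a worker.
 */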
95void debug_event_write_work_handler(struct work_struct *work)
96{
97 struct kfd_process *process;
98
99 static const char write_data = '.';
100 loff_t pos = 0;
101
102 process = container_of(work,
103 struct kfd_process,
104 debug_event_workarea);
105
106 kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
107}
108
109/* Update process/device/queue exception status; write to the event file
110 * descriptor only if the raised exception is enabled (subscribed).
111 */
112bool kfd_dbg_ev_raise(uint64_t event_mask,
113 struct kfd_process *process, struct kfd_node *dev,
114 unsigned int source_id, bool use_worker,
115 void *exception_data, size_t exception_data_size)
116{
117 struct process_queue_manager *pqm;
118 struct process_queue_node *pqn;
119 int i;
120 static const char write_data = '.';
121 loff_t pos = 0;
122 bool is_subscribed = true;
123
124 if (!(process && process->debug_trap_enabled))
125 return false;
126
127 mutex_lock(&process->event_mutex);
128
129 if (event_mask & KFD_EC_MASK_DEVICE) {
130 for (i = 0; i < process->n_pdds; i++) {
131 struct kfd_process_device *pdd = process->pdds[i];
132
133 if (pdd->dev != dev)
134 continue;
135
136 pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
137
138 if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
139 if (!pdd->vm_fault_exc_data) {
140 pdd->vm_fault_exc_data = kmemdup(
141 exception_data,
142 exception_data_size,
143 GFP_KERNEL);
144 if (!pdd->vm_fault_exc_data)
145 pr_debug("Failed to allocate exception data memory\n");
146 } else {
147 pr_debug("Debugger exception data not saved\n");
148 print_hex_dump_bytes("exception data: ",
149 DUMP_PREFIX_OFFSET,
150 exception_data,
151 exception_data_size);
152 }
153 }
154 break;
155 }
156 } else if (event_mask & KFD_EC_MASK_PROCESS) {
157 process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
158 } else {
159 pqm = &process->pqm;
160 list_for_each_entry(pqn, &pqm->queues,
161 process_queue_list) {
162 int target_id;
163
164 if (!pqn->q)
165 continue;
166
167 target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
168 pqn->q->properties.queue_id :
169 pqn->q->doorbell_id;
170
171 if (pqn->q->device != dev || target_id != source_id)
172 continue;
173
174 pqn->q->properties.exception_status |= event_mask;
175 break;
176 }
177 }
178
179 if (process->exception_enable_mask & event_mask) {
180 if (use_worker)
181 schedule_work(&process->debug_event_workarea);
182 else
183 kernel_write(process->dbg_ev_file,
184 &write_data,
185 1,
186 &pos);
187 } else {
188 is_subscribed = false;
189 }
190
191 mutex_unlock(&process->event_mutex);
192
193 return is_subscribed;
194}
195
196/* set pending event queue entry from ring entry */
197bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
198 unsigned int pasid,
199 uint32_t doorbell_id,
200 uint64_t trap_mask,
201 void *exception_data,
202 size_t exception_data_size)
203{
204 struct kfd_process *p;
205 bool signaled_to_debugger_or_runtime = false;
206
207 p = kfd_lookup_process_by_pasid(pasid);
208
209 if (!p)
210 return false;
211
212 if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
213 exception_data, exception_data_size)) {
214 struct process_queue_manager *pqm;
215 struct process_queue_node *pqn;
216
217 if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
218 p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
219 mutex_lock(&p->mutex);
220
221 pqm = &p->pqm;
222 list_for_each_entry(pqn, &pqm->queues,
223 process_queue_list) {
224
225 if (!(pqn->q && pqn->q->device == dev &&
226 pqn->q->doorbell_id == doorbell_id))
227 continue;
228
229 kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
230 trap_mask);
231
232 signaled_to_debugger_or_runtime = true;
233
234 break;
235 }
236
237 mutex_unlock(&p->mutex);
238 } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
239 kfd_dqm_evict_pasid(dev->dqm, p->pasid);
240 kfd_signal_vm_fault_event(dev, p->pasid, NULL,
241 exception_data);
242
243 signaled_to_debugger_or_runtime = true;
244 }
245 } else {
246 signaled_to_debugger_or_runtime = true;
247 }
248
249 kfd_unref_process(p);
250
251 return signaled_to_debugger_or_runtime;
252}
253
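/*
 * Forward exceptions the debugger does not handle back to the HSA runtime:
 * a device memory violation evicts the PASID and raises a VM fault event
 * with any saved exception data, a pending runtime-enable wait is released,
 * and any remaining reason bits are delivered via the runtime exception event.
 */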
254int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
255 unsigned int dev_id,
256 unsigned int queue_id,
257 uint64_t error_reason)
258{
259 if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
260 struct kfd_process_device *pdd = NULL;
261 struct kfd_hsa_memory_exception_data *data;
262 int i;
263
264 for (i = 0; i < p->n_pdds; i++) {
265 if (p->pdds[i]->dev->id == dev_id) {
266 pdd = p->pdds[i];
267 break;
268 }
269 }
270
271 if (!pdd)
272 return -ENODEV;
273
274 data = (struct kfd_hsa_memory_exception_data *)
275 pdd->vm_fault_exc_data;
276
277 kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
278 kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
279 error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
280 }
281
282 if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
283 /*
284 * Unblocking the runtime enable wait should only happen after the
285 * debugger has received the runtime enable notice.
286 */
287 up(&p->runtime_enable_sema);
288 error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
289 }
290
291 if (error_reason)
292 return kfd_send_exception_to_runtime(p, queue_id, error_reason);
293
294 return 0;
295}
296
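/*
 * Toggle the CWSR debug workaround on a single queue by updating its MQD.
 * Only applies on devices that need the workaround; enabling fails with
 * -EBUSY if the queue has a user CU mask.
 */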
297static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
298{
299 struct mqd_update_info minfo = {0};
300 int err;
301
302 if (!q)
303 return 0;
304
305 if (!kfd_dbg_has_cwsr_workaround(q->device))
306 return 0;
307
308 if (enable && q->properties.is_user_cu_masked)
309 return -EBUSY;
310
311 minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
312
313 q->properties.is_dbg_wa = enable;
314 err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
315 if (err)
316 q->properties.is_dbg_wa = false;
317
318 return err;
319}
320
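/*
 * Apply or remove the CWSR debug workaround across all of the target
 * process's queues, unwinding and updating the runtime state on an
 * enable failure.
 */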
321static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
322{
323 struct process_queue_manager *pqm = &target->pqm;
324 struct process_queue_node *pqn;
325 int r = 0;
326
327 list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
328 r = kfd_dbg_set_queue_workaround(pqn->q, enable);
329 if (enable && r)
330 goto unwind;
331 }
332
333 return 0;
334
335unwind:
336 list_for_each_entry(pqn, &pqm->queues, process_queue_list)
337 kfd_dbg_set_queue_workaround(pqn->q, false);
338
339 if (enable)
340 target->runtime_info.runtime_state = r == -EBUSY ?
341 DEBUG_RUNTIME_STATE_ENABLED_BUSY :
342 DEBUG_RUNTIME_STATE_ENABLED_ERROR;
343
344 return r;
345}
346
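/*
 * Push the per-VMID debug configuration (SPI debug control, watch points,
 * debug flags) to the MES scheduler. No-op on devices without per-VMID
 * debug support.
 */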
347int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
348{
349 uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
350 uint32_t flags = pdd->process->dbg_flags;
351
352 if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
353 return 0;
354
355 return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
356 pdd->watch_points, flags, sq_trap_en);
357}
358
359#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
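/*
 * Reserve a free hardware watch point ID on the device for this process.
 * Allocation state is tracked in both the pdd and the device under
 * watch_points_lock. Returns -ENOMEM when all watch points are in use.
 */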
360static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
361{
362 int i;
363
364 *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
365
366 spin_lock(&pdd->dev->kfd->watch_points_lock);
367
368 for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
369 /* device watchpoint in use so skip */
370 if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
371 continue;
372
373 pdd->alloc_watch_ids |= 0x1 << i;
374 pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
375 *watch_id = i;
376 spin_unlock(&pdd->dev->kfd->watch_points_lock);
377 return 0;
378 }
379
380 spin_unlock(&pdd->dev->kfd->watch_points_lock);
381
382 return -ENOMEM;
383}
384
385static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
386{
387 spin_lock(&pdd->dev->kfd->watch_points_lock);
388
389 /* process owns device watch point so safe to clear */
390 if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
391 pdd->alloc_watch_ids &= ~(0x1 << watch_id);
392 pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
393 }
394
395 spin_unlock(&pdd->dev->kfd->watch_points_lock);
396}
397
398static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
399{
400 bool owns_watch_id = false;
401
402 spin_lock(&pdd->dev->kfd->watch_points_lock);
403 owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
404 ((pdd->alloc_watch_ids >> watch_id) & 0x1);
405
406 spin_unlock(&pdd->dev->kfd->watch_points_lock);
407
408 return owns_watch_id;
409}
410
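/*
 * Clear a hardware address watch point owned by this process: unmap queues
 * (when not using MES), program the clear with GFX OFF held off, then remap
 * or refresh MES debug mode and release the watch point ID.
 */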
411int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
412 uint32_t watch_id)
413{
414 int r;
415
416 if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
417 return -EINVAL;
418
419 if (!pdd->dev->kfd->shared_resources.enable_mes) {
420 r = debug_lock_and_unmap(pdd->dev->dqm);
421 if (r)
422 return r;
423 }
424
425 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
426 pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
427 pdd->dev->adev,
428 watch_id);
429 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
430
431 if (!pdd->dev->kfd->shared_resources.enable_mes)
432 r = debug_map_and_unlock(pdd->dev->dqm);
433 else
434 r = kfd_dbg_set_mes_debug_mode(pdd, true);
435
436 kfd_dbg_clear_dev_watch_id(pdd, watch_id);
437
438 return r;
439}
440
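/*
 * Program a hardware address watch point on every XCC instance of the
 * device. The watch point ID is reserved first and released again if the
 * queue unmap or the final remap/MES update fails.
 */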
441int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
442 uint64_t watch_address,
443 uint32_t watch_address_mask,
444 uint32_t *watch_id,
445 uint32_t watch_mode)
446{
447 int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
448 uint32_t xcc_mask = pdd->dev->xcc_mask;
449
450 if (r)
451 return r;
452
453 if (!pdd->dev->kfd->shared_resources.enable_mes) {
454 r = debug_lock_and_unmap(pdd->dev->dqm);
455 if (r) {
456 kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
457 return r;
458 }
459 }
460
461 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
462 for_each_inst(xcc_id, xcc_mask)
463 pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
464 pdd->dev->adev,
465 watch_address,
466 watch_address_mask,
467 *watch_id,
468 watch_mode,
469 pdd->dev->vm_info.last_vmid_kfd,
470 xcc_id);
471 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
472
473 if (!pdd->dev->kfd->shared_resources.enable_mes)
474 r = debug_map_and_unlock(pdd->dev->dqm);
475 else
476 r = kfd_dbg_set_mes_debug_mode(pdd, true);
477
478 /* HWS is broken so there is no point in a HW rollback, but release the watch point anyway */
479 if (r)
480 kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
481
482 return 0;
483}
484
485static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
486{
487 int i, j;
488
489 for (i = 0; i < target->n_pdds; i++)
490 for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
491 kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
492}
493
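/*
 * Update the process debug flags (only KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP is
 * honoured here) and propagate them to every per-VMID capable device,
 * rewinding already-updated devices if a refresh fails. The previous flags
 * are returned through *flags.
 */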
494int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
495{
496 uint32_t prev_flags = target->dbg_flags;
497 int i, r = 0, rewind_count = 0;
498
499 for (i = 0; i < target->n_pdds; i++) {
500 if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
501 (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
502 *flags = prev_flags;
503 return -EACCES;
504 }
505 }
506
507 target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
508 *flags = prev_flags;
509 for (i = 0; i < target->n_pdds; i++) {
510 struct kfd_process_device *pdd = target->pdds[i];
511
512 if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
513 continue;
514
515 if (!pdd->dev->kfd->shared_resources.enable_mes)
516 r = debug_refresh_runlist(pdd->dev->dqm);
517 else
518 r = kfd_dbg_set_mes_debug_mode(pdd, true);
519
520 if (r) {
521 target->dbg_flags = prev_flags;
522 break;
523 }
524
525 rewind_count++;
526 }
527
528 /* Rewind flags */
529 if (r) {
530 target->dbg_flags = prev_flags;
531
532 for (i = 0; i < rewind_count; i++) {
533 struct kfd_process_device *pdd = target->pdds[i];
534
535 if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
536 continue;
537
538 if (!pdd->dev->kfd->shared_resources.enable_mes)
539 debug_refresh_runlist(pdd->dev->dqm);
540 else
541 kfd_dbg_set_mes_debug_mode(pdd, true);
542 }
543 }
544
545 return r;
546}
547
548/* kfd_dbg_trap_deactivate:
549 * target: target process
550 * unwind: If this is unwinding a failed kfd_dbg_trap_enable()
551 * unwind_count:
552 * If unwind == true, how far down the pdd list we need
553 * to unwind
554 * else: ignored
555 */
556void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
557{
558 int i;
559
560 if (!unwind) {
561 uint32_t flags = 0;
562 int resume_count = resume_queues(target, 0, NULL);
563
564 if (resume_count)
565 pr_debug("Resumed %d queues\n", resume_count);
566
567 cancel_work_sync(&target->debug_event_workarea);
568 kfd_dbg_clear_process_address_watch(target);
569 kfd_dbg_trap_set_wave_launch_mode(target, 0);
570
571 kfd_dbg_trap_set_flags(target, &flags);
572 }
573
574 for (i = 0; i < target->n_pdds; i++) {
575 struct kfd_process_device *pdd = target->pdds[i];
576
577 /* If this is an unwind and we have unwound the required
578 * enable calls on the pdd list, we need to stop now,
579 * otherwise we may mess up another debugger session.
580 */
581 if (unwind && i == unwind_count)
582 break;
583
584 kfd_process_set_trap_debug_flag(&pdd->qpd, false);
585
586 /* GFX off is already disabled by debug activate if not RLC restore supported. */
587 if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
588 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
589 pdd->spi_dbg_override =
590 pdd->dev->kfd2kgd->disable_debug_trap(
591 pdd->dev->adev,
592 target->runtime_info.ttmp_setup,
593 pdd->dev->vm_info.last_vmid_kfd);
594 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
595
596 if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
597 release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
598 pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
599
600 if (!pdd->dev->kfd->shared_resources.enable_mes)
601 debug_refresh_runlist(pdd->dev->dqm);
602 else
603 kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
604 }
605
606 kfd_dbg_set_workaround(target, false);
607}
608
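/*
 * Drain pending interrupts and clear all raised exception state on the
 * process, its devices and its queues.
 */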
609static void kfd_dbg_clean_exception_status(struct kfd_process *target)
610{
611 struct process_queue_manager *pqm;
612 struct process_queue_node *pqn;
613 int i;
614
615 for (i = 0; i < target->n_pdds; i++) {
616 struct kfd_process_device *pdd = target->pdds[i];
617
618 kfd_process_drain_interrupts(pdd);
619
620 pdd->exception_status = 0;
621 }
622
623 pqm = &target->pqm;
624 list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
625 if (!pqn->q)
626 continue;
627
628 pqn->q->properties.exception_status = 0;
629 }
630
631 target->exception_status = 0;
632}
633
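/*
 * Detach the debugger from the target process: deactivate if the runtime is
 * enabled (otherwise mark the runtime state for re-attach), drop the event
 * file reference, clear pending exception state and release the extra
 * process reference taken at enable time.
 */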
634int kfd_dbg_trap_disable(struct kfd_process *target)
635{
636 if (!target->debug_trap_enabled)
637 return 0;
638
639 /*
640 * If the runtime is not enabled, defer deactivation to runtime enable;
641 * otherwise reset the running target's runtime state to allow re-attach.
642 */
643 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
644 kfd_dbg_trap_deactivate(target, false, 0);
645 else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
646 target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
647
648 fput(target->dbg_ev_file);
649 target->dbg_ev_file = NULL;
650
651 if (target->debugger_process) {
652 atomic_dec(&target->debugger_process->debugged_process_count);
653 target->debugger_process = NULL;
654 }
655
656 target->debug_trap_enabled = false;
657 kfd_dbg_clean_exception_status(target);
658 kfd_unref_process(target);
659
660 return 0;
661}
662
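/*
 * Activate debug trapping on every device of the target process: apply the
 * CWSR workaround, reserve a debug VMID where per-VMID debugging is not
 * supported, enable the trap configuration with GFX OFF held off, flag the
 * trap handler and refresh the runlist or MES debug mode. A failure unwinds
 * all previously activated devices.
 */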
663int kfd_dbg_trap_activate(struct kfd_process *target)
664{
665 int i, r = 0;
666
667 r = kfd_dbg_set_workaround(target, true);
668 if (r)
669 return r;
670
671 for (i = 0; i < target->n_pdds; i++) {
672 struct kfd_process_device *pdd = target->pdds[i];
673
674 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
675 r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
676
677 if (r) {
678 target->runtime_info.runtime_state = (r == -EBUSY) ?
679 DEBUG_RUNTIME_STATE_ENABLED_BUSY :
680 DEBUG_RUNTIME_STATE_ENABLED_ERROR;
681
682 goto unwind_err;
683 }
684 }
685
686 /* Disable GFX OFF to prevent garbage reads/writes to debug registers.
687 * If RLC restore of debug registers is not supported and runtime enable
688 * hasn't done so already on ttmp setup request, restore the trap config registers.
689 *
690 * If RLC restore of debug registers is not supported, keep gfx off disabled for
691 * the debug session.
692 */
693 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
694 if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
695 target->runtime_info.ttmp_setup))
696 pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
697 pdd->dev->vm_info.last_vmid_kfd);
698
699 pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
700 pdd->dev->adev,
701 false,
702 pdd->dev->vm_info.last_vmid_kfd);
703
704 if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
705 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
706
707 /*
708 * Setting the debug flag in the trap handler requires that the TMA has been
709 * allocated, which occurs during CWSR initialization.
710 * In the event that CWSR has not been initialized at this point, setting the
711 * flag will be called again during CWSR initialization if the target process
712 * is still debug enabled.
713 */
714 kfd_process_set_trap_debug_flag(&pdd->qpd, true);
715
716 if (!pdd->dev->kfd->shared_resources.enable_mes)
717 r = debug_refresh_runlist(pdd->dev->dqm);
718 else
719 r = kfd_dbg_set_mes_debug_mode(pdd, true);
720
721 if (r) {
722 target->runtime_info.runtime_state =
723 DEBUG_RUNTIME_STATE_ENABLED_ERROR;
724 goto unwind_err;
725 }
726 }
727
728 return 0;
729
730unwind_err:
731 /* Enabling debug failed, so we need to disable it on
732 * all GPUs so that the enable is all or nothing.
733 */
734 kfd_dbg_trap_deactivate(target, true, i);
735 return r;
736}
737
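/*
 * Enable debug trapping for the target process on behalf of a debugger:
 * check device support, take a reference on the event fd, activate
 * immediately if the runtime is already enabled, and copy the runtime info
 * back to user space.
 */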
738int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
739 void __user *runtime_info, uint32_t *runtime_size)
740{
741 struct file *f;
742 uint32_t copy_size;
743 int i, r = 0;
744
745 if (target->debug_trap_enabled)
746 return -EALREADY;
747
748 /* Enable pre-checks */
749 for (i = 0; i < target->n_pdds; i++) {
750 struct kfd_process_device *pdd = target->pdds[i];
751
752 if (!KFD_IS_SOC15(pdd->dev))
753 return -ENODEV;
754
755 if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
756 kfd_dbg_has_cwsr_workaround(pdd->dev)))
757 return -EBUSY;
758 }
759
760 copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
761
762 f = fget(fd);
763 if (!f) {
764 pr_err("Failed to get file for (%i)\n", fd);
765 return -EBADF;
766 }
767
768 target->dbg_ev_file = f;
769
770 /* defer activation to runtime enable if the runtime is not enabled yet */
771 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
772 kfd_dbg_trap_activate(target);
773
774 /* We already hold the process reference but hold another one for the
775 * debug session.
776 */
777 kref_get(&target->ref);
778 target->debug_trap_enabled = true;
779
780 if (target->debugger_process)
781 atomic_inc(&target->debugger_process->debugged_process_count);
782
783 if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
784 kfd_dbg_trap_deactivate(target, false, 0);
785 r = -EFAULT;
786 }
787
788 *runtime_size = sizeof(target->runtime_info);
789
790 return r;
791}
792
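/*
 * Check that the requested wave launch trap override and mask are supported
 * by every device attached to the process; collects the supported mask.
 */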
793static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
794 uint32_t trap_override,
795 uint32_t trap_mask_request,
796 uint32_t *trap_mask_supported)
797{
798 int i = 0;
799
800 *trap_mask_supported = 0xffffffff;
801
802 for (i = 0; i < p->n_pdds; i++) {
803 struct kfd_process_device *pdd = p->pdds[i];
804 int err = pdd->dev->kfd2kgd->validate_trap_override_request(
805 pdd->dev->adev,
806 trap_override,
807 trap_mask_supported);
808
809 if (err)
810 return err;
811 }
812
813 if (trap_mask_request & ~*trap_mask_supported)
814 return -EACCES;
815
816 return 0;
817}
818
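/*
 * Apply a wave launch trap override on every device of the target process
 * after validating it against per-device capabilities, then refresh the
 * runlist or MES debug mode. The previous trap mask is returned through
 * *trap_mask_prev.
 */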
819int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
820 uint32_t trap_override,
821 uint32_t trap_mask_bits,
822 uint32_t trap_mask_request,
823 uint32_t *trap_mask_prev,
824 uint32_t *trap_mask_supported)
825{
826 int r = 0, i;
827
828 r = kfd_dbg_validate_trap_override_request(target,
829 trap_override,
830 trap_mask_request,
831 trap_mask_supported);
832
833 if (r)
834 return r;
835
836 for (i = 0; i < target->n_pdds; i++) {
837 struct kfd_process_device *pdd = target->pdds[i];
838
839 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
840 pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
841 pdd->dev->adev,
842 pdd->dev->vm_info.last_vmid_kfd,
843 trap_override,
844 trap_mask_bits,
845 trap_mask_request,
846 trap_mask_prev,
847 pdd->spi_dbg_override);
848 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
849
850 if (!pdd->dev->kfd->shared_resources.enable_mes)
851 r = debug_refresh_runlist(pdd->dev->dqm);
852 else
853 r = kfd_dbg_set_mes_debug_mode(pdd, true);
854
855 if (r)
856 break;
857 }
858
859 return r;
860}
861
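/*
 * Set the wave launch mode (normal, halt or debug) on every device of the
 * target process and refresh the scheduler state.
 */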
862int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
863 uint8_t wave_launch_mode)
864{
865 int r = 0, i;
866
867 if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
868 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
869 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
870 return -EINVAL;
871
872 for (i = 0; i < target->n_pdds; i++) {
873 struct kfd_process_device *pdd = target->pdds[i];
874
875 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
876 pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
877 pdd->dev->adev,
878 wave_launch_mode,
879 pdd->dev->vm_info.last_vmid_kfd);
880 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
881
882 if (!pdd->dev->kfd->shared_resources.enable_mes)
883 r = debug_refresh_runlist(pdd->dev->dqm);
884 else
885 r = kfd_dbg_set_mes_debug_mode(pdd, true);
886
887 if (r)
888 break;
889 }
890
891 return r;
892}
893
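/*
 * Copy detailed information for a single raised exception (per queue, per
 * device or per process, depending on the exception code) to user space,
 * optionally clearing it from the source.
 */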
894int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
895 uint32_t source_id,
896 uint32_t exception_code,
897 bool clear_exception,
898 void __user *info,
899 uint32_t *info_size)
900{
901 bool found = false;
902 int r = 0;
903 uint32_t copy_size, actual_info_size = 0;
904 uint64_t *exception_status_ptr = NULL;
905
906 if (!target)
907 return -EINVAL;
908
909 if (!info || !info_size)
910 return -EINVAL;
911
912 mutex_lock(&target->event_mutex);
913
914 if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
915 /* Per queue exceptions */
916 struct queue *queue = NULL;
917 int i;
918
919 for (i = 0; i < target->n_pdds; i++) {
920 struct kfd_process_device *pdd = target->pdds[i];
921 struct qcm_process_device *qpd = &pdd->qpd;
922
923 list_for_each_entry(queue, &qpd->queues_list, list) {
924 if (!found && queue->properties.queue_id == source_id) {
925 found = true;
926 break;
927 }
928 }
929 if (found)
930 break;
931 }
932
933 if (!found) {
934 r = -EINVAL;
935 goto out;
936 }
937
938 if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
939 r = -ENODATA;
940 goto out;
941 }
942 exception_status_ptr = &queue->properties.exception_status;
943 } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
944 /* Per device exceptions */
945 struct kfd_process_device *pdd = NULL;
946 int i;
947
948 for (i = 0; i < target->n_pdds; i++) {
949 pdd = target->pdds[i];
950 if (pdd->dev->id == source_id) {
951 found = true;
952 break;
953 }
954 }
955
956 if (!found) {
957 r = -EINVAL;
958 goto out;
959 }
960
961 if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
962 r = -ENODATA;
963 goto out;
964 }
965
966 if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
967 copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);
968
969 if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
970 r = -EFAULT;
971 goto out;
972 }
973 actual_info_size = pdd->vm_fault_exc_data_size;
974 if (clear_exception) {
975 kfree(pdd->vm_fault_exc_data);
976 pdd->vm_fault_exc_data = NULL;
977 pdd->vm_fault_exc_data_size = 0;
978 }
979 }
980 exception_status_ptr = &pdd->exception_status;
981 } else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
982 /* Per process exceptions */
983 if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
984 r = -ENODATA;
985 goto out;
986 }
987
988 if (exception_code == EC_PROCESS_RUNTIME) {
989 copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));
990
991 if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
992 r = -EFAULT;
993 goto out;
994 }
995
996 actual_info_size = sizeof(target->runtime_info);
997 }
998
999 exception_status_ptr = &target->exception_status;
1000 } else {
1001 pr_debug("Bad exception type [%i]\n", exception_code);
1002 r = -EINVAL;
1003 goto out;
1004 }
1005
1006 *info_size = actual_info_size;
1007 if (clear_exception)
1008 *exception_status_ptr &= ~KFD_EC_MASK(exception_code);
1009out:
1010 mutex_unlock(&target->event_mutex);
1011 return r;
1012}
1013
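/*
 * Snapshot per-device debug information (apertures, topology properties and
 * exception status) for up to *number_of_device_infos devices, clearing the
 * requested exception bits as they are reported.
 */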
1014int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
1015 uint64_t exception_clear_mask,
1016 void __user *user_info,
1017 uint32_t *number_of_device_infos,
1018 uint32_t *entry_size)
1019{
1020 struct kfd_dbg_device_info_entry device_info;
1021 uint32_t tmp_entry_size, tmp_num_devices;
1022 int i, r = 0;
1023
1024 if (!(target && user_info && number_of_device_infos && entry_size))
1025 return -EINVAL;
1026
1027 tmp_entry_size = *entry_size;
1028
1029 tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
1030 *number_of_device_infos = target->n_pdds;
1031 *entry_size = min_t(size_t, *entry_size, sizeof(device_info));
1032
1033 if (!tmp_num_devices)
1034 return 0;
1035
1036 memset(&device_info, 0, sizeof(device_info));
1037
1038 mutex_lock(&target->event_mutex);
1039
1040 /* Run over all pdds of the process */
1041 for (i = 0; i < tmp_num_devices; i++) {
1042 struct kfd_process_device *pdd = target->pdds[i];
1043 struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
1044
1045 device_info.gpu_id = pdd->dev->id;
1046 device_info.exception_status = pdd->exception_status;
1047 device_info.lds_base = pdd->lds_base;
1048 device_info.lds_limit = pdd->lds_limit;
1049 device_info.scratch_base = pdd->scratch_base;
1050 device_info.scratch_limit = pdd->scratch_limit;
1051 device_info.gpuvm_base = pdd->gpuvm_base;
1052 device_info.gpuvm_limit = pdd->gpuvm_limit;
1053 device_info.location_id = topo_dev->node_props.location_id;
1054 device_info.vendor_id = topo_dev->node_props.vendor_id;
1055 device_info.device_id = topo_dev->node_props.device_id;
1056 device_info.revision_id = pdd->dev->adev->pdev->revision;
1057 device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
1058 device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
1059 device_info.fw_version = pdd->dev->kfd->mec_fw_version;
1060 device_info.gfx_target_version =
1061 topo_dev->node_props.gfx_target_version;
1062 device_info.simd_count = topo_dev->node_props.simd_count;
1063 device_info.max_waves_per_simd =
1064 topo_dev->node_props.max_waves_per_simd;
1065 device_info.array_count = topo_dev->node_props.array_count;
1066 device_info.simd_arrays_per_engine =
1067 topo_dev->node_props.simd_arrays_per_engine;
1068 device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
1069 device_info.capability = topo_dev->node_props.capability;
1070 device_info.debug_prop = topo_dev->node_props.debug_prop;
1071
1072 if (exception_clear_mask)
1073 pdd->exception_status &= ~exception_clear_mask;
1074
1075 if (copy_to_user(user_info, &device_info, *entry_size)) {
1076 r = -EFAULT;
1077 break;
1078 }
1079
1080 user_info += tmp_entry_size;
1081 }
1082
1083 mutex_unlock(&target->event_mutex);
1084
1085 return r;
1086}
1087
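/*
 * Update the set of exceptions the debugger is subscribed to and write to
 * the debug event file descriptor if any already-raised exception matches
 * the new mask.
 */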
1088void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
1089 uint64_t exception_set_mask)
1090{
1091 uint64_t found_mask = 0;
1092 struct process_queue_manager *pqm;
1093 struct process_queue_node *pqn;
1094 static const char write_data = '.';
1095 loff_t pos = 0;
1096 int i;
1097
1098 mutex_lock(&target->event_mutex);
1099
1100 found_mask |= target->exception_status;
1101
1102 pqm = &target->pqm;
1103 list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
1104 if (!pqn->q)
1105 continue;
1106
1107 found_mask |= pqn->q->properties.exception_status;
1108 }
1109
1110 for (i = 0; i < target->n_pdds; i++) {
1111 struct kfd_process_device *pdd = target->pdds[i];
1112
1113 found_mask |= pdd->exception_status;
1114 }
1115
1116 if (exception_set_mask & found_mask)
1117 kernel_write(target->dbg_ev_file, &write_data, 1, &pos);
1118
1119 target->exception_enable_mask = exception_set_mask;
1120
1121 mutex_unlock(&target->event_mutex);
1122}
1/*
2 * Copyright 2023 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include "kfd_debug.h"
24#include "kfd_device_queue_manager.h"
25#include "kfd_topology.h"
26#include <linux/file.h>
27#include <uapi/linux/kfd_ioctl.h>
28#include <uapi/linux/kfd_sysfs.h>
29
30#define MAX_WATCH_ADDRESSES 4
31
32int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
33 unsigned int *queue_id,
34 unsigned int *gpu_id,
35 uint64_t exception_clear_mask,
36 uint64_t *event_status)
37{
38 struct process_queue_manager *pqm;
39 struct process_queue_node *pqn;
40 int i;
41
42 if (!(process && process->debug_trap_enabled))
43 return -ENODATA;
44
45 mutex_lock(&process->event_mutex);
46 *event_status = 0;
47 *queue_id = 0;
48 *gpu_id = 0;
49
50 /* find and report queue events */
51 pqm = &process->pqm;
52 list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
53 uint64_t tmp = process->exception_enable_mask;
54
55 if (!pqn->q)
56 continue;
57
58 tmp &= pqn->q->properties.exception_status;
59
60 if (!tmp)
61 continue;
62
63 *event_status = pqn->q->properties.exception_status;
64 *queue_id = pqn->q->properties.queue_id;
65 *gpu_id = pqn->q->device->id;
66 pqn->q->properties.exception_status &= ~exception_clear_mask;
67 goto out;
68 }
69
70 /* find and report device events */
71 for (i = 0; i < process->n_pdds; i++) {
72 struct kfd_process_device *pdd = process->pdds[i];
73 uint64_t tmp = process->exception_enable_mask
74 & pdd->exception_status;
75
76 if (!tmp)
77 continue;
78
79 *event_status = pdd->exception_status;
80 *gpu_id = pdd->dev->id;
81 pdd->exception_status &= ~exception_clear_mask;
82 goto out;
83 }
84
85 /* report process events */
86 if (process->exception_enable_mask & process->exception_status) {
87 *event_status = process->exception_status;
88 process->exception_status &= ~exception_clear_mask;
89 }
90
91out:
92 mutex_unlock(&process->event_mutex);
93 return *event_status ? 0 : -EAGAIN;
94}
95
96void debug_event_write_work_handler(struct work_struct *work)
97{
98 struct kfd_process *process;
99
100 static const char write_data = '.';
101 loff_t pos = 0;
102
103 process = container_of(work,
104 struct kfd_process,
105 debug_event_workarea);
106
107 if (process->debug_trap_enabled && process->dbg_ev_file)
108 kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
109}
110
111/* update process/device/queue exception status, write to descriptor
112 * only if exception_status is enabled.
113 */
114bool kfd_dbg_ev_raise(uint64_t event_mask,
115 struct kfd_process *process, struct kfd_node *dev,
116 unsigned int source_id, bool use_worker,
117 void *exception_data, size_t exception_data_size)
118{
119 struct process_queue_manager *pqm;
120 struct process_queue_node *pqn;
121 int i;
122 static const char write_data = '.';
123 loff_t pos = 0;
124 bool is_subscribed = true;
125
126 if (!(process && process->debug_trap_enabled))
127 return false;
128
129 mutex_lock(&process->event_mutex);
130
131 if (event_mask & KFD_EC_MASK_DEVICE) {
132 for (i = 0; i < process->n_pdds; i++) {
133 struct kfd_process_device *pdd = process->pdds[i];
134
135 if (pdd->dev != dev)
136 continue;
137
138 pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
139
140 if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
141 if (!pdd->vm_fault_exc_data) {
142 pdd->vm_fault_exc_data = kmemdup(
143 exception_data,
144 exception_data_size,
145 GFP_KERNEL);
146 if (!pdd->vm_fault_exc_data)
147 pr_debug("Failed to allocate exception data memory");
148 } else {
149 pr_debug("Debugger exception data not saved\n");
150 print_hex_dump_bytes("exception data: ",
151 DUMP_PREFIX_OFFSET,
152 exception_data,
153 exception_data_size);
154 }
155 }
156 break;
157 }
158 } else if (event_mask & KFD_EC_MASK_PROCESS) {
159 process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
160 } else {
161 pqm = &process->pqm;
162 list_for_each_entry(pqn, &pqm->queues,
163 process_queue_list) {
164 int target_id;
165
166 if (!pqn->q)
167 continue;
168
169 target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
170 pqn->q->properties.queue_id :
171 pqn->q->doorbell_id;
172
173 if (pqn->q->device != dev || target_id != source_id)
174 continue;
175
176 pqn->q->properties.exception_status |= event_mask;
177 break;
178 }
179 }
180
181 if (process->exception_enable_mask & event_mask) {
182 if (use_worker)
183 schedule_work(&process->debug_event_workarea);
184 else
185 kernel_write(process->dbg_ev_file,
186 &write_data,
187 1,
188 &pos);
189 } else {
190 is_subscribed = false;
191 }
192
193 mutex_unlock(&process->event_mutex);
194
195 return is_subscribed;
196}
197
198/* set pending event queue entry from ring entry */
199bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
200 unsigned int pasid,
201 uint32_t doorbell_id,
202 uint64_t trap_mask,
203 void *exception_data,
204 size_t exception_data_size)
205{
206 struct kfd_process *p;
207 bool signaled_to_debugger_or_runtime = false;
208
209 p = kfd_lookup_process_by_pasid(pasid);
210
211 if (!p)
212 return false;
213
214 if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
215 exception_data, exception_data_size)) {
216 struct process_queue_manager *pqm;
217 struct process_queue_node *pqn;
218
219 if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
220 p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
221 mutex_lock(&p->mutex);
222
223 pqm = &p->pqm;
224 list_for_each_entry(pqn, &pqm->queues,
225 process_queue_list) {
226
227 if (!(pqn->q && pqn->q->device == dev &&
228 pqn->q->doorbell_id == doorbell_id))
229 continue;
230
231 kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
232 trap_mask);
233
234 signaled_to_debugger_or_runtime = true;
235
236 break;
237 }
238
239 mutex_unlock(&p->mutex);
240 } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
241 kfd_dqm_evict_pasid(dev->dqm, p->pasid);
242 kfd_signal_vm_fault_event(dev, p->pasid, NULL,
243 exception_data);
244
245 signaled_to_debugger_or_runtime = true;
246 }
247 } else {
248 signaled_to_debugger_or_runtime = true;
249 }
250
251 kfd_unref_process(p);
252
253 return signaled_to_debugger_or_runtime;
254}
255
256int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
257 unsigned int dev_id,
258 unsigned int queue_id,
259 uint64_t error_reason)
260{
261 if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
262 struct kfd_process_device *pdd = NULL;
263 struct kfd_hsa_memory_exception_data *data;
264 int i;
265
266 for (i = 0; i < p->n_pdds; i++) {
267 if (p->pdds[i]->dev->id == dev_id) {
268 pdd = p->pdds[i];
269 break;
270 }
271 }
272
273 if (!pdd)
274 return -ENODEV;
275
276 data = (struct kfd_hsa_memory_exception_data *)
277 pdd->vm_fault_exc_data;
278
279 kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
280 kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
281 error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
282 }
283
284 if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
285 /*
286 * block should only happen after the debugger receives runtime
287 * enable notice.
288 */
289 up(&p->runtime_enable_sema);
290 error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
291 }
292
293 if (error_reason)
294 return kfd_send_exception_to_runtime(p, queue_id, error_reason);
295
296 return 0;
297}
298
299static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
300{
301 struct mqd_update_info minfo = {0};
302 int err;
303
304 if (!q)
305 return 0;
306
307 if (!kfd_dbg_has_cwsr_workaround(q->device))
308 return 0;
309
310 if (enable && q->properties.is_user_cu_masked)
311 return -EBUSY;
312
313 minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
314
315 q->properties.is_dbg_wa = enable;
316 err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
317 if (err)
318 q->properties.is_dbg_wa = false;
319
320 return err;
321}
322
323static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
324{
325 struct process_queue_manager *pqm = &target->pqm;
326 struct process_queue_node *pqn;
327 int r = 0;
328
329 list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
330 r = kfd_dbg_set_queue_workaround(pqn->q, enable);
331 if (enable && r)
332 goto unwind;
333 }
334
335 return 0;
336
337unwind:
338 list_for_each_entry(pqn, &pqm->queues, process_queue_list)
339 kfd_dbg_set_queue_workaround(pqn->q, false);
340
341 if (enable)
342 target->runtime_info.runtime_state = r == -EBUSY ?
343 DEBUG_RUNTIME_STATE_ENABLED_BUSY :
344 DEBUG_RUNTIME_STATE_ENABLED_ERROR;
345
346 return r;
347}
348
349int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
350{
351 uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
352 uint32_t flags = pdd->process->dbg_flags;
353 struct amdgpu_device *adev = pdd->dev->adev;
354 int r;
355
356 if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
357 return 0;
358
359 if (!pdd->proc_ctx_cpu_ptr) {
360 r = amdgpu_amdkfd_alloc_gtt_mem(adev,
361 AMDGPU_MES_PROC_CTX_SIZE,
362 &pdd->proc_ctx_bo,
363 &pdd->proc_ctx_gpu_addr,
364 &pdd->proc_ctx_cpu_ptr,
365 false);
366 if (r) {
367 dev_err(adev->dev,
368 "failed to allocate process context bo\n");
369 return r;
370 }
371 memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
372 }
373
374 return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
375 pdd->watch_points, flags, sq_trap_en);
376}
377
378#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
379static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
380{
381 int i;
382
383 *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
384
385 spin_lock(&pdd->dev->watch_points_lock);
386
387 for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
388 /* device watchpoint in use so skip */
389 if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
390 continue;
391
392 pdd->alloc_watch_ids |= 0x1 << i;
393 pdd->dev->alloc_watch_ids |= 0x1 << i;
394 *watch_id = i;
395 spin_unlock(&pdd->dev->watch_points_lock);
396 return 0;
397 }
398
399 spin_unlock(&pdd->dev->watch_points_lock);
400
401 return -ENOMEM;
402}
403
404static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
405{
406 spin_lock(&pdd->dev->watch_points_lock);
407
408 /* process owns device watch point so safe to clear */
409 if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
410 pdd->alloc_watch_ids &= ~(0x1 << watch_id);
411 pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
412 }
413
414 spin_unlock(&pdd->dev->watch_points_lock);
415}
416
417static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
418{
419 bool owns_watch_id = false;
420
421 spin_lock(&pdd->dev->watch_points_lock);
422 owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
423 ((pdd->alloc_watch_ids >> watch_id) & 0x1);
424
425 spin_unlock(&pdd->dev->watch_points_lock);
426
427 return owns_watch_id;
428}
429
430int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
431 uint32_t watch_id)
432{
433 int r;
434
435 if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
436 return -EINVAL;
437
438 if (!pdd->dev->kfd->shared_resources.enable_mes) {
439 r = debug_lock_and_unmap(pdd->dev->dqm);
440 if (r)
441 return r;
442 }
443
444 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
445 pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
446 pdd->dev->adev,
447 watch_id);
448 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
449
450 if (!pdd->dev->kfd->shared_resources.enable_mes)
451 r = debug_map_and_unlock(pdd->dev->dqm);
452 else
453 r = kfd_dbg_set_mes_debug_mode(pdd, true);
454
455 kfd_dbg_clear_dev_watch_id(pdd, watch_id);
456
457 return r;
458}
459
460int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
461 uint64_t watch_address,
462 uint32_t watch_address_mask,
463 uint32_t *watch_id,
464 uint32_t watch_mode)
465{
466 int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
467 uint32_t xcc_mask = pdd->dev->xcc_mask;
468
469 if (r)
470 return r;
471
472 if (!pdd->dev->kfd->shared_resources.enable_mes) {
473 r = debug_lock_and_unmap(pdd->dev->dqm);
474 if (r) {
475 kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
476 return r;
477 }
478 }
479
480 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
481 for_each_inst(xcc_id, xcc_mask)
482 pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
483 pdd->dev->adev,
484 watch_address,
485 watch_address_mask,
486 *watch_id,
487 watch_mode,
488 pdd->dev->vm_info.last_vmid_kfd,
489 xcc_id);
490 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
491
492 if (!pdd->dev->kfd->shared_resources.enable_mes)
493 r = debug_map_and_unlock(pdd->dev->dqm);
494 else
495 r = kfd_dbg_set_mes_debug_mode(pdd, true);
496
497 /* HWS is broken so no point in HW rollback but release the watchpoint anyways */
498 if (r)
499 kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
500
501 return 0;
502}
503
504static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
505{
506 int i, j;
507
508 for (i = 0; i < target->n_pdds; i++)
509 for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
510 kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
511}
512
513int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
514{
515 uint32_t prev_flags = target->dbg_flags;
516 int i, r = 0, rewind_count = 0;
517
518 for (i = 0; i < target->n_pdds; i++) {
519 struct kfd_topology_device *topo_dev =
520 kfd_topology_device_by_id(target->pdds[i]->dev->id);
521 uint32_t caps = topo_dev->node_props.capability;
522
523 if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
524 (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
525 *flags = prev_flags;
526 return -EACCES;
527 }
528
529 if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
530 (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
531 *flags = prev_flags;
532 return -EACCES;
533 }
534 }
535
536 target->dbg_flags = *flags;
537 *flags = prev_flags;
538 for (i = 0; i < target->n_pdds; i++) {
539 struct kfd_process_device *pdd = target->pdds[i];
540
541 if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
542 continue;
543
544 if (!pdd->dev->kfd->shared_resources.enable_mes)
545 r = debug_refresh_runlist(pdd->dev->dqm);
546 else
547 r = kfd_dbg_set_mes_debug_mode(pdd, true);
548
549 if (r) {
550 target->dbg_flags = prev_flags;
551 break;
552 }
553
554 rewind_count++;
555 }
556
557 /* Rewind flags */
558 if (r) {
559 target->dbg_flags = prev_flags;
560
561 for (i = 0; i < rewind_count; i++) {
562 struct kfd_process_device *pdd = target->pdds[i];
563
564 if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
565 continue;
566
567 if (!pdd->dev->kfd->shared_resources.enable_mes)
568 debug_refresh_runlist(pdd->dev->dqm);
569 else
570 kfd_dbg_set_mes_debug_mode(pdd, true);
571 }
572 }
573
574 return r;
575}
576
577/* kfd_dbg_trap_deactivate:
578 * target: target process
579 * unwind: If this is unwinding a failed kfd_dbg_trap_enable()
580 * unwind_count:
581 * If unwind == true, how far down the pdd list we need
582 * to unwind
583 * else: ignored
584 */
585void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
586{
587 int i;
588
589 if (!unwind) {
590 uint32_t flags = 0;
591 int resume_count = resume_queues(target, 0, NULL);
592
593 if (resume_count)
594 pr_debug("Resumed %d queues\n", resume_count);
595
596 cancel_work_sync(&target->debug_event_workarea);
597 kfd_dbg_clear_process_address_watch(target);
598 kfd_dbg_trap_set_wave_launch_mode(target, 0);
599
600 kfd_dbg_trap_set_flags(target, &flags);
601 }
602
603 for (i = 0; i < target->n_pdds; i++) {
604 struct kfd_process_device *pdd = target->pdds[i];
605
606 /* If this is an unwind, and we have unwound the required
607 * enable calls on the pdd list, we need to stop now
608 * otherwise we may mess up another debugger session.
609 */
610 if (unwind && i == unwind_count)
611 break;
612
613 kfd_process_set_trap_debug_flag(&pdd->qpd, false);
614
615 /* GFX off is already disabled by debug activate if not RLC restore supported. */
616 if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
617 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
618 pdd->spi_dbg_override =
619 pdd->dev->kfd2kgd->disable_debug_trap(
620 pdd->dev->adev,
621 target->runtime_info.ttmp_setup,
622 pdd->dev->vm_info.last_vmid_kfd);
623 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
624
625 if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
626 release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
627 pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
628
629 if (!pdd->dev->kfd->shared_resources.enable_mes)
630 debug_refresh_runlist(pdd->dev->dqm);
631 else
632 kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
633 }
634
635 kfd_dbg_set_workaround(target, false);
636}
637
638static void kfd_dbg_clean_exception_status(struct kfd_process *target)
639{
640 struct process_queue_manager *pqm;
641 struct process_queue_node *pqn;
642 int i;
643
644 for (i = 0; i < target->n_pdds; i++) {
645 struct kfd_process_device *pdd = target->pdds[i];
646
647 kfd_process_drain_interrupts(pdd);
648
649 pdd->exception_status = 0;
650 }
651
652 pqm = &target->pqm;
653 list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
654 if (!pqn->q)
655 continue;
656
657 pqn->q->properties.exception_status = 0;
658 }
659
660 target->exception_status = 0;
661}
662
663int kfd_dbg_trap_disable(struct kfd_process *target)
664{
665 if (!target->debug_trap_enabled)
666 return 0;
667
668 /*
669 * Defer deactivation to runtime if runtime not enabled otherwise reset
670 * attached running target runtime state to enable for re-attach.
671 */
672 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
673 kfd_dbg_trap_deactivate(target, false, 0);
674 else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
675 target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
676
677 cancel_work_sync(&target->debug_event_workarea);
678 fput(target->dbg_ev_file);
679 target->dbg_ev_file = NULL;
680
681 if (target->debugger_process) {
682 atomic_dec(&target->debugger_process->debugged_process_count);
683 target->debugger_process = NULL;
684 }
685
686 target->debug_trap_enabled = false;
687 kfd_dbg_clean_exception_status(target);
688 kfd_unref_process(target);
689
690 return 0;
691}
692
693int kfd_dbg_trap_activate(struct kfd_process *target)
694{
695 int i, r = 0;
696
697 r = kfd_dbg_set_workaround(target, true);
698 if (r)
699 return r;
700
701 for (i = 0; i < target->n_pdds; i++) {
702 struct kfd_process_device *pdd = target->pdds[i];
703
704 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
705 r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
706
707 if (r) {
708 target->runtime_info.runtime_state = (r == -EBUSY) ?
709 DEBUG_RUNTIME_STATE_ENABLED_BUSY :
710 DEBUG_RUNTIME_STATE_ENABLED_ERROR;
711
712 goto unwind_err;
713 }
714 }
715
716 /* Disable GFX OFF to prevent garbage read/writes to debug registers.
717 * If RLC restore of debug registers is not supported and runtime enable
718 * hasn't done so already on ttmp setup request, restore the trap config registers.
719 *
720 * If RLC restore of debug registers is not supported, keep gfx off disabled for
721 * the debug session.
722 */
723 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
724 if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
725 target->runtime_info.ttmp_setup))
726 pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
727 pdd->dev->vm_info.last_vmid_kfd);
728
729 pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
730 pdd->dev->adev,
731 false,
732 pdd->dev->vm_info.last_vmid_kfd);
733
734 if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
735 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
736
737 /*
738 * Setting the debug flag in the trap handler requires that the TMA has been
739 * allocated, which occurs during CWSR initialization.
740 * In the event that CWSR has not been initialized at this point, setting the
741 * flag will be called again during CWSR initialization if the target process
742 * is still debug enabled.
743 */
744 kfd_process_set_trap_debug_flag(&pdd->qpd, true);
745
746 if (!pdd->dev->kfd->shared_resources.enable_mes)
747 r = debug_refresh_runlist(pdd->dev->dqm);
748 else
749 r = kfd_dbg_set_mes_debug_mode(pdd, true);
750
751 if (r) {
752 target->runtime_info.runtime_state =
753 DEBUG_RUNTIME_STATE_ENABLED_ERROR;
754 goto unwind_err;
755 }
756 }
757
758 return 0;
759
760unwind_err:
761 /* Enabling debug failed, we need to disable on
762 * all GPUs so the enable is all or nothing.
763 */
764 kfd_dbg_trap_deactivate(target, true, i);
765 return r;
766}
767
768int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
769 void __user *runtime_info, uint32_t *runtime_size)
770{
771 struct file *f;
772 uint32_t copy_size;
773 int i, r = 0;
774
775 if (target->debug_trap_enabled)
776 return -EALREADY;
777
778 /* Enable pre-checks */
779 for (i = 0; i < target->n_pdds; i++) {
780 struct kfd_process_device *pdd = target->pdds[i];
781
782 if (!KFD_IS_SOC15(pdd->dev))
783 return -ENODEV;
784
785 if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
786 kfd_dbg_has_cwsr_workaround(pdd->dev)))
787 return -EBUSY;
788 }
789
790 copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
791
792 f = fget(fd);
793 if (!f) {
794 pr_err("Failed to get file for (%i)\n", fd);
795 return -EBADF;
796 }
797
798 target->dbg_ev_file = f;
799
800 /* defer activation to runtime if not runtime enabled */
801 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
802 kfd_dbg_trap_activate(target);
803
804 /* We already hold the process reference but hold another one for the
805 * debug session.
806 */
807 kref_get(&target->ref);
808 target->debug_trap_enabled = true;
809
810 if (target->debugger_process)
811 atomic_inc(&target->debugger_process->debugged_process_count);
812
813 if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
814 kfd_dbg_trap_deactivate(target, false, 0);
815 r = -EFAULT;
816 }
817
818 *runtime_size = sizeof(target->runtime_info);
819
820 return r;
821}
822
823static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
824 uint32_t trap_override,
825 uint32_t trap_mask_request,
826 uint32_t *trap_mask_supported)
827{
828 int i = 0;
829
830 *trap_mask_supported = 0xffffffff;
831
832 for (i = 0; i < p->n_pdds; i++) {
833 struct kfd_process_device *pdd = p->pdds[i];
834 int err = pdd->dev->kfd2kgd->validate_trap_override_request(
835 pdd->dev->adev,
836 trap_override,
837 trap_mask_supported);
838
839 if (err)
840 return err;
841 }
842
843 if (trap_mask_request & ~*trap_mask_supported)
844 return -EACCES;
845
846 return 0;
847}
848
849int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
850 uint32_t trap_override,
851 uint32_t trap_mask_bits,
852 uint32_t trap_mask_request,
853 uint32_t *trap_mask_prev,
854 uint32_t *trap_mask_supported)
855{
856 int r = 0, i;
857
858 r = kfd_dbg_validate_trap_override_request(target,
859 trap_override,
860 trap_mask_request,
861 trap_mask_supported);
862
863 if (r)
864 return r;
865
866 for (i = 0; i < target->n_pdds; i++) {
867 struct kfd_process_device *pdd = target->pdds[i];
868
869 amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
870 pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
871 pdd->dev->adev,
872 pdd->dev->vm_info.last_vmid_kfd,
873 trap_override,
874 trap_mask_bits,
875 trap_mask_request,
876 trap_mask_prev,
877 pdd->spi_dbg_override);
878 amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
879
880 if (!pdd->dev->kfd->shared_resources.enable_mes)
881 r = debug_refresh_runlist(pdd->dev->dqm);
882 else
883 r = kfd_dbg_set_mes_debug_mode(pdd, true);
884
885 if (r)
886 break;
887 }
888
889 return r;
890}
891
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
					uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

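/* copy the details of a single pending queue, device or process exception to
 * user space.  source_id names the queue or GPU, exception_code selects the
 * exception, and clear_exception drops the status bit (and any cached fault
 * data) once it has been reported.
 */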
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

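		/* memory violations carry extra fault data cached by the driver;
		 * hand it to the debugger and free it once the exception is cleared.
		 */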
		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

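/* fill one kfd_dbg_device_info_entry per GPU of the target process and copy
 * the entries to user space.  *entry_size is clamped to the kernel's entry
 * size while the user buffer is advanced by the caller's original entry size,
 * and *number_of_device_infos always reports the number of available entries,
 * presumably so a caller passing a zero count can size its buffer and retry.
 */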
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_entry_size = *entry_size;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdd of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

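/* update which exceptions the debugger wants to be notified about.  Pending
 * exception status on the process, its queues and its devices is checked
 * against the new mask, and the event file descriptor is poked if anything
 * already matches so the debugger is not left waiting on events raised
 * before the mask was set.
 */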
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}