v6.9.4
   1/*
   2 * Copyright 2023 Advanced Micro Devices, Inc.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice shall be included in
  12 * all copies or substantial portions of the Software.
  13 *
  14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20 * OTHER DEALINGS IN THE SOFTWARE.
  21 */
  22
  23#include "kfd_debug.h"
  24#include "kfd_device_queue_manager.h"
  25#include "kfd_topology.h"
  26#include <linux/file.h>
  27#include <uapi/linux/kfd_ioctl.h>
  28
  29#define MAX_WATCH_ADDRESSES	4
  30
  31int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
  32		      unsigned int *queue_id,
  33		      unsigned int *gpu_id,
  34		      uint64_t exception_clear_mask,
  35		      uint64_t *event_status)
  36{
  37	struct process_queue_manager *pqm;
  38	struct process_queue_node *pqn;
  39	int i;
  40
  41	if (!(process && process->debug_trap_enabled))
  42		return -ENODATA;
  43
  44	mutex_lock(&process->event_mutex);
  45	*event_status = 0;
  46	*queue_id = 0;
  47	*gpu_id = 0;
  48
  49	/* find and report queue events */
  50	pqm = &process->pqm;
  51	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
  52		uint64_t tmp = process->exception_enable_mask;
  53
  54		if (!pqn->q)
  55			continue;
  56
  57		tmp &= pqn->q->properties.exception_status;
  58
  59		if (!tmp)
  60			continue;
  61
  62		*event_status = pqn->q->properties.exception_status;
  63		*queue_id = pqn->q->properties.queue_id;
  64		*gpu_id = pqn->q->device->id;
  65		pqn->q->properties.exception_status &= ~exception_clear_mask;
  66		goto out;
  67	}
  68
  69	/* find and report device events */
  70	for (i = 0; i < process->n_pdds; i++) {
  71		struct kfd_process_device *pdd = process->pdds[i];
  72		uint64_t tmp = process->exception_enable_mask
  73						& pdd->exception_status;
  74
  75		if (!tmp)
  76			continue;
  77
  78		*event_status = pdd->exception_status;
  79		*gpu_id = pdd->dev->id;
  80		pdd->exception_status &= ~exception_clear_mask;
  81		goto out;
  82	}
  83
  84	/* report process events */
  85	if (process->exception_enable_mask & process->exception_status) {
  86		*event_status = process->exception_status;
  87		process->exception_status &= ~exception_clear_mask;
  88	}
  89
  90out:
  91	mutex_unlock(&process->event_mutex);
  92	return *event_status ? 0 : -EAGAIN;
  93}
  94
  95void debug_event_write_work_handler(struct work_struct *work)
  96{
  97	struct kfd_process *process;
  98
  99	static const char write_data = '.';
 100	loff_t pos = 0;
 101
 102	process = container_of(work,
 103			struct kfd_process,
 104			debug_event_workarea);
 105
 106	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
 107}
 108
 109/* update process/device/queue exception status, write to descriptor
 110 * only if exception_status is enabled.
 111 */
 112bool kfd_dbg_ev_raise(uint64_t event_mask,
 113			struct kfd_process *process, struct kfd_node *dev,
 114			unsigned int source_id, bool use_worker,
 115			void *exception_data, size_t exception_data_size)
 116{
 117	struct process_queue_manager *pqm;
 118	struct process_queue_node *pqn;
 119	int i;
 120	static const char write_data = '.';
 121	loff_t pos = 0;
 122	bool is_subscribed = true;
 123
 124	if (!(process && process->debug_trap_enabled))
 125		return false;
 126
 127	mutex_lock(&process->event_mutex);
 128
 129	if (event_mask & KFD_EC_MASK_DEVICE) {
 130		for (i = 0; i < process->n_pdds; i++) {
 131			struct kfd_process_device *pdd = process->pdds[i];
 132
 133			if (pdd->dev != dev)
 134				continue;
 135
 136			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
 137
 138			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
 139				if (!pdd->vm_fault_exc_data) {
 140					pdd->vm_fault_exc_data = kmemdup(
 141							exception_data,
 142							exception_data_size,
 143							GFP_KERNEL);
 144					if (!pdd->vm_fault_exc_data)
 145						pr_debug("Failed to allocate exception data memory");
 146				} else {
 147					pr_debug("Debugger exception data not saved\n");
 148					print_hex_dump_bytes("exception data: ",
 149							DUMP_PREFIX_OFFSET,
 150							exception_data,
 151							exception_data_size);
 152				}
 153			}
 154			break;
 155		}
 156	} else if (event_mask & KFD_EC_MASK_PROCESS) {
 157		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
 158	} else {
 159		pqm = &process->pqm;
 160		list_for_each_entry(pqn, &pqm->queues,
 161				process_queue_list) {
 162			int target_id;
 163
 164			if (!pqn->q)
 165				continue;
 166
 167			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
 168					pqn->q->properties.queue_id :
 169							pqn->q->doorbell_id;
 170
 171			if (pqn->q->device != dev || target_id != source_id)
 172				continue;
 173
 174			pqn->q->properties.exception_status |= event_mask;
 175			break;
 176		}
 177	}
 178
 179	if (process->exception_enable_mask & event_mask) {
 180		if (use_worker)
 181			schedule_work(&process->debug_event_workarea);
 182		else
 183			kernel_write(process->dbg_ev_file,
 184					&write_data,
 185					1,
 186					&pos);
 187	} else {
 188		is_subscribed = false;
 189	}
 190
 191	mutex_unlock(&process->event_mutex);
 192
 193	return is_subscribed;
 194}
 195
 196/* set pending event queue entry from ring entry  */
 197bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
 198				   unsigned int pasid,
 199				   uint32_t doorbell_id,
 200				   uint64_t trap_mask,
 201				   void *exception_data,
 202				   size_t exception_data_size)
 203{
 204	struct kfd_process *p;
 205	bool signaled_to_debugger_or_runtime = false;
 206
 207	p = kfd_lookup_process_by_pasid(pasid);
 208
 209	if (!p)
 210		return false;
 211
 212	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
 213			      exception_data, exception_data_size)) {
 214		struct process_queue_manager *pqm;
 215		struct process_queue_node *pqn;
 216
 217		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
 218		       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
 219			mutex_lock(&p->mutex);
 220
 221			pqm = &p->pqm;
 222			list_for_each_entry(pqn, &pqm->queues,
 223							process_queue_list) {
 224
 225				if (!(pqn->q && pqn->q->device == dev &&
 226				      pqn->q->doorbell_id == doorbell_id))
 227					continue;
 228
 229				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
 230							      trap_mask);
 231
 232				signaled_to_debugger_or_runtime = true;
 233
 234				break;
 235			}
 236
 237			mutex_unlock(&p->mutex);
 238		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
 239			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
 240			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
 241							exception_data);
 242
 243			signaled_to_debugger_or_runtime = true;
 244		}
 245	} else {
 246		signaled_to_debugger_or_runtime = true;
 247	}
 248
 249	kfd_unref_process(p);
 250
 251	return signaled_to_debugger_or_runtime;
 252}
 253
 254int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 255					unsigned int dev_id,
 256					unsigned int queue_id,
 257					uint64_t error_reason)
 258{
 259	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
 260		struct kfd_process_device *pdd = NULL;
 261		struct kfd_hsa_memory_exception_data *data;
 262		int i;
 263
 264		for (i = 0; i < p->n_pdds; i++) {
 265			if (p->pdds[i]->dev->id == dev_id) {
 266				pdd = p->pdds[i];
 267				break;
 268			}
 269		}
 270
 271		if (!pdd)
 272			return -ENODEV;
 273
 274		data = (struct kfd_hsa_memory_exception_data *)
 275						pdd->vm_fault_exc_data;
 276
 277		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
 278		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
 279		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
 280	}
 281
 282	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
 283		/*
 284		 * block should only happen after the debugger receives runtime
 285		 * enable notice.
 286		 */
 287		up(&p->runtime_enable_sema);
 288		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
 289	}
 290
 291	if (error_reason)
 292		return kfd_send_exception_to_runtime(p, queue_id, error_reason);
 293
 294	return 0;
 295}
 296
 297static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
 298{
 299	struct mqd_update_info minfo = {0};
 300	int err;
 301
 302	if (!q)
 303		return 0;
 304
 305	if (!kfd_dbg_has_cwsr_workaround(q->device))
 306		return 0;
 307
 308	if (enable && q->properties.is_user_cu_masked)
 309		return -EBUSY;
 310
 311	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
 312
 313	q->properties.is_dbg_wa = enable;
 314	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
 315	if (err)
 316		q->properties.is_dbg_wa = false;
 317
 318	return err;
 319}
 320
 321static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
 322{
 323	struct process_queue_manager *pqm = &target->pqm;
 324	struct process_queue_node *pqn;
 325	int r = 0;
 326
 327	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
 328		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
 329		if (enable && r)
 330			goto unwind;
 331	}
 332
 333	return 0;
 334
 335unwind:
 336	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
 337		kfd_dbg_set_queue_workaround(pqn->q, false);
 338
 339	if (enable)
 340		target->runtime_info.runtime_state = r == -EBUSY ?
 341				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
 342				DEBUG_RUNTIME_STATE_ENABLED_ERROR;
 343
 344	return r;
 345}
 346
 347int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
 348{
 349	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
 350	uint32_t flags = pdd->process->dbg_flags;
 351
 352	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
 353		return 0;
 354
 355	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
 356						pdd->watch_points, flags, sq_trap_en);
 357}
 358
 359#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
 360static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
 361{
 362	int i;
 363
 364	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
 365
 366	spin_lock(&pdd->dev->kfd->watch_points_lock);
 367
 368	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
 369		/* device watchpoint in use so skip */
 370		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
 371			continue;
 372
 373		pdd->alloc_watch_ids |= 0x1 << i;
 374		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
 375		*watch_id = i;
 376		spin_unlock(&pdd->dev->kfd->watch_points_lock);
 377		return 0;
 378	}
 379
 380	spin_unlock(&pdd->dev->kfd->watch_points_lock);
 381
 382	return -ENOMEM;
 383}
 384
 385static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
 386{
 387	spin_lock(&pdd->dev->kfd->watch_points_lock);
 388
 389	/* process owns device watch point so safe to clear */
 390	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
 391		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
 392		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
 393	}
 394
 395	spin_unlock(&pdd->dev->kfd->watch_points_lock);
 396}
 397
 398static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
 399{
 400	bool owns_watch_id = false;
 401
 402	spin_lock(&pdd->dev->kfd->watch_points_lock);
 403	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
 404			((pdd->alloc_watch_ids >> watch_id) & 0x1);
 405
 406	spin_unlock(&pdd->dev->kfd->watch_points_lock);
 407
 408	return owns_watch_id;
 409}
 410
 411int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
 412					uint32_t watch_id)
 413{
 414	int r;
 415
 416	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
 417		return -EINVAL;
 418
 419	if (!pdd->dev->kfd->shared_resources.enable_mes) {
 420		r = debug_lock_and_unmap(pdd->dev->dqm);
 421		if (r)
 422			return r;
 423	}
 424
 425	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 426	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
 427							pdd->dev->adev,
 428							watch_id);
 429	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 430
 431	if (!pdd->dev->kfd->shared_resources.enable_mes)
 432		r = debug_map_and_unlock(pdd->dev->dqm);
 433	else
 434		r = kfd_dbg_set_mes_debug_mode(pdd, true);
 435
 436	kfd_dbg_clear_dev_watch_id(pdd, watch_id);
 437
 438	return r;
 439}
 440
 441int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
 442					uint64_t watch_address,
 443					uint32_t watch_address_mask,
 444					uint32_t *watch_id,
 445					uint32_t watch_mode)
 446{
 447	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
 448	uint32_t xcc_mask = pdd->dev->xcc_mask;
 449
 450	if (r)
 451		return r;
 452
 453	if (!pdd->dev->kfd->shared_resources.enable_mes) {
 454		r = debug_lock_and_unmap(pdd->dev->dqm);
 455		if (r) {
 456			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
 457			return r;
 458		}
 459	}
 460
 461	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 462	for_each_inst(xcc_id, xcc_mask)
 463		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
 464				pdd->dev->adev,
 465				watch_address,
 466				watch_address_mask,
 467				*watch_id,
 468				watch_mode,
 469				pdd->dev->vm_info.last_vmid_kfd,
 470				xcc_id);
 471	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 472
 473	if (!pdd->dev->kfd->shared_resources.enable_mes)
 474		r = debug_map_and_unlock(pdd->dev->dqm);
 475	else
 476		r = kfd_dbg_set_mes_debug_mode(pdd, true);
 477
 478	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
 479	if (r)
 480		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
 481
 482	return 0;
 483}
 484
 485static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
 486{
 487	int i, j;
 488
 489	for (i = 0; i < target->n_pdds; i++)
 490		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
 491			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
 492}
 493
 494int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
 495{
 496	uint32_t prev_flags = target->dbg_flags;
 497	int i, r = 0, rewind_count = 0;
 498
 499	for (i = 0; i < target->n_pdds; i++) {
 500		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
 501			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
 502			*flags = prev_flags;
 503			return -EACCES;
 504		}
 505	}
 506
 507	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
 508	*flags = prev_flags;
 509	for (i = 0; i < target->n_pdds; i++) {
 510		struct kfd_process_device *pdd = target->pdds[i];
 511
 512		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
 513			continue;
 514
 515		if (!pdd->dev->kfd->shared_resources.enable_mes)
 516			r = debug_refresh_runlist(pdd->dev->dqm);
 517		else
 518			r = kfd_dbg_set_mes_debug_mode(pdd, true);
 519
 520		if (r) {
 521			target->dbg_flags = prev_flags;
 522			break;
 523		}
 524
 525		rewind_count++;
 526	}
 527
 528	/* Rewind flags */
 529	if (r) {
 530		target->dbg_flags = prev_flags;
 531
 532		for (i = 0; i < rewind_count; i++) {
 533			struct kfd_process_device *pdd = target->pdds[i];
 534
 535			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
 536				continue;
 537
 538			if (!pdd->dev->kfd->shared_resources.enable_mes)
 539				debug_refresh_runlist(pdd->dev->dqm);
 540			else
 541				kfd_dbg_set_mes_debug_mode(pdd, true);
 542		}
 543	}
 544
 545	return r;
 546}
 547
 548/* kfd_dbg_trap_deactivate:
 549 *	target: target process
 550 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 551 *	unwind_count:
 552 *		If unwind == true, how far down the pdd list we need
 553 *				to unwind
 554 *		else: ignored
 555 */
 556void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
 557{
 558	int i;
 559
 560	if (!unwind) {
 561		uint32_t flags = 0;
 562		int resume_count = resume_queues(target, 0, NULL);
 563
 564		if (resume_count)
 565			pr_debug("Resumed %d queues\n", resume_count);
 566
 567		cancel_work_sync(&target->debug_event_workarea);
 568		kfd_dbg_clear_process_address_watch(target);
 569		kfd_dbg_trap_set_wave_launch_mode(target, 0);
 570
 571		kfd_dbg_trap_set_flags(target, &flags);
 572	}
 573
 574	for (i = 0; i < target->n_pdds; i++) {
 575		struct kfd_process_device *pdd = target->pdds[i];
 576
 577		/* If this is an unwind, and we have unwound the required
 578		 * enable calls on the pdd list, we need to stop now
 579		 * otherwise we may mess up another debugger session.
 580		 */
 581		if (unwind && i == unwind_count)
 582			break;
 583
 584		kfd_process_set_trap_debug_flag(&pdd->qpd, false);
 585
 586		/* GFX off is already disabled by debug activate if not RLC restore supported. */
 587		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
 588			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 589		pdd->spi_dbg_override =
 590				pdd->dev->kfd2kgd->disable_debug_trap(
 591				pdd->dev->adev,
 592				target->runtime_info.ttmp_setup,
 593				pdd->dev->vm_info.last_vmid_kfd);
 594		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 595
 596		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
 597				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
 598			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
 599
 600		if (!pdd->dev->kfd->shared_resources.enable_mes)
 601			debug_refresh_runlist(pdd->dev->dqm);
 602		else
 603			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
 604	}
 605
 606	kfd_dbg_set_workaround(target, false);
 607}
 608
 609static void kfd_dbg_clean_exception_status(struct kfd_process *target)
 610{
 611	struct process_queue_manager *pqm;
 612	struct process_queue_node *pqn;
 613	int i;
 614
 615	for (i = 0; i < target->n_pdds; i++) {
 616		struct kfd_process_device *pdd = target->pdds[i];
 617
 618		kfd_process_drain_interrupts(pdd);
 619
 620		pdd->exception_status = 0;
 621	}
 622
 623	pqm = &target->pqm;
 624	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
 625		if (!pqn->q)
 626			continue;
 627
 628		pqn->q->properties.exception_status = 0;
 629	}
 630
 631	target->exception_status = 0;
 632}
 633
 634int kfd_dbg_trap_disable(struct kfd_process *target)
 635{
 636	if (!target->debug_trap_enabled)
 637		return 0;
 638
 639	/*
 640	 * Defer deactivation to runtime if runtime not enabled otherwise reset
 641	 * attached running target runtime state to enable for re-attach.
 642	 */
 643	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
 644		kfd_dbg_trap_deactivate(target, false, 0);
 645	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
 646		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
 647
 648	fput(target->dbg_ev_file);
 649	target->dbg_ev_file = NULL;
 650
 651	if (target->debugger_process) {
 652		atomic_dec(&target->debugger_process->debugged_process_count);
 653		target->debugger_process = NULL;
 654	}
 655
 656	target->debug_trap_enabled = false;
 657	kfd_dbg_clean_exception_status(target);
 658	kfd_unref_process(target);
 659
 660	return 0;
 661}
 662
 663int kfd_dbg_trap_activate(struct kfd_process *target)
 664{
 665	int i, r = 0;
 666
 667	r = kfd_dbg_set_workaround(target, true);
 668	if (r)
 669		return r;
 670
 671	for (i = 0; i < target->n_pdds; i++) {
 672		struct kfd_process_device *pdd = target->pdds[i];
 673
 674		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
 675			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
 676
 677			if (r) {
 678				target->runtime_info.runtime_state = (r == -EBUSY) ?
 679							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
 680							DEBUG_RUNTIME_STATE_ENABLED_ERROR;
 681
 682				goto unwind_err;
 683			}
 684		}
 685
 686		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
 687		 * If RLC restore of debug registers is not supported and runtime enable
 688		 * hasn't done so already on ttmp setup request, restore the trap config registers.
 689		 *
 690		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
 691		 * the debug session.
 692		 */
 693		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 694		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
 695						target->runtime_info.ttmp_setup))
 696			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
 697								pdd->dev->vm_info.last_vmid_kfd);
 698
 699		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
 700					pdd->dev->adev,
 701					false,
 702					pdd->dev->vm_info.last_vmid_kfd);
 703
 704		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
 705			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 706
 707		/*
 708		 * Setting the debug flag in the trap handler requires that the TMA has been
 709		 * allocated, which occurs during CWSR initialization.
 710		 * In the event that CWSR has not been initialized at this point, setting the
 711		 * flag will be called again during CWSR initialization if the target process
 712		 * is still debug enabled.
 713		 */
 714		kfd_process_set_trap_debug_flag(&pdd->qpd, true);
 715
 716		if (!pdd->dev->kfd->shared_resources.enable_mes)
 717			r = debug_refresh_runlist(pdd->dev->dqm);
 718		else
 719			r = kfd_dbg_set_mes_debug_mode(pdd, true);
 720
 721		if (r) {
 722			target->runtime_info.runtime_state =
 723					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
 724			goto unwind_err;
 725		}
 726	}
 727
 728	return 0;
 729
 730unwind_err:
 731	/* Enabling debug failed, we need to disable on
 732	 * all GPUs so the enable is all or nothing.
 733	 */
 734	kfd_dbg_trap_deactivate(target, true, i);
 735	return r;
 736}
 737
 738int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 739			void __user *runtime_info, uint32_t *runtime_size)
 740{
 741	struct file *f;
 742	uint32_t copy_size;
 743	int i, r = 0;
 744
 745	if (target->debug_trap_enabled)
 746		return -EALREADY;
 747
 748	/* Enable pre-checks */
 749	for (i = 0; i < target->n_pdds; i++) {
 750		struct kfd_process_device *pdd = target->pdds[i];
 751
 752		if (!KFD_IS_SOC15(pdd->dev))
 753			return -ENODEV;
 754
 755		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
 756					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
 757			return -EBUSY;
 758	}
 759
 760	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
 761
 762	f = fget(fd);
 763	if (!f) {
 764		pr_err("Failed to get file for (%i)\n", fd);
 765		return -EBADF;
 766	}
 767
 768	target->dbg_ev_file = f;
 769
 770	/* defer activation to runtime if not runtime enabled */
 771	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
 772		kfd_dbg_trap_activate(target);
 773
 774	/* We already hold the process reference but hold another one for the
 775	 * debug session.
 776	 */
 777	kref_get(&target->ref);
 778	target->debug_trap_enabled = true;
 779
 780	if (target->debugger_process)
 781		atomic_inc(&target->debugger_process->debugged_process_count);
 782
 783	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
 784		kfd_dbg_trap_deactivate(target, false, 0);
 785		r = -EFAULT;
 786	}
 787
 788	*runtime_size = sizeof(target->runtime_info);
 789
 790	return r;
 791}
 792
 793static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
 794						uint32_t trap_override,
 795						uint32_t trap_mask_request,
 796						uint32_t *trap_mask_supported)
 797{
 798	int i = 0;
 799
 800	*trap_mask_supported = 0xffffffff;
 801
 802	for (i = 0; i < p->n_pdds; i++) {
 803		struct kfd_process_device *pdd = p->pdds[i];
 804		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
 805								pdd->dev->adev,
 806								trap_override,
 807								trap_mask_supported);
 808
 809		if (err)
 810			return err;
 811	}
 812
 813	if (trap_mask_request & ~*trap_mask_supported)
 814		return -EACCES;
 815
 816	return 0;
 817}
 818
 819int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
 820					uint32_t trap_override,
 821					uint32_t trap_mask_bits,
 822					uint32_t trap_mask_request,
 823					uint32_t *trap_mask_prev,
 824					uint32_t *trap_mask_supported)
 825{
 826	int r = 0, i;
 827
 828	r = kfd_dbg_validate_trap_override_request(target,
 829						trap_override,
 830						trap_mask_request,
 831						trap_mask_supported);
 832
 833	if (r)
 834		return r;
 835
 836	for (i = 0; i < target->n_pdds; i++) {
 837		struct kfd_process_device *pdd = target->pdds[i];
 838
 839		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 840		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
 841				pdd->dev->adev,
 842				pdd->dev->vm_info.last_vmid_kfd,
 843				trap_override,
 844				trap_mask_bits,
 845				trap_mask_request,
 846				trap_mask_prev,
 847				pdd->spi_dbg_override);
 848		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 849
 850		if (!pdd->dev->kfd->shared_resources.enable_mes)
 851			r = debug_refresh_runlist(pdd->dev->dqm);
 852		else
 853			r = kfd_dbg_set_mes_debug_mode(pdd, true);
 854
 855		if (r)
 856			break;
 857	}
 858
 859	return r;
 860}
 861
 862int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
 863					uint8_t wave_launch_mode)
 864{
 865	int r = 0, i;
 866
 867	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
 868			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
 869			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
 870		return -EINVAL;
 871
 872	for (i = 0; i < target->n_pdds; i++) {
 873		struct kfd_process_device *pdd = target->pdds[i];
 874
 875		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 876		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
 877				pdd->dev->adev,
 878				wave_launch_mode,
 879				pdd->dev->vm_info.last_vmid_kfd);
 880		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 881
 882		if (!pdd->dev->kfd->shared_resources.enable_mes)
 883			r = debug_refresh_runlist(pdd->dev->dqm);
 884		else
 885			r = kfd_dbg_set_mes_debug_mode(pdd, true);
 886
 887		if (r)
 888			break;
 889	}
 890
 891	return r;
 892}
 893
 894int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
 895		uint32_t source_id,
 896		uint32_t exception_code,
 897		bool clear_exception,
 898		void __user *info,
 899		uint32_t *info_size)
 900{
 901	bool found = false;
 902	int r = 0;
 903	uint32_t copy_size, actual_info_size = 0;
 904	uint64_t *exception_status_ptr = NULL;
 905
 906	if (!target)
 907		return -EINVAL;
 908
 909	if (!info || !info_size)
 910		return -EINVAL;
 911
 912	mutex_lock(&target->event_mutex);
 913
 914	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
 915		/* Per queue exceptions */
 916		struct queue *queue = NULL;
 917		int i;
 918
 919		for (i = 0; i < target->n_pdds; i++) {
 920			struct kfd_process_device *pdd = target->pdds[i];
 921			struct qcm_process_device *qpd = &pdd->qpd;
 922
 923			list_for_each_entry(queue, &qpd->queues_list, list) {
 924				if (!found && queue->properties.queue_id == source_id) {
 925					found = true;
 926					break;
 927				}
 928			}
 929			if (found)
 930				break;
 931		}
 932
 933		if (!found) {
 934			r = -EINVAL;
 935			goto out;
 936		}
 937
 938		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
 939			r = -ENODATA;
 940			goto out;
 941		}
 942		exception_status_ptr = &queue->properties.exception_status;
 943	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
 944		/* Per device exceptions */
 945		struct kfd_process_device *pdd = NULL;
 946		int i;
 947
 948		for (i = 0; i < target->n_pdds; i++) {
 949			pdd = target->pdds[i];
 950			if (pdd->dev->id == source_id) {
 951				found = true;
 952				break;
 953			}
 954		}
 955
 956		if (!found) {
 957			r = -EINVAL;
 958			goto out;
 959		}
 960
 961		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
 962			r = -ENODATA;
 963			goto out;
 964		}
 965
 966		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
 967			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);
 968
 969			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
 970				r = -EFAULT;
 971				goto out;
 972			}
 973			actual_info_size = pdd->vm_fault_exc_data_size;
 974			if (clear_exception) {
 975				kfree(pdd->vm_fault_exc_data);
 976				pdd->vm_fault_exc_data = NULL;
 977				pdd->vm_fault_exc_data_size = 0;
 978			}
 979		}
 980		exception_status_ptr = &pdd->exception_status;
 981	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
 982		/* Per process exceptions */
 983		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
 984			r = -ENODATA;
 985			goto out;
 986		}
 987
 988		if (exception_code == EC_PROCESS_RUNTIME) {
 989			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));
 990
 991			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
 992				r = -EFAULT;
 993				goto out;
 994			}
 995
 996			actual_info_size = sizeof(target->runtime_info);
 997		}
 998
 999		exception_status_ptr = &target->exception_status;
1000	} else {
1001		pr_debug("Bad exception type [%i]\n", exception_code);
1002		r = -EINVAL;
1003		goto out;
1004	}
1005
1006	*info_size = actual_info_size;
1007	if (clear_exception)
1008		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
1009out:
1010	mutex_unlock(&target->event_mutex);
1011	return r;
1012}
1013
1014int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
1015		uint64_t exception_clear_mask,
1016		void __user *user_info,
1017		uint32_t *number_of_device_infos,
1018		uint32_t *entry_size)
1019{
1020	struct kfd_dbg_device_info_entry device_info;
1021	uint32_t tmp_entry_size, tmp_num_devices;
1022	int i, r = 0;
1023
1024	if (!(target && user_info && number_of_device_infos && entry_size))
1025		return -EINVAL;
1026
1027	tmp_entry_size = *entry_size;
1028
1029	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
1030	*number_of_device_infos = target->n_pdds;
1031	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
1032
1033	if (!tmp_num_devices)
1034		return 0;
1035
1036	memset(&device_info, 0, sizeof(device_info));
1037
1038	mutex_lock(&target->event_mutex);
1039
1040	/* Run over all pdd of the process */
1041	for (i = 0; i < tmp_num_devices; i++) {
1042		struct kfd_process_device *pdd = target->pdds[i];
1043		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
1044
1045		device_info.gpu_id = pdd->dev->id;
1046		device_info.exception_status = pdd->exception_status;
1047		device_info.lds_base = pdd->lds_base;
1048		device_info.lds_limit = pdd->lds_limit;
1049		device_info.scratch_base = pdd->scratch_base;
1050		device_info.scratch_limit = pdd->scratch_limit;
1051		device_info.gpuvm_base = pdd->gpuvm_base;
1052		device_info.gpuvm_limit = pdd->gpuvm_limit;
1053		device_info.location_id = topo_dev->node_props.location_id;
1054		device_info.vendor_id = topo_dev->node_props.vendor_id;
1055		device_info.device_id = topo_dev->node_props.device_id;
1056		device_info.revision_id = pdd->dev->adev->pdev->revision;
1057		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
1058		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
1059		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
1060		device_info.gfx_target_version =
1061			topo_dev->node_props.gfx_target_version;
1062		device_info.simd_count = topo_dev->node_props.simd_count;
1063		device_info.max_waves_per_simd =
1064			topo_dev->node_props.max_waves_per_simd;
1065		device_info.array_count = topo_dev->node_props.array_count;
1066		device_info.simd_arrays_per_engine =
1067			topo_dev->node_props.simd_arrays_per_engine;
1068		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
1069		device_info.capability = topo_dev->node_props.capability;
1070		device_info.debug_prop = topo_dev->node_props.debug_prop;
1071
1072		if (exception_clear_mask)
1073			pdd->exception_status &= ~exception_clear_mask;
1074
1075		if (copy_to_user(user_info, &device_info, *entry_size)) {
1076			r = -EFAULT;
1077			break;
1078		}
1079
1080		user_info += tmp_entry_size;
1081	}
1082
1083	mutex_unlock(&target->event_mutex);
1084
1085	return r;
1086}
1087
1088void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
1089					uint64_t exception_set_mask)
1090{
1091	uint64_t found_mask = 0;
1092	struct process_queue_manager *pqm;
1093	struct process_queue_node *pqn;
1094	static const char write_data = '.';
1095	loff_t pos = 0;
1096	int i;
1097
1098	mutex_lock(&target->event_mutex);
1099
1100	found_mask |= target->exception_status;
1101
1102	pqm = &target->pqm;
1103	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
1104		if (!pqn->q)
1105			continue;
1106
1107		found_mask |= pqn->q->properties.exception_status;
1108	}
1109
1110	for (i = 0; i < target->n_pdds; i++) {
1111		struct kfd_process_device *pdd = target->pdds[i];
1112
1113		found_mask |= pdd->exception_status;
1114	}
1115
1116	if (exception_set_mask & found_mask)
1117		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);
1118
1119	target->exception_enable_mask = exception_set_mask;
1120
1121	mutex_unlock(&target->event_mutex);
1122}
v6.13.7
   1/*
   2 * Copyright 2023 Advanced Micro Devices, Inc.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice shall be included in
  12 * all copies or substantial portions of the Software.
  13 *
  14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20 * OTHER DEALINGS IN THE SOFTWARE.
  21 */
  22
  23#include "kfd_debug.h"
  24#include "kfd_device_queue_manager.h"
  25#include "kfd_topology.h"
  26#include <linux/file.h>
  27#include <uapi/linux/kfd_ioctl.h>
  28#include <uapi/linux/kfd_sysfs.h>
  29
  30#define MAX_WATCH_ADDRESSES	4
  31
  32int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
  33		      unsigned int *queue_id,
  34		      unsigned int *gpu_id,
  35		      uint64_t exception_clear_mask,
  36		      uint64_t *event_status)
  37{
  38	struct process_queue_manager *pqm;
  39	struct process_queue_node *pqn;
  40	int i;
  41
  42	if (!(process && process->debug_trap_enabled))
  43		return -ENODATA;
  44
  45	mutex_lock(&process->event_mutex);
  46	*event_status = 0;
  47	*queue_id = 0;
  48	*gpu_id = 0;
  49
  50	/* find and report queue events */
  51	pqm = &process->pqm;
  52	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
  53		uint64_t tmp = process->exception_enable_mask;
  54
  55		if (!pqn->q)
  56			continue;
  57
  58		tmp &= pqn->q->properties.exception_status;
  59
  60		if (!tmp)
  61			continue;
  62
  63		*event_status = pqn->q->properties.exception_status;
  64		*queue_id = pqn->q->properties.queue_id;
  65		*gpu_id = pqn->q->device->id;
  66		pqn->q->properties.exception_status &= ~exception_clear_mask;
  67		goto out;
  68	}
  69
  70	/* find and report device events */
  71	for (i = 0; i < process->n_pdds; i++) {
  72		struct kfd_process_device *pdd = process->pdds[i];
  73		uint64_t tmp = process->exception_enable_mask
  74						& pdd->exception_status;
  75
  76		if (!tmp)
  77			continue;
  78
  79		*event_status = pdd->exception_status;
  80		*gpu_id = pdd->dev->id;
  81		pdd->exception_status &= ~exception_clear_mask;
  82		goto out;
  83	}
  84
  85	/* report process events */
  86	if (process->exception_enable_mask & process->exception_status) {
  87		*event_status = process->exception_status;
  88		process->exception_status &= ~exception_clear_mask;
  89	}
  90
  91out:
  92	mutex_unlock(&process->event_mutex);
  93	return *event_status ? 0 : -EAGAIN;
  94}
  95
  96void debug_event_write_work_handler(struct work_struct *work)
  97{
  98	struct kfd_process *process;
  99
 100	static const char write_data = '.';
 101	loff_t pos = 0;
 102
 103	process = container_of(work,
 104			struct kfd_process,
 105			debug_event_workarea);
 106
 107	if (process->debug_trap_enabled && process->dbg_ev_file)
 108		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
 109}
 110
 111/* update process/device/queue exception status, write to descriptor
 112 * only if exception_status is enabled.
 113 */
 114bool kfd_dbg_ev_raise(uint64_t event_mask,
 115			struct kfd_process *process, struct kfd_node *dev,
 116			unsigned int source_id, bool use_worker,
 117			void *exception_data, size_t exception_data_size)
 118{
 119	struct process_queue_manager *pqm;
 120	struct process_queue_node *pqn;
 121	int i;
 122	static const char write_data = '.';
 123	loff_t pos = 0;
 124	bool is_subscribed = true;
 125
 126	if (!(process && process->debug_trap_enabled))
 127		return false;
 128
 129	mutex_lock(&process->event_mutex);
 130
 131	if (event_mask & KFD_EC_MASK_DEVICE) {
 132		for (i = 0; i < process->n_pdds; i++) {
 133			struct kfd_process_device *pdd = process->pdds[i];
 134
 135			if (pdd->dev != dev)
 136				continue;
 137
 138			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
 139
 140			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
 141				if (!pdd->vm_fault_exc_data) {
 142					pdd->vm_fault_exc_data = kmemdup(
 143							exception_data,
 144							exception_data_size,
 145							GFP_KERNEL);
 146					if (!pdd->vm_fault_exc_data)
 147						pr_debug("Failed to allocate exception data memory");
 148				} else {
 149					pr_debug("Debugger exception data not saved\n");
 150					print_hex_dump_bytes("exception data: ",
 151							DUMP_PREFIX_OFFSET,
 152							exception_data,
 153							exception_data_size);
 154				}
 155			}
 156			break;
 157		}
 158	} else if (event_mask & KFD_EC_MASK_PROCESS) {
 159		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
 160	} else {
 161		pqm = &process->pqm;
 162		list_for_each_entry(pqn, &pqm->queues,
 163				process_queue_list) {
 164			int target_id;
 165
 166			if (!pqn->q)
 167				continue;
 168
 169			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
 170					pqn->q->properties.queue_id :
 171							pqn->q->doorbell_id;
 172
 173			if (pqn->q->device != dev || target_id != source_id)
 174				continue;
 175
 176			pqn->q->properties.exception_status |= event_mask;
 177			break;
 178		}
 179	}
 180
 181	if (process->exception_enable_mask & event_mask) {
 182		if (use_worker)
 183			schedule_work(&process->debug_event_workarea);
 184		else
 185			kernel_write(process->dbg_ev_file,
 186					&write_data,
 187					1,
 188					&pos);
 189	} else {
 190		is_subscribed = false;
 191	}
 192
 193	mutex_unlock(&process->event_mutex);
 194
 195	return is_subscribed;
 196}
 197
 198/* set pending event queue entry from ring entry  */
 199bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
 200				   unsigned int pasid,
 201				   uint32_t doorbell_id,
 202				   uint64_t trap_mask,
 203				   void *exception_data,
 204				   size_t exception_data_size)
 205{
 206	struct kfd_process *p;
 207	bool signaled_to_debugger_or_runtime = false;
 208
 209	p = kfd_lookup_process_by_pasid(pasid);
 210
 211	if (!p)
 212		return false;
 213
 214	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
 215			      exception_data, exception_data_size)) {
 216		struct process_queue_manager *pqm;
 217		struct process_queue_node *pqn;
 218
 219		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
 220		       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
 221			mutex_lock(&p->mutex);
 222
 223			pqm = &p->pqm;
 224			list_for_each_entry(pqn, &pqm->queues,
 225							process_queue_list) {
 226
 227				if (!(pqn->q && pqn->q->device == dev &&
 228				      pqn->q->doorbell_id == doorbell_id))
 229					continue;
 230
 231				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
 232							      trap_mask);
 233
 234				signaled_to_debugger_or_runtime = true;
 235
 236				break;
 237			}
 238
 239			mutex_unlock(&p->mutex);
 240		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
 241			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
 242			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
 243							exception_data);
 244
 245			signaled_to_debugger_or_runtime = true;
 246		}
 247	} else {
 248		signaled_to_debugger_or_runtime = true;
 249	}
 250
 251	kfd_unref_process(p);
 252
 253	return signaled_to_debugger_or_runtime;
 254}
 255
 256int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
 257					unsigned int dev_id,
 258					unsigned int queue_id,
 259					uint64_t error_reason)
 260{
 261	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
 262		struct kfd_process_device *pdd = NULL;
 263		struct kfd_hsa_memory_exception_data *data;
 264		int i;
 265
 266		for (i = 0; i < p->n_pdds; i++) {
 267			if (p->pdds[i]->dev->id == dev_id) {
 268				pdd = p->pdds[i];
 269				break;
 270			}
 271		}
 272
 273		if (!pdd)
 274			return -ENODEV;
 275
 276		data = (struct kfd_hsa_memory_exception_data *)
 277						pdd->vm_fault_exc_data;
 278
 279		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
 280		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
 281		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
 282	}
 283
 284	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
 285		/*
 286		 * block should only happen after the debugger receives runtime
 287		 * enable notice.
 288		 */
 289		up(&p->runtime_enable_sema);
 290		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
 291	}
 292
 293	if (error_reason)
 294		return kfd_send_exception_to_runtime(p, queue_id, error_reason);
 295
 296	return 0;
 297}
 298
 299static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
 300{
 301	struct mqd_update_info minfo = {0};
 302	int err;
 303
 304	if (!q)
 305		return 0;
 306
 307	if (!kfd_dbg_has_cwsr_workaround(q->device))
 308		return 0;
 309
 310	if (enable && q->properties.is_user_cu_masked)
 311		return -EBUSY;
 312
 313	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
 314
 315	q->properties.is_dbg_wa = enable;
 316	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
 317	if (err)
 318		q->properties.is_dbg_wa = false;
 319
 320	return err;
 321}
 322
 323static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
 324{
 325	struct process_queue_manager *pqm = &target->pqm;
 326	struct process_queue_node *pqn;
 327	int r = 0;
 328
 329	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
 330		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
 331		if (enable && r)
 332			goto unwind;
 333	}
 334
 335	return 0;
 336
 337unwind:
 338	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
 339		kfd_dbg_set_queue_workaround(pqn->q, false);
 340
 341	if (enable)
 342		target->runtime_info.runtime_state = r == -EBUSY ?
 343				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
 344				DEBUG_RUNTIME_STATE_ENABLED_ERROR;
 345
 346	return r;
 347}
 348
 349int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
 350{
 351	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
 352	uint32_t flags = pdd->process->dbg_flags;
 353	struct amdgpu_device *adev = pdd->dev->adev;
 354	int r;
 355
 356	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
 357		return 0;
 358
 359	if (!pdd->proc_ctx_cpu_ptr) {
 360			r = amdgpu_amdkfd_alloc_gtt_mem(adev,
 361				AMDGPU_MES_PROC_CTX_SIZE,
 362				&pdd->proc_ctx_bo,
 363				&pdd->proc_ctx_gpu_addr,
 364				&pdd->proc_ctx_cpu_ptr,
 365				false);
 366		if (r) {
 367			dev_err(adev->dev,
 368			"failed to allocate process context bo\n");
 369			return r;
 370		}
 371		memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
 372	}
 373
 374	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
 375						pdd->watch_points, flags, sq_trap_en);
 376}
 377
 378#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
 379static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
 380{
 381	int i;
 382
 383	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
 384
 385	spin_lock(&pdd->dev->watch_points_lock);
 386
 387	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
 388		/* device watchpoint in use so skip */
 389		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
 390			continue;
 391
 392		pdd->alloc_watch_ids |= 0x1 << i;
 393		pdd->dev->alloc_watch_ids |= 0x1 << i;
 394		*watch_id = i;
 395		spin_unlock(&pdd->dev->watch_points_lock);
 396		return 0;
 397	}
 398
 399	spin_unlock(&pdd->dev->watch_points_lock);
 400
 401	return -ENOMEM;
 402}
 403
 404static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
 405{
 406	spin_lock(&pdd->dev->watch_points_lock);
 407
 408	/* process owns device watch point so safe to clear */
 409	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
 410		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
 411		pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
 412	}
 413
 414	spin_unlock(&pdd->dev->watch_points_lock);
 415}
 416
 417static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
 418{
 419	bool owns_watch_id = false;
 420
 421	spin_lock(&pdd->dev->watch_points_lock);
 422	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
 423			((pdd->alloc_watch_ids >> watch_id) & 0x1);
 424
 425	spin_unlock(&pdd->dev->watch_points_lock);
 426
 427	return owns_watch_id;
 428}
 429
 430int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
 431					uint32_t watch_id)
 432{
 433	int r;
 434
 435	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
 436		return -EINVAL;
 437
 438	if (!pdd->dev->kfd->shared_resources.enable_mes) {
 439		r = debug_lock_and_unmap(pdd->dev->dqm);
 440		if (r)
 441			return r;
 442	}
 443
 444	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 445	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
 446							pdd->dev->adev,
 447							watch_id);
 448	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 449
 450	if (!pdd->dev->kfd->shared_resources.enable_mes)
 451		r = debug_map_and_unlock(pdd->dev->dqm);
 452	else
 453		r = kfd_dbg_set_mes_debug_mode(pdd, true);
 454
 455	kfd_dbg_clear_dev_watch_id(pdd, watch_id);
 456
 457	return r;
 458}
 459
 460int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
 461					uint64_t watch_address,
 462					uint32_t watch_address_mask,
 463					uint32_t *watch_id,
 464					uint32_t watch_mode)
 465{
 466	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
 467	uint32_t xcc_mask = pdd->dev->xcc_mask;
 468
 469	if (r)
 470		return r;
 471
 472	if (!pdd->dev->kfd->shared_resources.enable_mes) {
 473		r = debug_lock_and_unmap(pdd->dev->dqm);
 474		if (r) {
 475			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
 476			return r;
 477		}
 478	}
 479
 480	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 481	for_each_inst(xcc_id, xcc_mask)
 482		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
 483				pdd->dev->adev,
 484				watch_address,
 485				watch_address_mask,
 486				*watch_id,
 487				watch_mode,
 488				pdd->dev->vm_info.last_vmid_kfd,
 489				xcc_id);
 490	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 491
 492	if (!pdd->dev->kfd->shared_resources.enable_mes)
 493		r = debug_map_and_unlock(pdd->dev->dqm);
 494	else
 495		r = kfd_dbg_set_mes_debug_mode(pdd, true);
 496
 497	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
 498	if (r)
 499		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
 500
 501	return 0;
 502}
 503
 504static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
 505{
 506	int i, j;
 507
 508	for (i = 0; i < target->n_pdds; i++)
 509		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
 510			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
 511}
 512
 513int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
 514{
 515	uint32_t prev_flags = target->dbg_flags;
 516	int i, r = 0, rewind_count = 0;
 517
 518	for (i = 0; i < target->n_pdds; i++) {
 519		struct kfd_topology_device *topo_dev =
 520				kfd_topology_device_by_id(target->pdds[i]->dev->id);
 521		uint32_t caps = topo_dev->node_props.capability;
 522
 523		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
 524			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
 525			*flags = prev_flags;
 526			return -EACCES;
 527		}
 528
 529		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
 530		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
 531			*flags = prev_flags;
 532			return -EACCES;
 533		}
 534	}
 535
 536	target->dbg_flags = *flags;
 537	*flags = prev_flags;
 538	for (i = 0; i < target->n_pdds; i++) {
 539		struct kfd_process_device *pdd = target->pdds[i];
 540
 541		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
 542			continue;
 543
 544		if (!pdd->dev->kfd->shared_resources.enable_mes)
 545			r = debug_refresh_runlist(pdd->dev->dqm);
 546		else
 547			r = kfd_dbg_set_mes_debug_mode(pdd, true);
 548
 549		if (r) {
 550			target->dbg_flags = prev_flags;
 551			break;
 552		}
 553
 554		rewind_count++;
 555	}
 556
 557	/* Rewind flags */
 558	if (r) {
 559		target->dbg_flags = prev_flags;
 560
 561		for (i = 0; i < rewind_count; i++) {
 562			struct kfd_process_device *pdd = target->pdds[i];
 563
 564			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
 565				continue;
 566
 567			if (!pdd->dev->kfd->shared_resources.enable_mes)
 568				debug_refresh_runlist(pdd->dev->dqm);
 569			else
 570				kfd_dbg_set_mes_debug_mode(pdd, true);
 571		}
 572	}
 573
 574	return r;
 575}
 576
 577/* kfd_dbg_trap_deactivate:
 578 *	target: target process
 579 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 580 *	unwind_count:
 581 *		If unwind == true, how far down the pdd list we need
 582 *				to unwind
 583 *		else: ignored
 584 */
 585void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
 586{
 587	int i;
 588
 589	if (!unwind) {
 590		uint32_t flags = 0;
 591		int resume_count = resume_queues(target, 0, NULL);
 592
 593		if (resume_count)
 594			pr_debug("Resumed %d queues\n", resume_count);
 595
 596		cancel_work_sync(&target->debug_event_workarea);
 597		kfd_dbg_clear_process_address_watch(target);
 598		kfd_dbg_trap_set_wave_launch_mode(target, 0);
 599
 600		kfd_dbg_trap_set_flags(target, &flags);
 601	}
 602
 603	for (i = 0; i < target->n_pdds; i++) {
 604		struct kfd_process_device *pdd = target->pdds[i];
 605
 606		/* If this is an unwind, and we have unwound the required
 607		 * enable calls on the pdd list, we need to stop now
 608		 * otherwise we may mess up another debugger session.
 609		 */
 610		if (unwind && i == unwind_count)
 611			break;
 612
 613		kfd_process_set_trap_debug_flag(&pdd->qpd, false);
 614
 615		/* GFX off is already disabled by debug activate if not RLC restore supported. */
 616		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
 617			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 618		pdd->spi_dbg_override =
 619				pdd->dev->kfd2kgd->disable_debug_trap(
 620				pdd->dev->adev,
 621				target->runtime_info.ttmp_setup,
 622				pdd->dev->vm_info.last_vmid_kfd);
 623		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 624
 625		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
 626				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
 627			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
 628
 629		if (!pdd->dev->kfd->shared_resources.enable_mes)
 630			debug_refresh_runlist(pdd->dev->dqm);
 631		else
 632			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
 633	}
 634
 635	kfd_dbg_set_workaround(target, false);
 636}
 637
 638static void kfd_dbg_clean_exception_status(struct kfd_process *target)
 639{
 640	struct process_queue_manager *pqm;
 641	struct process_queue_node *pqn;
 642	int i;
 643
 644	for (i = 0; i < target->n_pdds; i++) {
 645		struct kfd_process_device *pdd = target->pdds[i];
 646
 647		kfd_process_drain_interrupts(pdd);
 648
 649		pdd->exception_status = 0;
 650	}
 651
 652	pqm = &target->pqm;
 653	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
 654		if (!pqn->q)
 655			continue;
 656
 657		pqn->q->properties.exception_status = 0;
 658	}
 659
 660	target->exception_status = 0;
 661}
 662
 663int kfd_dbg_trap_disable(struct kfd_process *target)
 664{
 665	if (!target->debug_trap_enabled)
 666		return 0;
 667
 668	/*
 669	 * Defer deactivation to runtime if runtime not enabled otherwise reset
 670	 * attached running target runtime state to enable for re-attach.
 671	 */
 672	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
 673		kfd_dbg_trap_deactivate(target, false, 0);
 674	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
 675		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
 676
 677	cancel_work_sync(&target->debug_event_workarea);
 678	fput(target->dbg_ev_file);
 679	target->dbg_ev_file = NULL;
 680
 681	if (target->debugger_process) {
 682		atomic_dec(&target->debugger_process->debugged_process_count);
 683		target->debugger_process = NULL;
 684	}
 685
 686	target->debug_trap_enabled = false;
 687	kfd_dbg_clean_exception_status(target);
 688	kfd_unref_process(target);
 689
 690	return 0;
 691}
 692
 693int kfd_dbg_trap_activate(struct kfd_process *target)
 694{
 695	int i, r = 0;
 696
 697	r = kfd_dbg_set_workaround(target, true);
 698	if (r)
 699		return r;
 700
 701	for (i = 0; i < target->n_pdds; i++) {
 702		struct kfd_process_device *pdd = target->pdds[i];
 703
 704		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
 705			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
 706
 707			if (r) {
 708				target->runtime_info.runtime_state = (r == -EBUSY) ?
 709							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
 710							DEBUG_RUNTIME_STATE_ENABLED_ERROR;
 711
 712				goto unwind_err;
 713			}
 714		}
 715
 716		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
 717		 * If RLC restore of debug registers is not supported and runtime enable
 718		 * hasn't done so already on ttmp setup request, restore the trap config registers.
 719		 *
 720		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
 721		 * the debug session.
 722		 */
 723		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 724		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
 725						target->runtime_info.ttmp_setup))
 726			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
 727								pdd->dev->vm_info.last_vmid_kfd);
 728
 729		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
 730					pdd->dev->adev,
 731					false,
 732					pdd->dev->vm_info.last_vmid_kfd);
 733
 734		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
 735			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 736
 737		/*
 738		 * Setting the debug flag in the trap handler requires that the TMA has been
 739		 * allocated, which occurs during CWSR initialization.
 740		 * In the event that CWSR has not been initialized at this point, setting the
 741		 * flag will be called again during CWSR initialization if the target process
 742		 * is still debug enabled.
 743		 */
 744		kfd_process_set_trap_debug_flag(&pdd->qpd, true);
 745
 746		if (!pdd->dev->kfd->shared_resources.enable_mes)
 747			r = debug_refresh_runlist(pdd->dev->dqm);
 748		else
 749			r = kfd_dbg_set_mes_debug_mode(pdd, true);
 750
 751		if (r) {
 752			target->runtime_info.runtime_state =
 753					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
 754			goto unwind_err;
 755		}
 756	}
 757
 758	return 0;
 759
 760unwind_err:
 761	/* Enabling debug failed; disable it on all GPUs
 762	 * so that enabling is all or nothing.
 763	 */
 764	kfd_dbg_trap_deactivate(target, true, i);
 765	return r;
 766}
 767
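/*
 * kfd_dbg_trap_enable - attach a debugger to @target.
 *
 * Rejects non-SOC15 devices, and queues holding GWS when GWS debugging is
 * unsupported or the CWSR workaround applies.  Takes a reference on the debug
 * event file descriptor @fd, activates the trap state immediately if the
 * runtime is already enabled (otherwise activation happens at runtime enable),
 * takes an extra process reference for the debug session and copies the
 * runtime info back to user space, reporting the full size required.
 */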
 768int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 769			void __user *runtime_info, uint32_t *runtime_size)
 770{
 771	struct file *f;
 772	uint32_t copy_size;
 773	int i, r = 0;
 774
 775	if (target->debug_trap_enabled)
 776		return -EALREADY;
 777
 778	/* Enable pre-checks */
 779	for (i = 0; i < target->n_pdds; i++) {
 780		struct kfd_process_device *pdd = target->pdds[i];
 781
 782		if (!KFD_IS_SOC15(pdd->dev))
 783			return -ENODEV;
 784
 785		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
 786					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
 787			return -EBUSY;
 788	}
 789
 790	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
 791
 792	f = fget(fd);
 793	if (!f) {
 794		pr_err("Failed to get file for (%i)\n", fd);
 795		return -EBADF;
 796	}
 797
 798	target->dbg_ev_file = f;
 799
 800	/* Activate now if the runtime is already enabled; otherwise defer activation to runtime enable */
 801	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
 802		kfd_dbg_trap_activate(target);
 803
 804	/* We already hold the process reference, but take another one for the
 805	 * debug session.
 806	 */
 807	kref_get(&target->ref);
 808	target->debug_trap_enabled = true;
 809
 810	if (target->debugger_process)
 811		atomic_inc(&target->debugger_process->debugged_process_count);
 812
 813	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
 814		kfd_dbg_trap_deactivate(target, false, 0);
 815		r = -EFAULT;
 816	}
 817
 818	*runtime_size = sizeof(target->runtime_info);
 819
 820	return r;
 821}
 822
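/*
 * kfd_dbg_validate_trap_override_request - check a wave launch override request.
 *
 * Each device narrows *trap_mask_supported to the bits it can handle for the
 * given @trap_override.  Returns -EACCES if @trap_mask_request contains bits
 * outside the resulting supported mask.
 */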
 823static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
 824						uint32_t trap_override,
 825						uint32_t trap_mask_request,
 826						uint32_t *trap_mask_supported)
 827{
 828	int i = 0;
 829
 830	*trap_mask_supported = 0xffffffff;
 831
 832	for (i = 0; i < p->n_pdds; i++) {
 833		struct kfd_process_device *pdd = p->pdds[i];
 834		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
 835								pdd->dev->adev,
 836								trap_override,
 837								trap_mask_supported);
 838
 839		if (err)
 840			return err;
 841	}
 842
 843	if (trap_mask_request & ~*trap_mask_supported)
 844		return -EACCES;
 845
 846	return 0;
 847}
 848
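/*
 * kfd_dbg_trap_set_wave_launch_override - apply a wave launch trap override.
 *
 * Validates the request against every device first, then programs the
 * override per device with GFX OFF disabled around the register update.
 * The previous mask is returned through @trap_mask_prev, the new SPI debug
 * override is cached in the pdd, and the runlist (or MES debug mode) is
 * refreshed afterwards.
 */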
 849int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
 850					uint32_t trap_override,
 851					uint32_t trap_mask_bits,
 852					uint32_t trap_mask_request,
 853					uint32_t *trap_mask_prev,
 854					uint32_t *trap_mask_supported)
 855{
 856	int r = 0, i;
 857
 858	r = kfd_dbg_validate_trap_override_request(target,
 859						trap_override,
 860						trap_mask_request,
 861						trap_mask_supported);
 862
 863	if (r)
 864		return r;
 865
 866	for (i = 0; i < target->n_pdds; i++) {
 867		struct kfd_process_device *pdd = target->pdds[i];
 868
 869		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 870		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
 871				pdd->dev->adev,
 872				pdd->dev->vm_info.last_vmid_kfd,
 873				trap_override,
 874				trap_mask_bits,
 875				trap_mask_request,
 876				trap_mask_prev,
 877				pdd->spi_dbg_override);
 878		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 879
 880		if (!pdd->dev->kfd->shared_resources.enable_mes)
 881			r = debug_refresh_runlist(pdd->dev->dqm);
 882		else
 883			r = kfd_dbg_set_mes_debug_mode(pdd, true);
 884
 885		if (r)
 886			break;
 887	}
 888
 889	return r;
 890}
 891
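/*
 * kfd_dbg_trap_set_wave_launch_mode - set the wave launch mode on all devices.
 *
 * Only NORMAL, HALT and DEBUG modes are accepted.  The mode is programmed per
 * device with GFX OFF disabled, cached in pdd->spi_dbg_launch_mode, and the
 * runlist (or MES debug mode) is refreshed afterwards.
 */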
 892int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
 893					uint8_t wave_launch_mode)
 894{
 895	int r = 0, i;
 896
 897	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
 898			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
 899			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
 900		return -EINVAL;
 901
 902	for (i = 0; i < target->n_pdds; i++) {
 903		struct kfd_process_device *pdd = target->pdds[i];
 904
 905		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
 906		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
 907				pdd->dev->adev,
 908				wave_launch_mode,
 909				pdd->dev->vm_info.last_vmid_kfd);
 910		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 911
 912		if (!pdd->dev->kfd->shared_resources.enable_mes)
 913			r = debug_refresh_runlist(pdd->dev->dqm);
 914		else
 915			r = kfd_dbg_set_mes_debug_mode(pdd, true);
 916
 917		if (r)
 918			break;
 919	}
 920
 921	return r;
 922}
 923
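/*
 * kfd_dbg_trap_query_exception_info - copy exception details to user space.
 *
 * @source_id names a queue or a GPU depending on @exception_code, which may be
 * a queue, device or process level exception.  Device memory violations copy
 * the saved VM fault data and process runtime exceptions copy the runtime
 * info; other codes only report status.  Returns -ENODATA if the exception is
 * not pending, clears it when @clear_exception is set, and updates *info_size
 * with the actual payload size.
 */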
 924int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
 925		uint32_t source_id,
 926		uint32_t exception_code,
 927		bool clear_exception,
 928		void __user *info,
 929		uint32_t *info_size)
 930{
 931	bool found = false;
 932	int r = 0;
 933	uint32_t copy_size, actual_info_size = 0;
 934	uint64_t *exception_status_ptr = NULL;
 935
 936	if (!target)
 937		return -EINVAL;
 938
 939	if (!info || !info_size)
 940		return -EINVAL;
 941
 942	mutex_lock(&target->event_mutex);
 943
 944	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
 945		/* Per queue exceptions */
 946		struct queue *queue = NULL;
 947		int i;
 948
 949		for (i = 0; i < target->n_pdds; i++) {
 950			struct kfd_process_device *pdd = target->pdds[i];
 951			struct qcm_process_device *qpd = &pdd->qpd;
 952
 953			list_for_each_entry(queue, &qpd->queues_list, list) {
 954				if (!found && queue->properties.queue_id == source_id) {
 955					found = true;
 956					break;
 957				}
 958			}
 959			if (found)
 960				break;
 961		}
 962
 963		if (!found) {
 964			r = -EINVAL;
 965			goto out;
 966		}
 967
 968		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
 969			r = -ENODATA;
 970			goto out;
 971		}
 972		exception_status_ptr = &queue->properties.exception_status;
 973	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
 974		/* Per device exceptions */
 975		struct kfd_process_device *pdd = NULL;
 976		int i;
 977
 978		for (i = 0; i < target->n_pdds; i++) {
 979			pdd = target->pdds[i];
 980			if (pdd->dev->id == source_id) {
 981				found = true;
 982				break;
 983			}
 984		}
 985
 986		if (!found) {
 987			r = -EINVAL;
 988			goto out;
 989		}
 990
 991		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
 992			r = -ENODATA;
 993			goto out;
 994		}
 995
 996		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
 997			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);
 998
 999			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
1000				r = -EFAULT;
1001				goto out;
1002			}
1003			actual_info_size = pdd->vm_fault_exc_data_size;
1004			if (clear_exception) {
1005				kfree(pdd->vm_fault_exc_data);
1006				pdd->vm_fault_exc_data = NULL;
1007				pdd->vm_fault_exc_data_size = 0;
1008			}
1009		}
1010		exception_status_ptr = &pdd->exception_status;
1011	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
1012		/* Per process exceptions */
1013		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
1014			r = -ENODATA;
1015			goto out;
1016		}
1017
1018		if (exception_code == EC_PROCESS_RUNTIME) {
1019			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));
1020
1021			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
1022				r = -EFAULT;
1023				goto out;
1024			}
1025
1026			actual_info_size = sizeof(target->runtime_info);
1027		}
1028
1029		exception_status_ptr = &target->exception_status;
1030	} else {
1031		pr_debug("Bad exception type [%i]\n", exception_code);
1032		r = -EINVAL;
1033		goto out;
1034	}
1035
1036	*info_size = actual_info_size;
1037	if (clear_exception)
1038		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
1039out:
1040	mutex_unlock(&target->event_mutex);
1041	return r;
1042}
1043
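/*
 * kfd_dbg_trap_device_snapshot - copy per-device debug info to user space.
 *
 * Copies up to *number_of_device_infos entries of at most *entry_size bytes
 * each, then reports the total device count and clamps *entry_size to the
 * kernel's entry size.  Entries on the user buffer are strided by the
 * caller's original entry size, so the caller's own structure layout
 * determines the stride.  Pending device exceptions covered by
 * @exception_clear_mask are cleared as they are reported.
 */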
1044int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
1045		uint64_t exception_clear_mask,
1046		void __user *user_info,
1047		uint32_t *number_of_device_infos,
1048		uint32_t *entry_size)
1049{
1050	struct kfd_dbg_device_info_entry device_info;
1051	uint32_t tmp_entry_size, tmp_num_devices;
1052	int i, r = 0;
1053
1054	if (!(target && user_info && number_of_device_infos && entry_size))
1055		return -EINVAL;
1056
1057	tmp_entry_size = *entry_size;
1058
1059	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
1060	*number_of_device_infos = target->n_pdds;
1061	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
1062
1063	if (!tmp_num_devices)
1064		return 0;
1065
1066	memset(&device_info, 0, sizeof(device_info));
1067
1068	mutex_lock(&target->event_mutex);
1069
1070	/* Iterate over all PDDs (per-device data) of the process */
1071	for (i = 0; i < tmp_num_devices; i++) {
1072		struct kfd_process_device *pdd = target->pdds[i];
1073		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
1074
1075		device_info.gpu_id = pdd->dev->id;
1076		device_info.exception_status = pdd->exception_status;
1077		device_info.lds_base = pdd->lds_base;
1078		device_info.lds_limit = pdd->lds_limit;
1079		device_info.scratch_base = pdd->scratch_base;
1080		device_info.scratch_limit = pdd->scratch_limit;
1081		device_info.gpuvm_base = pdd->gpuvm_base;
1082		device_info.gpuvm_limit = pdd->gpuvm_limit;
1083		device_info.location_id = topo_dev->node_props.location_id;
1084		device_info.vendor_id = topo_dev->node_props.vendor_id;
1085		device_info.device_id = topo_dev->node_props.device_id;
1086		device_info.revision_id = pdd->dev->adev->pdev->revision;
1087		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
1088		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
1089		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
1090		device_info.gfx_target_version =
1091			topo_dev->node_props.gfx_target_version;
1092		device_info.simd_count = topo_dev->node_props.simd_count;
1093		device_info.max_waves_per_simd =
1094			topo_dev->node_props.max_waves_per_simd;
1095		device_info.array_count = topo_dev->node_props.array_count;
1096		device_info.simd_arrays_per_engine =
1097			topo_dev->node_props.simd_arrays_per_engine;
1098		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
1099		device_info.capability = topo_dev->node_props.capability;
1100		device_info.debug_prop = topo_dev->node_props.debug_prop;
1101
1102		if (exception_clear_mask)
1103			pdd->exception_status &= ~exception_clear_mask;
1104
1105		if (copy_to_user(user_info, &device_info, *entry_size)) {
1106			r = -EFAULT;
1107			break;
1108		}
1109
1110		user_info += tmp_entry_size;
1111	}
1112
1113	mutex_unlock(&target->event_mutex);
1114
1115	return r;
1116}
1117
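/*
 * kfd_dbg_set_enabled_debug_exception_mask - update the exception enable mask.
 *
 * Collects the exception status already pending on the process, its queues
 * and its devices; if any of those bits are covered by @exception_set_mask,
 * a single byte is written to the debug event file so a debugger waiting on
 * it is notified.  The new mask is then stored as the process's
 * exception_enable_mask.
 */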
1118void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
1119					uint64_t exception_set_mask)
1120{
1121	uint64_t found_mask = 0;
1122	struct process_queue_manager *pqm;
1123	struct process_queue_node *pqn;
1124	static const char write_data = '.';
1125	loff_t pos = 0;
1126	int i;
1127
1128	mutex_lock(&target->event_mutex);
1129
1130	found_mask |= target->exception_status;
1131
1132	pqm = &target->pqm;
1133	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
1134		if (!pqn->q)
1135			continue;
1136
1137		found_mask |= pqn->q->properties.exception_status;
1138	}
1139
1140	for (i = 0; i < target->n_pdds; i++) {
1141		struct kfd_process_device *pdd = target->pdds[i];
1142
1143		found_mask |= pdd->exception_status;
1144	}
1145
1146	if (exception_set_mask & found_mask)
1147		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);
1148
1149	target->exception_enable_mask = exception_set_mask;
1150
1151	mutex_unlock(&target->event_mutex);
1152}