link.c - drivers/misc/ocxl/link.c - Linux diff v6.8

  1// SPDX-License-Identifier: GPL-2.0+
  2// Copyright 2017 IBM Corp.
  3#include <linux/sched/mm.h>
  4#include <linux/mutex.h>
  5#include <linux/mm.h>
  6#include <linux/mm_types.h>
  7#include <linux/mmu_context.h>
  8#include <linux/mmu_notifier.h>
  9#include <linux/irqdomain.h>
 10#include <asm/copro.h>
 11#include <asm/pnv-ocxl.h>
 12#include <asm/xive.h>
 13#include <misc/ocxl.h>
 14#include "ocxl_internal.h"
 15#include "trace.h"
 16
 17
 18#define SPA_PASID_BITS		15
 19#define SPA_PASID_MAX		((1 << SPA_PASID_BITS) - 1)
 20#define SPA_PE_MASK		SPA_PASID_MAX
 21#define SPA_SPA_SIZE_LOG	22 /* Each SPA is 4 Mb */
 22
 23#define SPA_CFG_SF		(1ull << (63-0))
 24#define SPA_CFG_TA		(1ull << (63-1))
 25#define SPA_CFG_HV		(1ull << (63-3))
 26#define SPA_CFG_UV		(1ull << (63-4))
 27#define SPA_CFG_XLAT_hpt	(0ull << (63-6)) /* Hashed page table (HPT) mode */
 28#define SPA_CFG_XLAT_roh	(2ull << (63-6)) /* Radix on HPT mode */
 29#define SPA_CFG_XLAT_ror	(3ull << (63-6)) /* Radix on Radix mode */
 30#define SPA_CFG_PR		(1ull << (63-49))
 31#define SPA_CFG_TC		(1ull << (63-54))
 32#define SPA_CFG_DR		(1ull << (63-59))
 33
 34#define SPA_XSL_TF		(1ull << (63-3))  /* Translation fault */
 35#define SPA_XSL_S		(1ull << (63-38)) /* Store operation */
 36
 37#define SPA_PE_VALID		0x80000000
 38
 39struct ocxl_link;
 40
 41struct pe_data {
	/* Per-process-element bookkeeping, stored in spa->pe_tree. */
	/* Memory context of the PE; NULL is treated as a kernel context by the fault handler */
 42	struct mm_struct *mm;
 43	/* callback to trigger when a translation fault occurs */
 44	void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);
 45	/* opaque pointer to be passed to the above callback */
 46	void *xsl_err_data;
	/* lets the entry be released with kfree_rcu() once removed from the tree */
 47	struct rcu_head rcu;
	/* link owning this PE; read back by the secondary TLB invalidation callback */
 48	struct ocxl_link *link;
	/* registered against mm only when the link has an ATSD mapping (link->arva) */
 49	struct mmu_notifier mmu_notifier;
 50};
 51
 52struct spa {
	/* Shared Process Area state of one link. */
	/* array of process elements, allocated as 2^spa_order pages */
 53	struct ocxl_process_element *spa_mem;
 54	int spa_order;
	/* taken while a process element is added, updated or removed */
 55	struct mutex spa_lock;
 56	struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */
	/* name and virq of the translation fault interrupt */
 57	char *irq_name;
 58	int virq;
	/* XSL fault registers, mapped by pnv_ocxl_map_xsl_regs() */
 59	void __iomem *reg_dsisr;
 60	void __iomem *reg_dar;
 61	void __iomem *reg_tfc;
 62	void __iomem *reg_pe_handle;
 63	/*
 64	 * The following fields are used by the memory fault
 65	 * interrupt handler. We can only have one interrupt at a
 66	 * time. The NPU won't raise another interrupt until the
 67	 * previous one has been ack'd by writing to the TFC register
 68	 */
 69	struct xsl_fault {
 70		struct work_struct fault_work;
 71		u64 pe;
 72		u64 dsisr;
 73		u64 dar;
		/* copy of the faulting PE's data, taken in the interrupt handler */
 74		struct pe_data pe_data;
 75	} xsl_fault;
 76};
 77
 78/*
 79 * An opencapi link can be used by several PCI functions. We have
 80 * one link per device slot.
 81 *
 82 * A linked list of opencapi links should suffice, as there's a
 83 * limited number of opencapi slots on a system and lookup is only
 84 * done when the device is probed
 85 */
 86struct ocxl_link {
	/* entry in links_list */
 87	struct list_head list;
	/* one reference per ocxl_link_setup() caller sharing the link */
 88	struct kref ref;
	/* PCI domain / bus number / slot identifying the link */
 89	int domain;
 90	int bus;
 91	int dev;
 92	void __iomem *arva;     /* ATSD register virtual address */
 93	spinlock_t atsd_lock;   /* to serialize shootdowns */
	/* remaining interrupts that may be allocated, starts at MAX_IRQ_PER_LINK */
 94	atomic_t irq_available;
 95	struct spa *spa;
	/* opaque cookie filled in by pnv_ocxl_spa_setup() */
 96	void *platform_data;
 97};
 /* All known links; links_list_lock is held while the list is walked or modified. */
 98static LIST_HEAD(links_list);
 99static DEFINE_MUTEX(links_list_lock);
100
101enum xsl_response {
	/* Answers that can be requested from ack_irq(). */
	/* not supported: ack_irq() warns and writes nothing */
102	CONTINUE,
	/* acknowledged by writing PPC_BIT(30) to the TFC register */
103	ADDRESS_ERROR,
	/* acknowledged by writing PPC_BIT(31) to the TFC register */
104	RESTART,
105};
106
107
108static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe)
109{
110	u64 reg;
111
112	*dsisr = in_be64(spa->reg_dsisr);
113	*dar = in_be64(spa->reg_dar);
114	reg = in_be64(spa->reg_pe_handle);
115	*pe = reg & SPA_PE_MASK;
116}
117
118static void ack_irq(struct spa *spa, enum xsl_response r)
119{
120	u64 reg = 0;
121
122	/* continue is not supported */
123	if (r == RESTART)
124		reg = PPC_BIT(31);
125	else if (r == ADDRESS_ERROR)
126		reg = PPC_BIT(30);
127	else
128		WARN(1, "Invalid irq response %d\n", r);
129
130	if (reg) {
131		trace_ocxl_fault_ack(spa->spa_mem, spa->xsl_fault.pe,
132				spa->xsl_fault.dsisr, spa->xsl_fault.dar, reg);
133		out_be64(spa->reg_tfc, reg);
134	}
135}
136
137static void xsl_fault_handler_bh(struct work_struct *fault_work)
138{
139	vm_fault_t flt = 0;
140	unsigned long access, flags, inv_flags = 0;
141	enum xsl_response r;
142	struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
143					fault_work);
144	struct spa *spa = container_of(fault, struct spa, xsl_fault);
145
146	int rc;
147
148	/*
149	 * We must release a reference on mm_users whenever exiting this
150	 * function (taken in the memory fault interrupt handler)
151	 */
152	rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
153				&flt);
154	if (rc) {
155		pr_debug("copro_handle_mm_fault failed: %d\n", rc);
156		if (fault->pe_data.xsl_err_cb) {
157			fault->pe_data.xsl_err_cb(
158				fault->pe_data.xsl_err_data,
159				fault->dar, fault->dsisr);
160		}
161		r = ADDRESS_ERROR;
162		goto ack;
163	}
164
165	if (!radix_enabled()) {
166		/*
167		 * update_mmu_cache() will not have loaded the hash
168		 * since current->trap is not a 0x400 or 0x300, so
169		 * just call hash_page_mm() here.
170		 */
171		access = _PAGE_PRESENT | _PAGE_READ;
172		if (fault->dsisr & SPA_XSL_S)
173			access |= _PAGE_WRITE;
174
175		if (get_region_id(fault->dar) != USER_REGION_ID)
176			access |= _PAGE_PRIVILEGED;
177
178		local_irq_save(flags);
179		hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300,
180			inv_flags);
181		local_irq_restore(flags);
182	}
183	r = RESTART;
184ack:
185	mmput(fault->pe_data.mm);
186	ack_irq(spa, r);
187}
188
189static irqreturn_t xsl_fault_handler(int irq, void *data)
190{
191	struct ocxl_link *link = data;
192	struct spa *spa = link->spa;
193	u64 dsisr, dar, pe_handle;
194	struct pe_data *pe_data;
195	struct ocxl_process_element *pe;
196	int pid;
197	bool schedule = false;
198
199	read_irq(spa, &dsisr, &dar, &pe_handle);
200	trace_ocxl_fault(spa->spa_mem, pe_handle, dsisr, dar, -1);
201
202	WARN_ON(pe_handle > SPA_PE_MASK);
203	pe = spa->spa_mem + pe_handle;
204	pid = be32_to_cpu(pe->pid);
205	/* We could be reading all null values here if the PE is being
206	 * removed while an interrupt kicks in. It's not supposed to
207	 * happen if the driver notified the AFU to terminate the
208	 * PASID, and the AFU waited for pending operations before
209	 * acknowledging. But even if it happens, we won't find a
210	 * memory context below and fail silently, so it should be ok.
211	 */
212	if (!(dsisr & SPA_XSL_TF)) {
213		WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
214		ack_irq(spa, ADDRESS_ERROR);
215		return IRQ_HANDLED;
216	}
217
218	rcu_read_lock();
219	pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle);
220	if (!pe_data) {
221		/*
222		 * Could only happen if the driver didn't notify the
223		 * AFU about PASID termination before removing the PE,
224		 * or the AFU didn't wait for all memory access to
225		 * have completed.
226		 *
227		 * Either way, we fail early, but we shouldn't log an
228		 * error message, as it is a valid (if unexpected)
229		 * scenario
230		 */
231		rcu_read_unlock();
232		pr_debug("Unknown mm context for xsl interrupt\n");
233		ack_irq(spa, ADDRESS_ERROR);
234		return IRQ_HANDLED;
235	}
236
237	if (!pe_data->mm) {
238		/*
239		 * translation fault from a kernel context - an OpenCAPI
240		 * device tried to access a bad kernel address
241		 */
242		rcu_read_unlock();
243		pr_warn("Unresolved OpenCAPI xsl fault in kernel context\n");
244		ack_irq(spa, ADDRESS_ERROR);
245		return IRQ_HANDLED;
246	}
247	WARN_ON(pe_data->mm->context.id != pid);
248
249	if (mmget_not_zero(pe_data->mm)) {
250			spa->xsl_fault.pe = pe_handle;
251			spa->xsl_fault.dar = dar;
252			spa->xsl_fault.dsisr = dsisr;
253			spa->xsl_fault.pe_data = *pe_data;
254			schedule = true;
255			/* mm_users count released by bottom half */
256	}
257	rcu_read_unlock();
258	if (schedule)
259		schedule_work(&spa->xsl_fault.fault_work);
260	else
261		ack_irq(spa, ADDRESS_ERROR);
262	return IRQ_HANDLED;
263}
264
265static void unmap_irq_registers(struct spa *spa)
266{
267	pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr, spa->reg_dar, spa->reg_tfc,
268				spa->reg_pe_handle);
269}
270
271static int map_irq_registers(struct pci_dev *dev, struct spa *spa)
272{
273	return pnv_ocxl_map_xsl_regs(dev, &spa->reg_dsisr, &spa->reg_dar,
274				&spa->reg_tfc, &spa->reg_pe_handle);
275}
276
277static int setup_xsl_irq(struct pci_dev *dev, struct ocxl_link *link)
278{
279	struct spa *spa = link->spa;
280	int rc;
281	int hwirq;
282
283	rc = pnv_ocxl_get_xsl_irq(dev, &hwirq);
284	if (rc)
285		return rc;
286
287	rc = map_irq_registers(dev, spa);
288	if (rc)
289		return rc;
290
291	spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
292				link->domain, link->bus, link->dev);
293	if (!spa->irq_name) {
294		dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
295		rc = -ENOMEM;
296		goto err_xsl;
297	}
298	/*
299	 * At some point, we'll need to look into allowing a higher
300	 * number of interrupts. Could we have an IRQ domain per link?
301	 */
302	spa->virq = irq_create_mapping(NULL, hwirq);
303	if (!spa->virq) {
304		dev_err(&dev->dev,
305			"irq_create_mapping failed for translation interrupt\n");
306		rc = -EINVAL;
307		goto err_name;
308	}
309
310	dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);
311
312	rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
313			link);
314	if (rc) {
315		dev_err(&dev->dev,
316			"request_irq failed for translation interrupt: %d\n",
317			rc);
318		rc = -EINVAL;
319		goto err_mapping;
320	}
321	return 0;
322
323err_mapping:
324	irq_dispose_mapping(spa->virq);
325err_name:
326	kfree(spa->irq_name);
327err_xsl:
328	unmap_irq_registers(spa);
329	return rc;
330}
331
332static void release_xsl_irq(struct ocxl_link *link)
333{
334	struct spa *spa = link->spa;
335
336	if (spa->virq) {
337		free_irq(spa->virq, link);
338		irq_dispose_mapping(spa->virq);
339	}
340	kfree(spa->irq_name);
341	unmap_irq_registers(spa);
342}
343
344static int alloc_spa(struct pci_dev *dev, struct ocxl_link *link)
345{
346	struct spa *spa;
347
348	spa = kzalloc(sizeof(struct spa), GFP_KERNEL);
349	if (!spa)
350		return -ENOMEM;
351
352	mutex_init(&spa->spa_lock);
353	INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL);
354	INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh);
355
356	spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT;
357	spa->spa_mem = (struct ocxl_process_element *)
358		__get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order);
359	if (!spa->spa_mem) {
360		dev_err(&dev->dev, "Can't allocate Shared Process Area\n");
361		kfree(spa);
362		return -ENOMEM;
363	}
364	pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus,
365		link->dev, spa->spa_mem);
366
367	link->spa = spa;
368	return 0;
369}
370
371static void free_spa(struct ocxl_link *link)
372{
373	struct spa *spa = link->spa;
374
375	pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus,
376		link->dev);
377
378	if (spa && spa->spa_mem) {
379		free_pages((unsigned long) spa->spa_mem, spa->spa_order);
380		kfree(spa);
381		link->spa = NULL;
382	}
383}
384
385static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
386{
387	struct ocxl_link *link;
388	int rc;
389
390	link = kzalloc(sizeof(struct ocxl_link), GFP_KERNEL);
391	if (!link)
392		return -ENOMEM;
393
394	kref_init(&link->ref);
395	link->domain = pci_domain_nr(dev->bus);
396	link->bus = dev->bus->number;
397	link->dev = PCI_SLOT(dev->devfn);
398	atomic_set(&link->irq_available, MAX_IRQ_PER_LINK);
399	spin_lock_init(&link->atsd_lock);
400
401	rc = alloc_spa(dev, link);
402	if (rc)
403		goto err_free;
404
405	rc = setup_xsl_irq(dev, link);
406	if (rc)
407		goto err_spa;
408
409	/* platform specific hook */
410	rc = pnv_ocxl_spa_setup(dev, link->spa->spa_mem, PE_mask,
411				&link->platform_data);
412	if (rc)
413		goto err_xsl_irq;
414
415	/* if link->arva is not defeined, MMIO registers are not used to
416	 * generate TLB invalidate. PowerBus snooping is enabled.
417	 * Otherwise, PowerBus snooping is disabled. TLB Invalidates are
418	 * initiated using MMIO registers.
419	 */
420	pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0, &link->arva);
421
422	*out_link = link;
423	return 0;
424
425err_xsl_irq:
426	release_xsl_irq(link);
427err_spa:
428	free_spa(link);
429err_free:
430	kfree(link);
431	return rc;
432}
433
/*
 * Tear down a link: interrupt resources first, then the spa, and
 * finally the link structure itself.
 */
static void free_link(struct ocxl_link *link)
{
	release_xsl_irq(link);

	free_spa(link);

	kfree(link);
}
440
441int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle)
442{
443	int rc = 0;
444	struct ocxl_link *link;
445
446	mutex_lock(&links_list_lock);
447	list_for_each_entry(link, &links_list, list) {
448		/* The functions of a device all share the same link */
449		if (link->domain == pci_domain_nr(dev->bus) &&
450			link->bus == dev->bus->number &&
451			link->dev == PCI_SLOT(dev->devfn)) {
452			kref_get(&link->ref);
453			*link_handle = link;
454			goto unlock;
455		}
456	}
457	rc = alloc_link(dev, PE_mask, &link);
458	if (rc)
459		goto unlock;
460
461	list_add(&link->list, &links_list);
462	*link_handle = link;
463unlock:
464	mutex_unlock(&links_list_lock);
465	return rc;
466}
467EXPORT_SYMBOL_GPL(ocxl_link_setup);
468
469static void release_xsl(struct kref *ref)
470{
471	struct ocxl_link *link = container_of(ref, struct ocxl_link, ref);
472
473	if (link->arva) {
474		pnv_ocxl_unmap_lpar(link->arva);
475		link->arva = NULL;
476	}
477
478	list_del(&link->list);
479	/* call platform code before releasing data */
480	pnv_ocxl_spa_release(link->platform_data);
481	free_link(link);
482}
483
484void ocxl_link_release(struct pci_dev *dev, void *link_handle)
485{
486	struct ocxl_link *link = link_handle;
487
488	mutex_lock(&links_list_lock);
489	kref_put(&link->ref, release_xsl);
490	mutex_unlock(&links_list_lock);
491}
492EXPORT_SYMBOL_GPL(ocxl_link_release);
493
494static void arch_invalidate_secondary_tlbs(struct mmu_notifier *mn,
495					struct mm_struct *mm,
496					unsigned long start, unsigned long end)
497{
498	struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
499	struct ocxl_link *link = pe_data->link;
500	unsigned long addr, pid, page_size = PAGE_SIZE;
501
502	pid = mm->context.id;
503	trace_ocxl_mmu_notifier_range(start, end, pid);
504
505	spin_lock(&link->atsd_lock);
506	for (addr = start; addr < end; addr += page_size)
507		pnv_ocxl_tlb_invalidate(link->arva, pid, addr, page_size);
508	spin_unlock(&link->atsd_lock);
509}
510
/* Notifier ops installed on each pe_data; only secondary TLB invalidation is hooked. */
511static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
512	.arch_invalidate_secondary_tlbs = arch_invalidate_secondary_tlbs,
513};
514
515static u64 calculate_cfg_state(bool kernel)
516{
517	u64 state;
518
519	state = SPA_CFG_DR;
520	if (mfspr(SPRN_LPCR) & LPCR_TC)
521		state |= SPA_CFG_TC;
522	if (radix_enabled())
523		state |= SPA_CFG_XLAT_ror;
524	else
525		state |= SPA_CFG_XLAT_hpt;
526	state |= SPA_CFG_HV;
527	if (kernel) {
528		if (mfmsr() & MSR_SF)
529			state |= SPA_CFG_SF;
530	} else {
531		state |= SPA_CFG_PR;
532		if (!test_tsk_thread_flag(current, TIF_32BIT))
533			state |= SPA_CFG_SF;
534	}
535	return state;
536}
537
538int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
539		u64 amr, u16 bdf, struct mm_struct *mm,
540		void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr),
541		void *xsl_err_data)
542{
543	struct ocxl_link *link = link_handle;
544	struct spa *spa = link->spa;
545	struct ocxl_process_element *pe;
546	int pe_handle, rc = 0;
547	struct pe_data *pe_data;
548
549	BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128);
550	if (pasid > SPA_PASID_MAX)
551		return -EINVAL;
552
553	mutex_lock(&spa->spa_lock);
554	pe_handle = pasid & SPA_PE_MASK;
555	pe = spa->spa_mem + pe_handle;
556
557	if (pe->software_state) {
558		rc = -EBUSY;
559		goto unlock;
560	}
561
562	pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
563	if (!pe_data) {
564		rc = -ENOMEM;
565		goto unlock;
566	}
567
568	pe_data->mm = mm;
569	pe_data->xsl_err_cb = xsl_err_cb;
570	pe_data->xsl_err_data = xsl_err_data;
571	pe_data->link = link;
572	pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;
573
574	memset(pe, 0, sizeof(struct ocxl_process_element));
575	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
576	pe->pasid = cpu_to_be32(pasid << (31 - 19));
577	pe->bdf = cpu_to_be16(bdf);
578	pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
579	pe->pid = cpu_to_be32(pidr);
580	pe->tid = cpu_to_be32(tidr);
581	pe->amr = cpu_to_be64(amr);
582	pe->software_state = cpu_to_be32(SPA_PE_VALID);
583
584	/*
585	 * For user contexts, register a copro so that TLBIs are seen
586	 * by the nest MMU. If we have a kernel context, TLBIs are
587	 * already global.
588	 */
589	if (mm) {
590		mm_context_add_copro(mm);
591		if (link->arva) {
592			/* Use MMIO registers for the TLB Invalidate
593			 * operations.
594			 */
595			trace_ocxl_init_mmu_notifier(pasid, mm->context.id);
596			mmu_notifier_register(&pe_data->mmu_notifier, mm);
597		}
598	}
599
600	/*
601	 * Barrier is to make sure PE is visible in the SPA before it
602	 * is used by the device. It also helps with the global TLBI
603	 * invalidation
604	 */
605	mb();
606	radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);
607
608	/*
609	 * The mm must stay valid for as long as the device uses it. We
610	 * lower the count when the context is removed from the SPA.
611	 *
612	 * We grab mm_count (and not mm_users), as we don't want to
613	 * end up in a circular dependency if a process mmaps its
614	 * mmio, therefore incrementing the file ref count when
615	 * calling mmap(), and forgets to unmap before exiting. In
616	 * that scenario, when the kernel handles the death of the
617	 * process, the file is not cleaned because unmap was not
618	 * called, and the mm wouldn't be freed because we would still
619	 * have a reference on mm_users. Incrementing mm_count solves
620	 * the problem.
621	 */
622	if (mm)
623		mmgrab(mm);
624	trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
625unlock:
626	mutex_unlock(&spa->spa_lock);
627	return rc;
628}
629EXPORT_SYMBOL_GPL(ocxl_link_add_pe);
630
631int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid)
632{
633	struct ocxl_link *link = link_handle;
634	struct spa *spa = link->spa;
635	struct ocxl_process_element *pe;
636	int pe_handle, rc;
637
638	if (pasid > SPA_PASID_MAX)
639		return -EINVAL;
640
641	pe_handle = pasid & SPA_PE_MASK;
642	pe = spa->spa_mem + pe_handle;
643
644	mutex_lock(&spa->spa_lock);
645
646	pe->tid = cpu_to_be32(tid);
647
648	/*
649	 * The barrier makes sure the PE is updated
650	 * before we clear the NPU context cache below, so that the
651	 * old PE cannot be reloaded erroneously.
652	 */
653	mb();
654
655	/*
656	 * hook to platform code
657	 * On powerpc, the entry needs to be cleared from the context
658	 * cache of the NPU.
659	 */
660	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
661	WARN_ON(rc);
662
663	mutex_unlock(&spa->spa_lock);
664	return rc;
665}
666
667int ocxl_link_remove_pe(void *link_handle, int pasid)
668{
669	struct ocxl_link *link = link_handle;
670	struct spa *spa = link->spa;
671	struct ocxl_process_element *pe;
672	struct pe_data *pe_data;
673	int pe_handle, rc;
674
675	if (pasid > SPA_PASID_MAX)
676		return -EINVAL;
677
678	/*
679	 * About synchronization with our memory fault handler:
680	 *
681	 * Before removing the PE, the driver is supposed to have
682	 * notified the AFU, which should have cleaned up and make
683	 * sure the PASID is no longer in use, including pending
684	 * interrupts. However, there's no way to be sure...
685	 *
686	 * We clear the PE and remove the context from our radix
687	 * tree. From that point on, any new interrupt for that
688	 * context will fail silently, which is ok. As mentioned
689	 * above, that's not expected, but it could happen if the
690	 * driver or AFU didn't do the right thing.
691	 *
692	 * There could still be a bottom half running, but we don't
693	 * need to wait/flush, as it is managing a reference count on
694	 * the mm it reads from the radix tree.
695	 */
696	pe_handle = pasid & SPA_PE_MASK;
697	pe = spa->spa_mem + pe_handle;
698
699	mutex_lock(&spa->spa_lock);
700
701	if (!(be32_to_cpu(pe->software_state) & SPA_PE_VALID)) {
702		rc = -EINVAL;
703		goto unlock;
704	}
705
706	trace_ocxl_context_remove(current->pid, spa->spa_mem, pasid,
707				be32_to_cpu(pe->pid), be32_to_cpu(pe->tid));
708
709	memset(pe, 0, sizeof(struct ocxl_process_element));
710	/*
711	 * The barrier makes sure the PE is removed from the SPA
712	 * before we clear the NPU context cache below, so that the
713	 * old PE cannot be reloaded erroneously.
714	 */
715	mb();
716
717	/*
718	 * hook to platform code
719	 * On powerpc, the entry needs to be cleared from the context
720	 * cache of the NPU.
721	 */
722	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
723	WARN_ON(rc);
724
725	pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
726	if (!pe_data) {
727		WARN(1, "Couldn't find pe data when removing PE\n");
728	} else {
729		if (pe_data->mm) {
730			if (link->arva) {
731				trace_ocxl_release_mmu_notifier(pasid,
732								pe_data->mm->context.id);
733				mmu_notifier_unregister(&pe_data->mmu_notifier,
734							pe_data->mm);
735				spin_lock(&link->atsd_lock);
736				pnv_ocxl_tlb_invalidate(link->arva,
737							pe_data->mm->context.id,
738							0ull,
739							PAGE_SIZE);
740				spin_unlock(&link->atsd_lock);
741			}
742			mm_context_remove_copro(pe_data->mm);
743			mmdrop(pe_data->mm);
744		}
745		kfree_rcu(pe_data, rcu);
746	}
747unlock:
748	mutex_unlock(&spa->spa_lock);
749	return rc;
750}
751EXPORT_SYMBOL_GPL(ocxl_link_remove_pe);
752
753int ocxl_link_irq_alloc(void *link_handle, int *hw_irq)
754{
755	struct ocxl_link *link = link_handle;
756	int irq;
 
757
758	if (atomic_dec_if_positive(&link->irq_available) < 0)
759		return -ENOSPC;
760
761	irq = xive_native_alloc_irq();
762	if (!irq) {
763		atomic_inc(&link->irq_available);
764		return -ENXIO;
765	}
766
767	*hw_irq = irq;
 
768	return 0;
769}
770EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc);
771
772void ocxl_link_free_irq(void *link_handle, int hw_irq)
773{
774	struct ocxl_link *link = link_handle;
775
776	xive_native_free_irq(hw_irq);
777	atomic_inc(&link->irq_available);
778}
779EXPORT_SYMBOL_GPL(ocxl_link_free_irq);

  1// SPDX-License-Identifier: GPL-2.0+
  2// Copyright 2017 IBM Corp.
  3#include <linux/sched/mm.h>
  4#include <linux/mutex.h>
 
  5#include <linux/mm_types.h>
  6#include <linux/mmu_context.h>
 
 
  7#include <asm/copro.h>
  8#include <asm/pnv-ocxl.h>
 
  9#include <misc/ocxl.h>
 10#include "ocxl_internal.h"
 11#include "trace.h"
 12
 13
 14#define SPA_PASID_BITS		15
 15#define SPA_PASID_MAX		((1 << SPA_PASID_BITS) - 1)
 16#define SPA_PE_MASK		SPA_PASID_MAX
 17#define SPA_SPA_SIZE_LOG	22 /* Each SPA is 4 Mb */
 18
 19#define SPA_CFG_SF		(1ull << (63-0))
 20#define SPA_CFG_TA		(1ull << (63-1))
 21#define SPA_CFG_HV		(1ull << (63-3))
 22#define SPA_CFG_UV		(1ull << (63-4))
 23#define SPA_CFG_XLAT_hpt	(0ull << (63-6)) /* Hashed page table (HPT) mode */
 24#define SPA_CFG_XLAT_roh	(2ull << (63-6)) /* Radix on HPT mode */
 25#define SPA_CFG_XLAT_ror	(3ull << (63-6)) /* Radix on Radix mode */
 26#define SPA_CFG_PR		(1ull << (63-49))
 27#define SPA_CFG_TC		(1ull << (63-54))
 28#define SPA_CFG_DR		(1ull << (63-59))
 29
 30#define SPA_XSL_TF		(1ull << (63-3))  /* Translation fault */
 31#define SPA_XSL_S		(1ull << (63-38)) /* Store operation */
 32
 33#define SPA_PE_VALID		0x80000000
 34
 
 35
 36struct pe_data {
	/* Per-process-element bookkeeping, stored in spa->pe_tree. */
	/* Memory context of the PE; NULL is treated as a kernel context by the fault handler */
 37	struct mm_struct *mm;
 38	/* callback to trigger when a translation fault occurs */
 39	void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);
 40	/* opaque pointer to be passed to the above callback */
 41	void *xsl_err_data;
	/* NOTE(review): presumably used to free the entry after an RCU grace period - confirm */
 42	struct rcu_head rcu;
 
 
 43};
 44
 45struct spa {
	/* Shared Process Area state of one link. */
	/* array of process elements, allocated as 2^spa_order pages */
 46	struct ocxl_process_element *spa_mem;
 47	int spa_order;
 48	struct mutex spa_lock;
 49	struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */
	/* name and virq of the translation fault interrupt */
 50	char *irq_name;
 51	int virq;
	/* XSL fault registers, mapped by pnv_ocxl_map_xsl_regs() */
 52	void __iomem *reg_dsisr;
 53	void __iomem *reg_dar;
 54	void __iomem *reg_tfc;
 55	void __iomem *reg_pe_handle;
 56	/*
 57	 * The following fields are used by the memory fault
 58	 * interrupt handler. We can only have one interrupt at a
 59	 * time. The NPU won't raise another interrupt until the
 60	 * previous one has been ack'd by writing to the TFC register
 61	 */
 62	struct xsl_fault {
 63		struct work_struct fault_work;
 64		u64 pe;
 65		u64 dsisr;
 66		u64 dar;
		/* copy of the faulting PE's data, taken in the interrupt handler */
 67		struct pe_data pe_data;
 68	} xsl_fault;
 69};
 70
 71/*
 72 * An opencapi link can be used by several PCI functions. We have
 73 * one link per device slot.
 74 *
 75 * A linked list of opencapi links should suffice, as there's a
 76 * limited number of opencapi slots on a system and lookup is only
 77 * done when the device is probed
 78 */
 79struct ocxl_link {
	/* entry in links_list */
 80	struct list_head list;
 81	struct kref ref;
	/* PCI domain / bus number / slot identifying the link */
 82	int domain;
 83	int bus;
 84	int dev;
 
 
	/* remaining interrupts that may be allocated, starts at MAX_IRQ_PER_LINK */
 85	atomic_t irq_available;
 86	struct spa *spa;
	/* opaque cookie filled in by pnv_ocxl_spa_setup() */
 87	void *platform_data;
 88};
 89static struct list_head links_list = LIST_HEAD_INIT(links_list);
 90static DEFINE_MUTEX(links_list_lock);
 91
 92enum xsl_response {
	/* Answers that can be requested from ack_irq(). */
	/* not supported: ack_irq() warns and writes nothing */
 93	CONTINUE,
	/* acknowledged by writing PPC_BIT(30) to the TFC register */
 94	ADDRESS_ERROR,
	/* acknowledged by writing PPC_BIT(31) to the TFC register */
 95	RESTART,
 96};
 97
 98
 99static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe)
100{
101	u64 reg;
102
103	*dsisr = in_be64(spa->reg_dsisr);
104	*dar = in_be64(spa->reg_dar);
105	reg = in_be64(spa->reg_pe_handle);
106	*pe = reg & SPA_PE_MASK;
107}
108
109static void ack_irq(struct spa *spa, enum xsl_response r)
110{
111	u64 reg = 0;
112
113	/* continue is not supported */
114	if (r == RESTART)
115		reg = PPC_BIT(31);
116	else if (r == ADDRESS_ERROR)
117		reg = PPC_BIT(30);
118	else
119		WARN(1, "Invalid irq response %d\n", r);
120
121	if (reg) {
122		trace_ocxl_fault_ack(spa->spa_mem, spa->xsl_fault.pe,
123				spa->xsl_fault.dsisr, spa->xsl_fault.dar, reg);
124		out_be64(spa->reg_tfc, reg);
125	}
126}
127
128static void xsl_fault_handler_bh(struct work_struct *fault_work)
129{
130	vm_fault_t flt = 0;
131	unsigned long access, flags, inv_flags = 0;
132	enum xsl_response r;
133	struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
134					fault_work);
135	struct spa *spa = container_of(fault, struct spa, xsl_fault);
136
137	int rc;
138
139	/*
140	 * We must release a reference on mm_users whenever exiting this
141	 * function (taken in the memory fault interrupt handler)
142	 */
143	rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
144				&flt);
145	if (rc) {
146		pr_debug("copro_handle_mm_fault failed: %d\n", rc);
147		if (fault->pe_data.xsl_err_cb) {
148			fault->pe_data.xsl_err_cb(
149				fault->pe_data.xsl_err_data,
150				fault->dar, fault->dsisr);
151		}
152		r = ADDRESS_ERROR;
153		goto ack;
154	}
155
156	if (!radix_enabled()) {
157		/*
158		 * update_mmu_cache() will not have loaded the hash
159		 * since current->trap is not a 0x400 or 0x300, so
160		 * just call hash_page_mm() here.
161		 */
162		access = _PAGE_PRESENT | _PAGE_READ;
163		if (fault->dsisr & SPA_XSL_S)
164			access |= _PAGE_WRITE;
165
166		if (get_region_id(fault->dar) != USER_REGION_ID)
167			access |= _PAGE_PRIVILEGED;
168
169		local_irq_save(flags);
170		hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300,
171			inv_flags);
172		local_irq_restore(flags);
173	}
174	r = RESTART;
175ack:
176	mmput(fault->pe_data.mm);
177	ack_irq(spa, r);
178}
179
180static irqreturn_t xsl_fault_handler(int irq, void *data)
181{
182	struct ocxl_link *link = (struct ocxl_link *) data;
183	struct spa *spa = link->spa;
184	u64 dsisr, dar, pe_handle;
185	struct pe_data *pe_data;
186	struct ocxl_process_element *pe;
187	int pid;
188	bool schedule = false;
189
190	read_irq(spa, &dsisr, &dar, &pe_handle);
191	trace_ocxl_fault(spa->spa_mem, pe_handle, dsisr, dar, -1);
192
193	WARN_ON(pe_handle > SPA_PE_MASK);
194	pe = spa->spa_mem + pe_handle;
195	pid = be32_to_cpu(pe->pid);
196	/* We could be reading all null values here if the PE is being
197	 * removed while an interrupt kicks in. It's not supposed to
198	 * happen if the driver notified the AFU to terminate the
199	 * PASID, and the AFU waited for pending operations before
200	 * acknowledging. But even if it happens, we won't find a
201	 * memory context below and fail silently, so it should be ok.
202	 */
203	if (!(dsisr & SPA_XSL_TF)) {
204		WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
205		ack_irq(spa, ADDRESS_ERROR);
206		return IRQ_HANDLED;
207	}
208
209	rcu_read_lock();
210	pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle);
211	if (!pe_data) {
212		/*
213		 * Could only happen if the driver didn't notify the
214		 * AFU about PASID termination before removing the PE,
215		 * or the AFU didn't wait for all memory access to
216		 * have completed.
217		 *
218		 * Either way, we fail early, but we shouldn't log an
219		 * error message, as it is a valid (if unexpected)
220		 * scenario
221		 */
222		rcu_read_unlock();
223		pr_debug("Unknown mm context for xsl interrupt\n");
224		ack_irq(spa, ADDRESS_ERROR);
225		return IRQ_HANDLED;
226	}
227
228	if (!pe_data->mm) {
229		/*
230		 * translation fault from a kernel context - an OpenCAPI
231		 * device tried to access a bad kernel address
232		 */
233		rcu_read_unlock();
234		pr_warn("Unresolved OpenCAPI xsl fault in kernel context\n");
235		ack_irq(spa, ADDRESS_ERROR);
236		return IRQ_HANDLED;
237	}
238	WARN_ON(pe_data->mm->context.id != pid);
239
240	if (mmget_not_zero(pe_data->mm)) {
241			spa->xsl_fault.pe = pe_handle;
242			spa->xsl_fault.dar = dar;
243			spa->xsl_fault.dsisr = dsisr;
244			spa->xsl_fault.pe_data = *pe_data;
245			schedule = true;
246			/* mm_users count released by bottom half */
247	}
248	rcu_read_unlock();
249	if (schedule)
250		schedule_work(&spa->xsl_fault.fault_work);
251	else
252		ack_irq(spa, ADDRESS_ERROR);
253	return IRQ_HANDLED;
254}
255
256static void unmap_irq_registers(struct spa *spa)
257{
258	pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr, spa->reg_dar, spa->reg_tfc,
259				spa->reg_pe_handle);
260}
261
262static int map_irq_registers(struct pci_dev *dev, struct spa *spa)
263{
264	return pnv_ocxl_map_xsl_regs(dev, &spa->reg_dsisr, &spa->reg_dar,
265				&spa->reg_tfc, &spa->reg_pe_handle);
266}
267
268static int setup_xsl_irq(struct pci_dev *dev, struct ocxl_link *link)
269{
270	struct spa *spa = link->spa;
271	int rc;
272	int hwirq;
273
274	rc = pnv_ocxl_get_xsl_irq(dev, &hwirq);
275	if (rc)
276		return rc;
277
278	rc = map_irq_registers(dev, spa);
279	if (rc)
280		return rc;
281
282	spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
283				link->domain, link->bus, link->dev);
284	if (!spa->irq_name) {
285		dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
286		rc = -ENOMEM;
287		goto err_xsl;
288	}
289	/*
290	 * At some point, we'll need to look into allowing a higher
291	 * number of interrupts. Could we have an IRQ domain per link?
292	 */
293	spa->virq = irq_create_mapping(NULL, hwirq);
294	if (!spa->virq) {
295		dev_err(&dev->dev,
296			"irq_create_mapping failed for translation interrupt\n");
297		rc = -EINVAL;
298		goto err_name;
299	}
300
301	dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);
302
303	rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
304			link);
305	if (rc) {
306		dev_err(&dev->dev,
307			"request_irq failed for translation interrupt: %d\n",
308			rc);
309		rc = -EINVAL;
310		goto err_mapping;
311	}
312	return 0;
313
314err_mapping:
315	irq_dispose_mapping(spa->virq);
316err_name:
317	kfree(spa->irq_name);
318err_xsl:
319	unmap_irq_registers(spa);
320	return rc;
321}
322
323static void release_xsl_irq(struct ocxl_link *link)
324{
325	struct spa *spa = link->spa;
326
327	if (spa->virq) {
328		free_irq(spa->virq, link);
329		irq_dispose_mapping(spa->virq);
330	}
331	kfree(spa->irq_name);
332	unmap_irq_registers(spa);
333}
334
335static int alloc_spa(struct pci_dev *dev, struct ocxl_link *link)
336{
337	struct spa *spa;
338
339	spa = kzalloc(sizeof(struct spa), GFP_KERNEL);
340	if (!spa)
341		return -ENOMEM;
342
343	mutex_init(&spa->spa_lock);
344	INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL);
345	INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh);
346
347	spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT;
348	spa->spa_mem = (struct ocxl_process_element *)
349		__get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order);
350	if (!spa->spa_mem) {
351		dev_err(&dev->dev, "Can't allocate Shared Process Area\n");
352		kfree(spa);
353		return -ENOMEM;
354	}
355	pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus,
356		link->dev, spa->spa_mem);
357
358	link->spa = spa;
359	return 0;
360}
361
362static void free_spa(struct ocxl_link *link)
363{
364	struct spa *spa = link->spa;
365
366	pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus,
367		link->dev);
368
369	if (spa && spa->spa_mem) {
370		free_pages((unsigned long) spa->spa_mem, spa->spa_order);
371		kfree(spa);
372		link->spa = NULL;
373	}
374}
375
376static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
377{
378	struct ocxl_link *link;
379	int rc;
380
381	link = kzalloc(sizeof(struct ocxl_link), GFP_KERNEL);
382	if (!link)
383		return -ENOMEM;
384
385	kref_init(&link->ref);
386	link->domain = pci_domain_nr(dev->bus);
387	link->bus = dev->bus->number;
388	link->dev = PCI_SLOT(dev->devfn);
389	atomic_set(&link->irq_available, MAX_IRQ_PER_LINK);
 
390
391	rc = alloc_spa(dev, link);
392	if (rc)
393		goto err_free;
394
395	rc = setup_xsl_irq(dev, link);
396	if (rc)
397		goto err_spa;
398
399	/* platform specific hook */
400	rc = pnv_ocxl_spa_setup(dev, link->spa->spa_mem, PE_mask,
401				&link->platform_data);
402	if (rc)
403		goto err_xsl_irq;
404
 
 
 
 
 
 
 
405	*out_link = link;
406	return 0;
407
408err_xsl_irq:
409	release_xsl_irq(link);
410err_spa:
411	free_spa(link);
412err_free:
413	kfree(link);
414	return rc;
415}
416
/*
 * Tear down a link built by alloc_link(): release the translation fault
 * interrupt first, then the SPA, then the link structure itself.
 */
static void free_link(struct ocxl_link *link)
{
	/* The irq teardown dereferences link->spa, so it must precede free_spa() */
	release_xsl_irq(link);
	free_spa(link);

	kfree(link);
}
423
424int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle)
425{
426	int rc = 0;
427	struct ocxl_link *link;
428
429	mutex_lock(&links_list_lock);
430	list_for_each_entry(link, &links_list, list) {
431		/* The functions of a device all share the same link */
432		if (link->domain == pci_domain_nr(dev->bus) &&
433			link->bus == dev->bus->number &&
434			link->dev == PCI_SLOT(dev->devfn)) {
435			kref_get(&link->ref);
436			*link_handle = link;
437			goto unlock;
438		}
439	}
440	rc = alloc_link(dev, PE_mask, &link);
441	if (rc)
442		goto unlock;
443
444	list_add(&link->list, &links_list);
445	*link_handle = link;
446unlock:
447	mutex_unlock(&links_list_lock);
448	return rc;
449}
450EXPORT_SYMBOL_GPL(ocxl_link_setup);
451
452static void release_xsl(struct kref *ref)
453{
454	struct ocxl_link *link = container_of(ref, struct ocxl_link, ref);
455
 
 
 
 
 
456	list_del(&link->list);
457	/* call platform code before releasing data */
458	pnv_ocxl_spa_release(link->platform_data);
459	free_link(link);
460}
461
462void ocxl_link_release(struct pci_dev *dev, void *link_handle)
463{
464	struct ocxl_link *link = (struct ocxl_link *) link_handle;
465
466	mutex_lock(&links_list_lock);
467	kref_put(&link->ref, release_xsl);
468	mutex_unlock(&links_list_lock);
469}
470EXPORT_SYMBOL_GPL(ocxl_link_release);
471
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472static u64 calculate_cfg_state(bool kernel)
473{
474	u64 state;
475
476	state = SPA_CFG_DR;
477	if (mfspr(SPRN_LPCR) & LPCR_TC)
478		state |= SPA_CFG_TC;
479	if (radix_enabled())
480		state |= SPA_CFG_XLAT_ror;
481	else
482		state |= SPA_CFG_XLAT_hpt;
483	state |= SPA_CFG_HV;
484	if (kernel) {
485		if (mfmsr() & MSR_SF)
486			state |= SPA_CFG_SF;
487	} else {
488		state |= SPA_CFG_PR;
489		if (!test_tsk_thread_flag(current, TIF_32BIT))
490			state |= SPA_CFG_SF;
491	}
492	return state;
493}
494
495int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
496		u64 amr, struct mm_struct *mm,
497		void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr),
498		void *xsl_err_data)
499{
500	struct ocxl_link *link = (struct ocxl_link *) link_handle;
501	struct spa *spa = link->spa;
502	struct ocxl_process_element *pe;
503	int pe_handle, rc = 0;
504	struct pe_data *pe_data;
505
506	BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128);
507	if (pasid > SPA_PASID_MAX)
508		return -EINVAL;
509
510	mutex_lock(&spa->spa_lock);
511	pe_handle = pasid & SPA_PE_MASK;
512	pe = spa->spa_mem + pe_handle;
513
514	if (pe->software_state) {
515		rc = -EBUSY;
516		goto unlock;
517	}
518
519	pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
520	if (!pe_data) {
521		rc = -ENOMEM;
522		goto unlock;
523	}
524
525	pe_data->mm = mm;
526	pe_data->xsl_err_cb = xsl_err_cb;
527	pe_data->xsl_err_data = xsl_err_data;
 
 
528
529	memset(pe, 0, sizeof(struct ocxl_process_element));
530	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
 
 
531	pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
532	pe->pid = cpu_to_be32(pidr);
533	pe->tid = cpu_to_be32(tidr);
534	pe->amr = cpu_to_be64(amr);
535	pe->software_state = cpu_to_be32(SPA_PE_VALID);
536
537	/*
538	 * For user contexts, register a copro so that TLBIs are seen
539	 * by the nest MMU. If we have a kernel context, TLBIs are
540	 * already global.
541	 */
542	if (mm)
543		mm_context_add_copro(mm);
 
 
 
 
 
 
 
 
 
544	/*
545	 * Barrier is to make sure PE is visible in the SPA before it
546	 * is used by the device. It also helps with the global TLBI
547	 * invalidation
548	 */
549	mb();
550	radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);
551
552	/*
553	 * The mm must stay valid for as long as the device uses it. We
554	 * lower the count when the context is removed from the SPA.
555	 *
556	 * We grab mm_count (and not mm_users), as we don't want to
557	 * end up in a circular dependency if a process mmaps its
558	 * mmio, therefore incrementing the file ref count when
559	 * calling mmap(), and forgets to unmap before exiting. In
560	 * that scenario, when the kernel handles the death of the
561	 * process, the file is not cleaned because unmap was not
562	 * called, and the mm wouldn't be freed because we would still
563	 * have a reference on mm_users. Incrementing mm_count solves
564	 * the problem.
565	 */
566	if (mm)
567		mmgrab(mm);
568	trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
569unlock:
570	mutex_unlock(&spa->spa_lock);
571	return rc;
572}
573EXPORT_SYMBOL_GPL(ocxl_link_add_pe);
574
575int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid)
576{
577	struct ocxl_link *link = (struct ocxl_link *) link_handle;
578	struct spa *spa = link->spa;
579	struct ocxl_process_element *pe;
580	int pe_handle, rc;
581
582	if (pasid > SPA_PASID_MAX)
583		return -EINVAL;
584
585	pe_handle = pasid & SPA_PE_MASK;
586	pe = spa->spa_mem + pe_handle;
587
588	mutex_lock(&spa->spa_lock);
589
590	pe->tid = cpu_to_be32(tid);
591
592	/*
593	 * The barrier makes sure the PE is updated
594	 * before we clear the NPU context cache below, so that the
595	 * old PE cannot be reloaded erroneously.
596	 */
597	mb();
598
599	/*
600	 * hook to platform code
601	 * On powerpc, the entry needs to be cleared from the context
602	 * cache of the NPU.
603	 */
604	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
605	WARN_ON(rc);
606
607	mutex_unlock(&spa->spa_lock);
608	return rc;
609}
610
611int ocxl_link_remove_pe(void *link_handle, int pasid)
612{
613	struct ocxl_link *link = (struct ocxl_link *) link_handle;
614	struct spa *spa = link->spa;
615	struct ocxl_process_element *pe;
616	struct pe_data *pe_data;
617	int pe_handle, rc;
618
619	if (pasid > SPA_PASID_MAX)
620		return -EINVAL;
621
622	/*
623	 * About synchronization with our memory fault handler:
624	 *
625	 * Before removing the PE, the driver is supposed to have
626	 * notified the AFU, which should have cleaned up and make
627	 * sure the PASID is no longer in use, including pending
628	 * interrupts. However, there's no way to be sure...
629	 *
630	 * We clear the PE and remove the context from our radix
631	 * tree. From that point on, any new interrupt for that
632	 * context will fail silently, which is ok. As mentioned
633	 * above, that's not expected, but it could happen if the
634	 * driver or AFU didn't do the right thing.
635	 *
636	 * There could still be a bottom half running, but we don't
637	 * need to wait/flush, as it is managing a reference count on
638	 * the mm it reads from the radix tree.
639	 */
640	pe_handle = pasid & SPA_PE_MASK;
641	pe = spa->spa_mem + pe_handle;
642
643	mutex_lock(&spa->spa_lock);
644
645	if (!(be32_to_cpu(pe->software_state) & SPA_PE_VALID)) {
646		rc = -EINVAL;
647		goto unlock;
648	}
649
650	trace_ocxl_context_remove(current->pid, spa->spa_mem, pasid,
651				be32_to_cpu(pe->pid), be32_to_cpu(pe->tid));
652
653	memset(pe, 0, sizeof(struct ocxl_process_element));
654	/*
655	 * The barrier makes sure the PE is removed from the SPA
656	 * before we clear the NPU context cache below, so that the
657	 * old PE cannot be reloaded erroneously.
658	 */
659	mb();
660
661	/*
662	 * hook to platform code
663	 * On powerpc, the entry needs to be cleared from the context
664	 * cache of the NPU.
665	 */
666	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
667	WARN_ON(rc);
668
669	pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
670	if (!pe_data) {
671		WARN(1, "Couldn't find pe data when removing PE\n");
672	} else {
673		if (pe_data->mm) {
 
 
 
 
 
 
 
 
 
 
 
 
674			mm_context_remove_copro(pe_data->mm);
675			mmdrop(pe_data->mm);
676		}
677		kfree_rcu(pe_data, rcu);
678	}
679unlock:
680	mutex_unlock(&spa->spa_lock);
681	return rc;
682}
683EXPORT_SYMBOL_GPL(ocxl_link_remove_pe);
684
685int ocxl_link_irq_alloc(void *link_handle, int *hw_irq, u64 *trigger_addr)
686{
687	struct ocxl_link *link = (struct ocxl_link *) link_handle;
688	int rc, irq;
689	u64 addr;
690
691	if (atomic_dec_if_positive(&link->irq_available) < 0)
692		return -ENOSPC;
693
694	rc = pnv_ocxl_alloc_xive_irq(&irq, &addr);
695	if (rc) {
696		atomic_inc(&link->irq_available);
697		return rc;
698	}
699
700	*hw_irq = irq;
701	*trigger_addr = addr;
702	return 0;
703}
704EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc);
705
706void ocxl_link_free_irq(void *link_handle, int hw_irq)
707{
708	struct ocxl_link *link = (struct ocxl_link *) link_handle;
709
710	pnv_ocxl_free_xive_irq(hw_irq);
711	atomic_inc(&link->irq_available);
712}
713EXPORT_SYMBOL_GPL(ocxl_link_free_irq);