   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   4 * Author: Joerg Roedel <jroedel@suse.de>
   5 *         Leo Duran <leo.duran@amd.com>
   6 */
   7
   8#define pr_fmt(fmt)     "AMD-Vi: " fmt
   9#define dev_fmt(fmt)    pr_fmt(fmt)
  10
  11#include <linux/ratelimit.h>
  12#include <linux/pci.h>
  13#include <linux/acpi.h>
  14#include <linux/pci-ats.h>
  15#include <linux/bitmap.h>
  16#include <linux/slab.h>
  17#include <linux/debugfs.h>
  18#include <linux/scatterlist.h>
  19#include <linux/dma-map-ops.h>
  20#include <linux/dma-direct.h>
  21#include <linux/idr.h>
  22#include <linux/iommu-helper.h>
  23#include <linux/delay.h>
  24#include <linux/amd-iommu.h>
  25#include <linux/notifier.h>
  26#include <linux/export.h>
  27#include <linux/irq.h>
  28#include <linux/msi.h>
  29#include <linux/irqdomain.h>
  30#include <linux/percpu.h>
  31#include <linux/io-pgtable.h>
  32#include <linux/cc_platform.h>
  33#include <asm/irq_remapping.h>
  34#include <asm/io_apic.h>
  35#include <asm/apic.h>
  36#include <asm/hw_irq.h>
  37#include <asm/proto.h>
  38#include <asm/iommu.h>
  39#include <asm/gart.h>
  40#include <asm/dma.h>
  41#include <uapi/linux/iommufd.h>
  42
  43#include "amd_iommu.h"
  44#include "../dma-iommu.h"
  45#include "../irq_remapping.h"
  46#include "../iommu-pages.h"
  47
  48#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
  49
  50/* Reserved IOVA ranges */
  51#define MSI_RANGE_START		(0xfee00000)
  52#define MSI_RANGE_END		(0xfeefffff)
  53#define HT_RANGE_START		(0xfd00000000ULL)
  54#define HT_RANGE_END		(0xffffffffffULL)
  55
  56LIST_HEAD(ioapic_map);
  57LIST_HEAD(hpet_map);
  58LIST_HEAD(acpihid_map);
  59
  60const struct iommu_ops amd_iommu_ops;
  61static const struct iommu_dirty_ops amd_dirty_ops;
  62
  63int amd_iommu_max_glx_val = -1;
  64
  65/*
  66 * general struct to manage commands sent to an IOMMU
  67 */
  68struct iommu_cmd {
  69	u32 data[4];
  70};
  71
  72/*
  73 * AMD IOMMU allows up to 2^16 different protection domains. This IDA
  74 * keeps track of which IDs are already in use.
  75 */
  76DEFINE_IDA(pdom_ids);
  77
  78struct kmem_cache *amd_iommu_irq_cache;
  79
  80static int amd_iommu_attach_device(struct iommu_domain *dom,
  81				   struct device *dev);
  82
  83static void set_dte_entry(struct amd_iommu *iommu,
  84			  struct iommu_dev_data *dev_data);
  85
  86/****************************************************************************
  87 *
  88 * Helper functions
  89 *
  90 ****************************************************************************/
  91
  92static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
  93{
  94	return (pdom && (pdom->pd_mode == PD_MODE_V2));
  95}
  96
  97static inline bool pdom_is_in_pt_mode(struct protection_domain *pdom)
  98{
  99	return (pdom->domain.type == IOMMU_DOMAIN_IDENTITY);
 100}
 101
 102/*
 103 * We cannot support PASID with an existing v1 page table in the same
 104 * domain, since it will be nested. However, an existing domain with a v2
 105 * page table or in passthrough mode can be used for PASID.
 106 */
 107static inline bool pdom_is_sva_capable(struct protection_domain *pdom)
 108{
 109	return pdom_is_v2_pgtbl_mode(pdom) || pdom_is_in_pt_mode(pdom);
 110}
 111
 112static inline int get_acpihid_device_id(struct device *dev,
 113					struct acpihid_map_entry **entry)
 114{
 115	struct acpi_device *adev = ACPI_COMPANION(dev);
 116	struct acpihid_map_entry *p;
 117
 118	if (!adev)
 119		return -ENODEV;
 120
 121	list_for_each_entry(p, &acpihid_map, list) {
 122		if (acpi_dev_hid_uid_match(adev, p->hid,
 123					   p->uid[0] ? p->uid : NULL)) {
 124			if (entry)
 125				*entry = p;
 126			return p->devid;
 127		}
 128	}
 129	return -EINVAL;
 130}
 131
 132static inline int get_device_sbdf_id(struct device *dev)
 133{
 134	int sbdf;
 135
 136	if (dev_is_pci(dev))
 137		sbdf = get_pci_sbdf_id(to_pci_dev(dev));
 138	else
 139		sbdf = get_acpihid_device_id(dev, NULL);
 140
 141	return sbdf;
 142}
 143
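/*
 * Return the device table of the PCI segment this IOMMU belongs to. Both the
 * segment and its device table are set up during early init, so they are
 * expected to be present here.
 */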
 144struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
 145{
 146	struct dev_table_entry *dev_table;
 147	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 148
 149	BUG_ON(pci_seg == NULL);
 150	dev_table = pci_seg->dev_table;
 151	BUG_ON(dev_table == NULL);
 152
 153	return dev_table;
 154}
 155
 156static inline u16 get_device_segment(struct device *dev)
 157{
 158	u16 seg;
 159
 160	if (dev_is_pci(dev)) {
 161		struct pci_dev *pdev = to_pci_dev(dev);
 162
 163		seg = pci_domain_nr(pdev->bus);
 164	} else {
 165		u32 devid = get_acpihid_device_id(dev, NULL);
 166
 167		seg = PCI_SBDF_TO_SEGID(devid);
 168	}
 169
 170	return seg;
 171}
 172
 173/* Writes the specific IOMMU for a device into the PCI segment rlookup table */
 174void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
 175{
 176	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 177
 178	pci_seg->rlookup_table[devid] = iommu;
 179}
 180
 181static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
 182{
 183	struct amd_iommu_pci_seg *pci_seg;
 184
 185	for_each_pci_segment(pci_seg) {
 186		if (pci_seg->id == seg)
 187			return pci_seg->rlookup_table[devid];
 188	}
 189	return NULL;
 190}
 191
 192static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
 193{
 194	u16 seg = get_device_segment(dev);
 195	int devid = get_device_sbdf_id(dev);
 196
 197	if (devid < 0)
 198		return NULL;
 199	return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
 200}
 201
 202static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
 203{
 204	struct iommu_dev_data *dev_data;
 205	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 206
 207	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
 208	if (!dev_data)
 209		return NULL;
 210
 211	mutex_init(&dev_data->mutex);
 212	dev_data->devid = devid;
 213	ratelimit_default_init(&dev_data->rs);
 214
 215	llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
 216	return dev_data;
 217}
 218
 219static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
 220{
 221	struct iommu_dev_data *dev_data;
 222	struct llist_node *node;
 223	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 224
 225	if (llist_empty(&pci_seg->dev_data_list))
 226		return NULL;
 227
 228	node = pci_seg->dev_data_list.first;
 229	llist_for_each_entry(dev_data, node, dev_data_list) {
 230		if (dev_data->devid == devid)
 231			return dev_data;
 232	}
 233
 234	return NULL;
 235}
 236
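/*
 * Copy a device's DTE to one of its DMA aliases and register this IOMMU for
 * the alias, so requests using the alias request ID get the same translation.
 */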
 237static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
 238{
 239	struct amd_iommu *iommu;
 240	struct dev_table_entry *dev_table;
 241	u16 devid = pci_dev_id(pdev);
 242
 243	if (devid == alias)
 244		return 0;
 245
 246	iommu = rlookup_amd_iommu(&pdev->dev);
 247	if (!iommu)
 248		return 0;
 249
 250	amd_iommu_set_rlookup_table(iommu, alias);
 251	dev_table = get_dev_table(iommu);
 252	memcpy(dev_table[alias].data,
 253	       dev_table[devid].data,
 254	       sizeof(dev_table[alias].data));
 255
 256	return 0;
 257}
 258
 259static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
 260{
 261	struct pci_dev *pdev;
 262
 263	if (!dev_is_pci(dev))
 264		return;
 265	pdev = to_pci_dev(dev);
 266
 267	/*
 268	 * The IVRS alias stored in the alias table may not be
 269	 * part of the PCI DMA aliases if its bus differs
 270	 * from the original device.
 271	 */
 272	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);
 273
 274	pci_for_each_dma_alias(pdev, clone_alias, NULL);
 275}
 276
 277static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
 278{
 279	struct pci_dev *pdev = to_pci_dev(dev);
 280	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 281	u16 ivrs_alias;
 282
 283	/* For ACPI HID devices, there are no aliases */
 284	if (!dev_is_pci(dev))
 285		return;
 286
 287	/*
 288	 * Add the IVRS alias to the pci aliases if it is on the same
 289	 * bus. The IVRS table may know about a quirk that we don't.
 290	 */
 291	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
 292	if (ivrs_alias != pci_dev_id(pdev) &&
 293	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
 294		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
 295
 296	clone_aliases(iommu, dev);
 297}
 298
 299static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
 300{
 301	struct iommu_dev_data *dev_data;
 302
 303	dev_data = search_dev_data(iommu, devid);
 304
 305	if (dev_data == NULL) {
 306		dev_data = alloc_dev_data(iommu, devid);
 307		if (!dev_data)
 308			return NULL;
 309
 310		if (translation_pre_enabled(iommu))
 311			dev_data->defer_attach = true;
 312	}
 313
 314	return dev_data;
 315}
 316
 317/*
 318 * Find or create an IOMMU group for an acpihid device.
 319 */
 320static struct iommu_group *acpihid_device_group(struct device *dev)
 321{
 322	struct acpihid_map_entry *p, *entry = NULL;
 323	int devid;
 324
 325	devid = get_acpihid_device_id(dev, &entry);
 326	if (devid < 0)
 327		return ERR_PTR(devid);
 328
 329	list_for_each_entry(p, &acpihid_map, list) {
 330		if ((devid == p->devid) && p->group)
 331			entry->group = p->group;
 332	}
 333
 334	if (!entry->group)
 335		entry->group = generic_device_group(dev);
 336	else
 337		iommu_group_ref_get(entry->group);
 338
 339	return entry->group;
 340}
 341
 342static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data)
 343{
 344	return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP);
 345}
 346
 347static u32 pdev_get_caps(struct pci_dev *pdev)
 348{
 349	int features;
 350	u32 flags = 0;
 351
 352	if (pci_ats_supported(pdev))
 353		flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
 354
 355	if (pci_pri_supported(pdev))
 356		flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
 357
 358	features = pci_pasid_features(pdev);
 359	if (features >= 0) {
 360		flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
 361
 362		if (features & PCI_PASID_CAP_EXEC)
 363			flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
 364
 365		if (features & PCI_PASID_CAP_PRIV)
 366			flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
 367	}
 368
 369	return flags;
 370}
 371
 372static inline int pdev_enable_cap_ats(struct pci_dev *pdev)
 373{
 374	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 375	int ret = -EINVAL;
 376
 377	if (dev_data->ats_enabled)
 378		return 0;
 379
 380	if (amd_iommu_iotlb_sup &&
 381	    (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) {
 382		ret = pci_enable_ats(pdev, PAGE_SHIFT);
 383		if (!ret) {
 384			dev_data->ats_enabled = 1;
 385			dev_data->ats_qdep    = pci_ats_queue_depth(pdev);
 386		}
 387	}
 388
 389	return ret;
 390}
 391
 392static inline void pdev_disable_cap_ats(struct pci_dev *pdev)
 393{
 394	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 395
 396	if (dev_data->ats_enabled) {
 397		pci_disable_ats(pdev);
 398		dev_data->ats_enabled = 0;
 399	}
 400}
 401
 402static inline int pdev_enable_cap_pri(struct pci_dev *pdev)
 403{
 404	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 405	int ret = -EINVAL;
 406
 407	if (dev_data->pri_enabled)
 408		return 0;
 409
 410	if (!dev_data->ats_enabled)
 411		return 0;
 412
 413	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) {
 414		/*
 415		 * First reset the PRI state of the device.
 416		 * FIXME: Hardcode number of outstanding requests for now
 417		 */
 418		if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) {
 419			dev_data->pri_enabled = 1;
 420			dev_data->pri_tlp     = pci_prg_resp_pasid_required(pdev);
 421
 422			ret = 0;
 423		}
 424	}
 425
 426	return ret;
 427}
 428
 429static inline void pdev_disable_cap_pri(struct pci_dev *pdev)
 430{
 431	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 432
 433	if (dev_data->pri_enabled) {
 434		pci_disable_pri(pdev);
 435		dev_data->pri_enabled = 0;
 436	}
 437}
 438
 439static inline int pdev_enable_cap_pasid(struct pci_dev *pdev)
 440{
 441	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 442	int ret = -EINVAL;
 443
 444	if (dev_data->pasid_enabled)
 445		return 0;
 446
 447	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) {
 448		/* Only allow access to user-accessible pages */
 449		ret = pci_enable_pasid(pdev, 0);
 450		if (!ret)
 451			dev_data->pasid_enabled = 1;
 452	}
 453
 454	return ret;
 455}
 456
 457static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
 458{
 459	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 460
 461	if (dev_data->pasid_enabled) {
 462		pci_disable_pasid(pdev);
 463		dev_data->pasid_enabled = 0;
 464	}
 465}
 466
 467static void pdev_enable_caps(struct pci_dev *pdev)
 468{
 469	pdev_enable_cap_ats(pdev);
 470	pdev_enable_cap_pasid(pdev);
 471	pdev_enable_cap_pri(pdev);
 472}
 473
 474static void pdev_disable_caps(struct pci_dev *pdev)
 475{
 476	pdev_disable_cap_ats(pdev);
 477	pdev_disable_cap_pasid(pdev);
 478	pdev_disable_cap_pri(pdev);
 479}
 480
 481/*
 482 * This function checks if the driver got a valid device from the caller to
 483 * avoid dereferencing invalid pointers.
 484 */
 485static bool check_device(struct device *dev)
 486{
 487	struct amd_iommu_pci_seg *pci_seg;
 488	struct amd_iommu *iommu;
 489	int devid, sbdf;
 490
 491	if (!dev)
 492		return false;
 493
 494	sbdf = get_device_sbdf_id(dev);
 495	if (sbdf < 0)
 496		return false;
 497	devid = PCI_SBDF_TO_DEVID(sbdf);
 498
 499	iommu = rlookup_amd_iommu(dev);
 500	if (!iommu)
 501		return false;
 502
 503	/* Out of our scope? */
 504	pci_seg = iommu->pci_seg;
 505	if (devid > pci_seg->last_bdf)
 506		return false;
 507
 508	return true;
 509}
 510
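/*
 * Allocate (or look up) the per-device iommu_dev_data, set up the PCI DMA
 * aliases and, where applicable, record the device's ATS/PRI/PASID
 * capabilities.
 */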
 511static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
 512{
 513	struct iommu_dev_data *dev_data;
 514	int devid, sbdf;
 515
 516	if (dev_iommu_priv_get(dev))
 517		return 0;
 518
 519	sbdf = get_device_sbdf_id(dev);
 520	if (sbdf < 0)
 521		return sbdf;
 522
 523	devid = PCI_SBDF_TO_DEVID(sbdf);
 524	dev_data = find_dev_data(iommu, devid);
 525	if (!dev_data)
 526		return -ENOMEM;
 527
 528	dev_data->dev = dev;
 529	setup_aliases(iommu, dev);
 530
 531	/*
 532	 * By default we use passthrough mode for IOMMUv2-capable devices.
 533	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to an
 534	 * invalid address), we ignore the capability for the device so
 535	 * it'll be forced to go into translation mode.
 536	 */
 537	if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
 538	    dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) {
 539		dev_data->flags = pdev_get_caps(to_pci_dev(dev));
 540	}
 541
 542	dev_iommu_priv_set(dev, dev_data);
 543
 544	return 0;
 545}
 546
 547static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
 548{
 549	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 550	struct dev_table_entry *dev_table = get_dev_table(iommu);
 551	int devid, sbdf;
 552
 553	sbdf = get_device_sbdf_id(dev);
 554	if (sbdf < 0)
 555		return;
 556
 557	devid = PCI_SBDF_TO_DEVID(sbdf);
 558	pci_seg->rlookup_table[devid] = NULL;
 559	memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
 560
 561	setup_aliases(iommu, dev);
 562}
 563
 564
 565/****************************************************************************
 566 *
 567 * Interrupt handling functions
 568 *
 569 ****************************************************************************/
 570
 571static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
 572{
 573	int i;
 574	struct dev_table_entry *dev_table = get_dev_table(iommu);
 575
 576	for (i = 0; i < 4; ++i)
 577		pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
 578}
 579
 580static void dump_command(unsigned long phys_addr)
 581{
 582	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
 583	int i;
 584
 585	for (i = 0; i < 4; ++i)
 586		pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
 587}
 588
 589static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
 590{
 591	struct iommu_dev_data *dev_data = NULL;
 592	int devid, vmg_tag, flags;
 593	struct pci_dev *pdev;
 594	u64 spa;
 595
 596	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
 597	vmg_tag = (event[1]) & 0xFFFF;
 598	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 599	spa     = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
 600
 601	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
 602					   devid & 0xff);
 603	if (pdev)
 604		dev_data = dev_iommu_priv_get(&pdev->dev);
 605
 606	if (dev_data) {
 607		if (__ratelimit(&dev_data->rs)) {
 608			pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
 609				vmg_tag, spa, flags);
 610		}
 611	} else {
 612		pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
 613			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 614			vmg_tag, spa, flags);
 615	}
 616
 617	if (pdev)
 618		pci_dev_put(pdev);
 619}
 620
 621static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
 622{
 623	struct iommu_dev_data *dev_data = NULL;
 624	int devid, flags_rmp, vmg_tag, flags;
 625	struct pci_dev *pdev;
 626	u64 gpa;
 627
 628	devid     = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
 629	flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
 630	vmg_tag   = (event[1]) & 0xFFFF;
 631	flags     = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 632	gpa       = ((u64)event[3] << 32) | event[2];
 633
 634	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
 635					   devid & 0xff);
 636	if (pdev)
 637		dev_data = dev_iommu_priv_get(&pdev->dev);
 638
 639	if (dev_data) {
 640		if (__ratelimit(&dev_data->rs)) {
 641			pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
 642				vmg_tag, gpa, flags_rmp, flags);
 643		}
 644	} else {
 645		pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
 646			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 647			vmg_tag, gpa, flags_rmp, flags);
 648	}
 649
 650	if (pdev)
 651		pci_dev_put(pdev);
 652}
 653
 654#define IS_IOMMU_MEM_TRANSACTION(flags)		\
 655	(((flags) & EVENT_FLAG_I) == 0)
 656
 657#define IS_WRITE_REQUEST(flags)			\
 658	((flags) & EVENT_FLAG_RW)
 659
 660static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
 661					u16 devid, u16 domain_id,
 662					u64 address, int flags)
 663{
 664	struct iommu_dev_data *dev_data = NULL;
 665	struct pci_dev *pdev;
 666
 667	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
 668					   devid & 0xff);
 669	if (pdev)
 670		dev_data = dev_iommu_priv_get(&pdev->dev);
 671
 672	if (dev_data) {
 673		/*
 674		 * If this is a DMA fault (for which the I(nterrupt)
 675		 * bit will be unset), allow report_iommu_fault() to
 676		 * prevent logging it.
 677		 */
 678		if (IS_IOMMU_MEM_TRANSACTION(flags)) {
 679			/* Device not attached to domain properly */
 680			if (dev_data->domain == NULL) {
 681				pr_err_ratelimited("Event logged [Device not attached to domain properly]\n");
 682				pr_err_ratelimited("  device=%04x:%02x:%02x.%x domain=0x%04x\n",
 683						   iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
 684						   PCI_FUNC(devid), domain_id);
 685				goto out;
 686			}
 687
 688			if (!report_iommu_fault(&dev_data->domain->domain,
 689						&pdev->dev, address,
 690						IS_WRITE_REQUEST(flags) ?
 691							IOMMU_FAULT_WRITE :
 692							IOMMU_FAULT_READ))
 693				goto out;
 694		}
 695
 696		if (__ratelimit(&dev_data->rs)) {
 697			pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
 698				domain_id, address, flags);
 699		}
 700	} else {
 701		pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
 702			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 703			domain_id, address, flags);
 704	}
 705
 706out:
 707	if (pdev)
 708		pci_dev_put(pdev);
 709}
 710
 711static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
 712{
 713	struct device *dev = iommu->iommu.dev;
 714	int type, devid, flags, tag;
 715	volatile u32 *event = __evt;
 716	int count = 0;
 717	u64 address;
 718	u32 pasid;
 719
 720retry:
 721	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
 722	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
 723	pasid   = (event[0] & EVENT_DOMID_MASK_HI) |
 724		  (event[1] & EVENT_DOMID_MASK_LO);
 725	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 726	address = (u64)(((u64)event[3]) << 32) | event[2];
 727
 728	if (type == 0) {
 729		/* Did we hit the erratum? */
 730		if (++count == LOOP_TIMEOUT) {
 731			pr_err("No event written to event log\n");
 732			return;
 733		}
 734		udelay(1);
 735		goto retry;
 736	}
 737
 738	if (type == EVENT_TYPE_IO_FAULT) {
 739		amd_iommu_report_page_fault(iommu, devid, pasid, address, flags);
 740		return;
 741	}
 742
 743	switch (type) {
 744	case EVENT_TYPE_ILL_DEV:
 745		dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
 746			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 747			pasid, address, flags);
 748		dump_dte_entry(iommu, devid);
 749		break;
 750	case EVENT_TYPE_DEV_TAB_ERR:
 751		dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
 752			"address=0x%llx flags=0x%04x]\n",
 753			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 754			address, flags);
 755		break;
 756	case EVENT_TYPE_PAGE_TAB_ERR:
 757		dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
 758			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 759			pasid, address, flags);
 760		break;
 761	case EVENT_TYPE_ILL_CMD:
 762		dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
 763		dump_command(address);
 764		break;
 765	case EVENT_TYPE_CMD_HARD_ERR:
 766		dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
 767			address, flags);
 768		break;
 769	case EVENT_TYPE_IOTLB_INV_TO:
 770		dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
 771			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 772			address);
 773		break;
 774	case EVENT_TYPE_INV_DEV_REQ:
 775		dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
 776			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 777			pasid, address, flags);
 778		break;
 779	case EVENT_TYPE_RMP_FAULT:
 780		amd_iommu_report_rmp_fault(iommu, event);
 781		break;
 782	case EVENT_TYPE_RMP_HW_ERR:
 783		amd_iommu_report_rmp_hw_error(iommu, event);
 784		break;
 785	case EVENT_TYPE_INV_PPR_REQ:
 786		pasid = PPR_PASID(*((u64 *)__evt));
 787		tag = event[1] & 0x03FF;
 788		dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
 789			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 790			pasid, address, flags, tag);
 791		break;
 792	default:
 793		dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
 794			event[0], event[1], event[2], event[3]);
 795	}
 796
 797	/*
 798	 * To detect hardware erratum 732 we need to clear the
 799	 * entry back to zero. This issue does not exist on
 800	 * SNP-enabled systems. Also, this buffer is not writable
 801	 * on SNP-enabled systems.
 802	 */
 803	if (!amd_iommu_snp_en)
 804		memset(__evt, 0, 4 * sizeof(u32));
 805}
 806
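/* Drain all entries between head and tail of the event log ring buffer. */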
 807static void iommu_poll_events(struct amd_iommu *iommu)
 808{
 809	u32 head, tail;
 810
 811	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
 812	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
 813
 814	while (head != tail) {
 815		iommu_print_event(iommu, iommu->evt_buf + head);
 816
 817		/* Update head pointer of hardware ring-buffer */
 818		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
 819		writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
 820	}
 821
 822}
 823
 824#ifdef CONFIG_IRQ_REMAP
 825static int (*iommu_ga_log_notifier)(u32);
 826
 827int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
 828{
 829	iommu_ga_log_notifier = notifier;
 830
 831	return 0;
 832}
 833EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
 834
 835static void iommu_poll_ga_log(struct amd_iommu *iommu)
 836{
 837	u32 head, tail;
 838
 839	if (iommu->ga_log == NULL)
 840		return;
 841
 842	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
 843	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
 844
 845	while (head != tail) {
 846		volatile u64 *raw;
 847		u64 log_entry;
 848
 849		raw = (u64 *)(iommu->ga_log + head);
 850
 851		/* Avoid memcpy function-call overhead */
 852		log_entry = *raw;
 853
 854		/* Update head pointer of hardware ring-buffer */
 855		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
 856		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
 857
 858		/* Handle GA entry */
 859		switch (GA_REQ_TYPE(log_entry)) {
 860		case GA_GUEST_NR:
 861			if (!iommu_ga_log_notifier)
 862				break;
 863
 864			pr_debug("%s: devid=%#x, ga_tag=%#x\n",
 865				 __func__, GA_DEVID(log_entry),
 866				 GA_TAG(log_entry));
 867
 868			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
 869				pr_err("GA log notifier failed.\n");
 870			break;
 871		default:
 872			break;
 873		}
 874	}
 875}
 876
 877static void
 878amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
 879{
 880	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
 881	    !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
 882		return;
 883
 884	dev_set_msi_domain(dev, iommu->ir_domain);
 885}
 886
 887#else /* CONFIG_IRQ_REMAP */
 888static inline void
 889amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
 890#endif /* !CONFIG_IRQ_REMAP */
 891
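/*
 * Common threaded-interrupt helper for the event, PPR and GA logs: clear the
 * status bits, run the log handler, call the overflow handler if the log
 * overflowed, and re-read the status register (see the ERBT1312 comment
 * below).
 */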
 892static void amd_iommu_handle_irq(void *data, const char *evt_type,
 893				 u32 int_mask, u32 overflow_mask,
 894				 void (*int_handler)(struct amd_iommu *),
 895				 void (*overflow_handler)(struct amd_iommu *))
 896{
 897	struct amd_iommu *iommu = (struct amd_iommu *) data;
 898	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
 899	u32 mask = int_mask | overflow_mask;
 900
 901	while (status & mask) {
 902		/* Enable interrupt sources again */
 903		writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET);
 904
 905		if (int_handler) {
 906			pr_devel("Processing IOMMU (ivhd%d) %s Log\n",
 907				 iommu->index, evt_type);
 908			int_handler(iommu);
 909		}
 910
 911		if ((status & overflow_mask) && overflow_handler)
 912			overflow_handler(iommu);
 913
 914		/*
 915		 * Hardware bug: ERBT1312
 916		 * When re-enabling the interrupt (by writing 1
 917		 * to clear the bit), the hardware might also try to set
 918		 * the interrupt bit in the event status register.
 919		 * In this scenario, the bit will be set and will disable
 920		 * subsequent interrupts.
 921		 *
 922		 * Workaround: The IOMMU driver should read back the
 923		 * status register and check if the interrupt bits are cleared.
 924		 * If not, the driver has to go through the interrupt handler
 925		 * again and re-clear the bits.
 926		 */
 927		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
 928	}
 929}
 930
 931irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
 932{
 933	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
 934			     MMIO_STATUS_EVT_OVERFLOW_MASK,
 935			     iommu_poll_events, amd_iommu_restart_event_logging);
 936
 937	return IRQ_HANDLED;
 938}
 939
 940irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
 941{
 942	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
 943			     MMIO_STATUS_PPR_OVERFLOW_MASK,
 944			     amd_iommu_poll_ppr_log, amd_iommu_restart_ppr_log);
 945
 946	return IRQ_HANDLED;
 947}
 948
 949irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
 950{
 951#ifdef CONFIG_IRQ_REMAP
 952	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
 953			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
 954			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
 955#endif
 956
 957	return IRQ_HANDLED;
 958}
 959
 960irqreturn_t amd_iommu_int_thread(int irq, void *data)
 961{
 962	amd_iommu_int_thread_evtlog(irq, data);
 963	amd_iommu_int_thread_pprlog(irq, data);
 964	amd_iommu_int_thread_galog(irq, data);
 965
 966	return IRQ_HANDLED;
 967}
 968
 969irqreturn_t amd_iommu_int_handler(int irq, void *data)
 970{
 971	return IRQ_WAKE_THREAD;
 972}
 973
 974/****************************************************************************
 975 *
 976 * IOMMU command queuing functions
 977 *
 978 ****************************************************************************/
 979
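/*
 * Poll the completion-wait semaphore until the IOMMU has written the expected
 * value, giving up after LOOP_TIMEOUT microseconds.
 */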
 980static int wait_on_sem(struct amd_iommu *iommu, u64 data)
 981{
 982	int i = 0;
 983
 984	while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
 985		udelay(1);
 986		i += 1;
 987	}
 988
 989	if (i == LOOP_TIMEOUT) {
 990		pr_alert("Completion-Wait loop timed out\n");
 991		return -EIO;
 992	}
 993
 994	return 0;
 995}
 996
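/*
 * Copy a command into the command ring buffer and make it visible to the
 * hardware by writing the new tail pointer to the MMIO tail register. Callers
 * hold iommu->lock.
 */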
 997static void copy_cmd_to_buffer(struct amd_iommu *iommu,
 998			       struct iommu_cmd *cmd)
 999{
1000	u8 *target;
1001	u32 tail;
1002
1003	/* Copy command to buffer */
1004	tail = iommu->cmd_buf_tail;
1005	target = iommu->cmd_buf + tail;
1006	memcpy(target, cmd, sizeof(*cmd));
1007
1008	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1009	iommu->cmd_buf_tail = tail;
1010
1011	/* Tell the IOMMU about it */
1012	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
1013}
1014
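/*
 * Build a COMPLETION_WAIT command that makes the IOMMU store 'data' to the
 * cmd_sem semaphore once all preceding commands have completed.
 */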
1015static void build_completion_wait(struct iommu_cmd *cmd,
1016				  struct amd_iommu *iommu,
1017				  u64 data)
1018{
1019	u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
1020
1021	memset(cmd, 0, sizeof(*cmd));
1022	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
1023	cmd->data[1] = upper_32_bits(paddr);
1024	cmd->data[2] = lower_32_bits(data);
1025	cmd->data[3] = upper_32_bits(data);
1026	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
1027}
1028
1029static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
1030{
1031	memset(cmd, 0, sizeof(*cmd));
1032	cmd->data[0] = devid;
1033	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
1034}
1035
1036/*
1037 * Builds an invalidation address which is suitable for one page or multiple
1038 * pages. Sets the size bit (S) if more than one page is flushed.
1039 */
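/*
 * Example (assuming 4K pages): flushing 0x2000 bytes at address 0x10000 gives
 * msb_diff = 12, so the address is widened to 0x10fff, masked back to 0x10000
 * and returned with the size bit set; the lowest clear bit (bit 12) then
 * encodes an 8 KiB invalidation range.
 */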
1040static inline u64 build_inv_address(u64 address, size_t size)
1041{
1042	u64 pages, end, msb_diff;
1043
1044	pages = iommu_num_pages(address, size, PAGE_SIZE);
1045
1046	if (pages == 1)
1047		return address & PAGE_MASK;
1048
1049	end = address + size - 1;
1050
1051	/*
1052	 * msb_diff holds the index of the most significant bit that
1053	 * flipped between the start and end.
1054	 */
1055	msb_diff = fls64(end ^ address) - 1;
1056
1057	/*
1058	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
1059	 * between the start and the end, invalidate everything.
1060	 */
1061	if (unlikely(msb_diff > 51)) {
1062		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
1063	} else {
1064		/*
1065		 * The msb-bit must be clear on the address. Just set all the
1066		 * lower bits.
1067		 */
1068		address |= (1ull << msb_diff) - 1;
1069	}
1070
1071	/* Clear bits 11:0 */
1072	address &= PAGE_MASK;
1073
1074	/* Set the size bit - we flush more than one 4kb page */
1075	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
1076}
1077
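/*
 * Build an INVALIDATE_IOMMU_PAGES command for the given domain; the PDE bit is
 * always set, and the GN bit plus PASID are added for guest (v2) invalidations.
 */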
1078static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
1079				  size_t size, u16 domid,
1080				  ioasid_t pasid, bool gn)
1081{
1082	u64 inv_address = build_inv_address(address, size);
1083
1084	memset(cmd, 0, sizeof(*cmd));
1085
1086	cmd->data[1] |= domid;
1087	cmd->data[2]  = lower_32_bits(inv_address);
1088	cmd->data[3]  = upper_32_bits(inv_address);
1089	/* PDE bit - we want to flush everything, not only the PTEs */
1090	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1091	if (gn) {
1092		cmd->data[0] |= pasid;
1093		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1094	}
1095	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1096}
1097
1098static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
1099				  u64 address, size_t size,
1100				  ioasid_t pasid, bool gn)
1101{
1102	u64 inv_address = build_inv_address(address, size);
1103
1104	memset(cmd, 0, sizeof(*cmd));
1105
1106	cmd->data[0]  = devid;
1107	cmd->data[0] |= (qdep & 0xff) << 24;
1108	cmd->data[1]  = devid;
1109	cmd->data[2]  = lower_32_bits(inv_address);
1110	cmd->data[3]  = upper_32_bits(inv_address);
1111	if (gn) {
1112		cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
1113		cmd->data[1] |= (pasid & 0xff) << 16;
1114		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1115	}
1116
1117	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1118}
1119
1120static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1121			       int status, int tag, u8 gn)
1122{
1123	memset(cmd, 0, sizeof(*cmd));
1124
1125	cmd->data[0]  = devid;
1126	if (gn) {
1127		cmd->data[1]  = pasid;
1128		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
1129	}
1130	cmd->data[3]  = tag & 0x1ff;
1131	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
1132
1133	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
1134}
1135
1136static void build_inv_all(struct iommu_cmd *cmd)
1137{
1138	memset(cmd, 0, sizeof(*cmd));
1139	CMD_SET_TYPE(cmd, CMD_INV_ALL);
1140}
1141
1142static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
1143{
1144	memset(cmd, 0, sizeof(*cmd));
1145	cmd->data[0] = devid;
1146	CMD_SET_TYPE(cmd, CMD_INV_IRT);
1147}
1148
1149/*
1150 * Writes the command to the IOMMU's command buffer and informs the
1151 * hardware about the new command.
1152 */
1153static int __iommu_queue_command_sync(struct amd_iommu *iommu,
1154				      struct iommu_cmd *cmd,
1155				      bool sync)
1156{
1157	unsigned int count = 0;
1158	u32 left, next_tail;
1159
1160	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1161again:
1162	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
1163
1164	if (left <= 0x20) {
1165		/* Skip udelay() the first time around */
1166		if (count++) {
1167			if (count == LOOP_TIMEOUT) {
1168				pr_err("Command buffer timeout\n");
1169				return -EIO;
1170			}
1171
1172			udelay(1);
1173		}
1174
1175		/* Update head and recheck remaining space */
1176		iommu->cmd_buf_head = readl(iommu->mmio_base +
1177					    MMIO_CMD_HEAD_OFFSET);
1178
1179		goto again;
1180	}
1181
1182	copy_cmd_to_buffer(iommu, cmd);
1183
1184	/* Do we need to make sure all commands are processed? */
1185	iommu->need_sync = sync;
1186
1187	return 0;
1188}
1189
1190static int iommu_queue_command_sync(struct amd_iommu *iommu,
1191				    struct iommu_cmd *cmd,
1192				    bool sync)
1193{
1194	unsigned long flags;
1195	int ret;
1196
1197	raw_spin_lock_irqsave(&iommu->lock, flags);
1198	ret = __iommu_queue_command_sync(iommu, cmd, sync);
1199	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1200
1201	return ret;
1202}
1203
1204static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1205{
1206	return iommu_queue_command_sync(iommu, cmd, true);
1207}
1208
1209/*
1210 * This function queues a completion wait command into the command
1211 * buffer of an IOMMU
1212 */
1213static int iommu_completion_wait(struct amd_iommu *iommu)
1214{
1215	struct iommu_cmd cmd;
1216	unsigned long flags;
1217	int ret;
1218	u64 data;
1219
1220	if (!iommu->need_sync)
1221		return 0;
1222
1223	data = atomic64_inc_return(&iommu->cmd_sem_val);
1224	build_completion_wait(&cmd, iommu, data);
1225
1226	raw_spin_lock_irqsave(&iommu->lock, flags);
1227
1228	ret = __iommu_queue_command_sync(iommu, &cmd, false);
1229	if (ret)
1230		goto out_unlock;
1231
1232	ret = wait_on_sem(iommu, data);
1233
1234out_unlock:
1235	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1236
1237	return ret;
1238}
1239
1240static void domain_flush_complete(struct protection_domain *domain)
1241{
1242	struct pdom_iommu_info *pdom_iommu_info;
1243	unsigned long i;
1244
1245	lockdep_assert_held(&domain->lock);
1246
1247	/*
1248	 * Devices of this domain are behind this IOMMU;
1249	 * we need to wait for completion of all commands.
1250	 */
1251	xa_for_each(&domain->iommu_array, i, pdom_iommu_info)
1252		iommu_completion_wait(pdom_iommu_info->iommu);
1253}
1254
1255static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
1256{
1257	struct iommu_cmd cmd;
1258
1259	build_inv_dte(&cmd, devid);
1260
1261	return iommu_queue_command(iommu, &cmd);
1262}
1263
1264static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
1265{
1266	u32 devid;
1267	u16 last_bdf = iommu->pci_seg->last_bdf;
1268
1269	for (devid = 0; devid <= last_bdf; ++devid)
1270		iommu_flush_dte(iommu, devid);
1271
1272	iommu_completion_wait(iommu);
1273}
1274
1275/*
1276 * This function uses heavy locking and may disable irqs for some time. But
1277 * this is no issue because it is only called during resume.
1278 */
1279static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
1280{
1281	u32 dom_id;
1282	u16 last_bdf = iommu->pci_seg->last_bdf;
1283
1284	for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
1285		struct iommu_cmd cmd;
1286		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1287				      dom_id, IOMMU_NO_PASID, false);
1288		iommu_queue_command(iommu, &cmd);
1289	}
1290
1291	iommu_completion_wait(iommu);
1292}
1293
1294static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
1295{
1296	struct iommu_cmd cmd;
1297
1298	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1299			      dom_id, IOMMU_NO_PASID, false);
1300	iommu_queue_command(iommu, &cmd);
1301
1302	iommu_completion_wait(iommu);
1303}
1304
1305static void amd_iommu_flush_all(struct amd_iommu *iommu)
1306{
1307	struct iommu_cmd cmd;
1308
1309	build_inv_all(&cmd);
1310
1311	iommu_queue_command(iommu, &cmd);
1312	iommu_completion_wait(iommu);
1313}
1314
1315static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1316{
1317	struct iommu_cmd cmd;
1318
1319	build_inv_irt(&cmd, devid);
1320
1321	iommu_queue_command(iommu, &cmd);
1322}
1323
1324static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
1325{
1326	u32 devid;
1327	u16 last_bdf = iommu->pci_seg->last_bdf;
1328
1329	if (iommu->irtcachedis_enabled)
1330		return;
1331
1332	for (devid = 0; devid <= last_bdf; devid++)
1333		iommu_flush_irt(iommu, devid);
1334
1335	iommu_completion_wait(iommu);
1336}
1337
1338void amd_iommu_flush_all_caches(struct amd_iommu *iommu)
1339{
1340	if (check_feature(FEATURE_IA)) {
1341		amd_iommu_flush_all(iommu);
1342	} else {
1343		amd_iommu_flush_dte_all(iommu);
1344		amd_iommu_flush_irt_all(iommu);
1345		amd_iommu_flush_tlb_all(iommu);
1346	}
1347}
1348
1349/*
1350 * Command send function for flushing on-device TLB
1351 */
1352static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address,
1353			      size_t size, ioasid_t pasid, bool gn)
1354{
1355	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1356	struct iommu_cmd cmd;
1357	int qdep = dev_data->ats_qdep;
1358
1359	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address,
1360			      size, pasid, gn);
1361
1362	return iommu_queue_command(iommu, &cmd);
1363}
1364
1365static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
1366{
1367	struct amd_iommu *iommu = data;
1368
1369	return iommu_flush_dte(iommu, alias);
1370}
1371
1372/*
1373 * Command send function for invalidating a device table entry
1374 */
1375static int device_flush_dte(struct iommu_dev_data *dev_data)
1376{
1377	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1378	struct pci_dev *pdev = NULL;
1379	struct amd_iommu_pci_seg *pci_seg;
1380	u16 alias;
1381	int ret;
1382
1383	if (dev_is_pci(dev_data->dev))
1384		pdev = to_pci_dev(dev_data->dev);
1385
1386	if (pdev)
1387		ret = pci_for_each_dma_alias(pdev,
1388					     device_flush_dte_alias, iommu);
1389	else
1390		ret = iommu_flush_dte(iommu, dev_data->devid);
1391	if (ret)
1392		return ret;
1393
1394	pci_seg = iommu->pci_seg;
1395	alias = pci_seg->alias_table[dev_data->devid];
1396	if (alias != dev_data->devid) {
1397		ret = iommu_flush_dte(iommu, alias);
1398		if (ret)
1399			return ret;
1400	}
1401
1402	if (dev_data->ats_enabled) {
1403		/* Invalidate the entire contents of an IOTLB */
1404		ret = device_flush_iotlb(dev_data, 0, ~0UL,
1405					 IOMMU_NO_PASID, false);
1406	}
1407
1408	return ret;
1409}
1410
1411static int domain_flush_pages_v2(struct protection_domain *pdom,
1412				 u64 address, size_t size)
1413{
1414	struct iommu_dev_data *dev_data;
1415	struct iommu_cmd cmd;
1416	int ret = 0;
1417
1418	lockdep_assert_held(&pdom->lock);
1419	list_for_each_entry(dev_data, &pdom->dev_list, list) {
1420		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1421		u16 domid = dev_data->gcr3_info.domid;
1422
1423		build_inv_iommu_pages(&cmd, address, size,
1424				      domid, IOMMU_NO_PASID, true);
1425
1426		ret |= iommu_queue_command(iommu, &cmd);
1427	}
1428
1429	return ret;
1430}
1431
1432static int domain_flush_pages_v1(struct protection_domain *pdom,
1433				 u64 address, size_t size)
1434{
1435	struct pdom_iommu_info *pdom_iommu_info;
1436	struct iommu_cmd cmd;
1437	int ret = 0;
1438	unsigned long i;
1439
1440	lockdep_assert_held(&pdom->lock);
1441
1442	build_inv_iommu_pages(&cmd, address, size,
1443			      pdom->id, IOMMU_NO_PASID, false);
1444
1445	xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) {
1446		/*
1447		 * Devices of this domain are behind this IOMMU;
1448		 * we need a TLB flush.
1449		 */
1450		ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd);
1451	}
1452
1453	return ret;
1454}
1455
1456/*
1457 * TLB invalidation function which is called from the mapping functions.
1458 * It flushes a range of PTEs of the domain.
1459 */
1460static void __domain_flush_pages(struct protection_domain *domain,
1461				 u64 address, size_t size)
1462{
1463	struct iommu_dev_data *dev_data;
1464	int ret = 0;
1465	ioasid_t pasid = IOMMU_NO_PASID;
1466	bool gn = false;
1467
1468	lockdep_assert_held(&domain->lock);
1469
1470	if (pdom_is_v2_pgtbl_mode(domain)) {
1471		gn = true;
1472		ret = domain_flush_pages_v2(domain, address, size);
1473	} else {
1474		ret = domain_flush_pages_v1(domain, address, size);
1475	}
1476
1477	list_for_each_entry(dev_data, &domain->dev_list, list) {
1478
1479		if (!dev_data->ats_enabled)
1480			continue;
1481
1482		ret |= device_flush_iotlb(dev_data, address, size, pasid, gn);
1483	}
1484
1485	WARN_ON(ret);
1486}
1487
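/*
 * Flush a range of the domain's IO/TLB on all IOMMUs serving it (plus the
 * device IOTLBs of ATS-enabled devices) and wait for completion. With NpCache
 * set, the range is split into naturally aligned chunks, see below.
 */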
1488void amd_iommu_domain_flush_pages(struct protection_domain *domain,
1489				  u64 address, size_t size)
1490{
1491	lockdep_assert_held(&domain->lock);
1492
1493	if (likely(!amd_iommu_np_cache)) {
1494		__domain_flush_pages(domain, address, size);
1495
1496		/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1497		domain_flush_complete(domain);
1498
1499		return;
1500	}
1501
1502	/*
1503	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
1504	 * In such setups it is best to avoid flushes of ranges which are not
1505	 * naturally aligned, since it would lead to flushes of unmodified
1506	 * PTEs. Such flushes would require the hypervisor to do more work than
1507	 * necessary. Therefore, perform repeated flushes of aligned ranges
1508	 * until you cover the range. Each iteration flushes the smaller
1509	 * between the natural alignment of the address that we flush and the
1510	 * greatest naturally aligned region that fits in the range.
1511	 */
1512	while (size != 0) {
1513		int addr_alignment = __ffs(address);
1514		int size_alignment = __fls(size);
1515		int min_alignment;
1516		size_t flush_size;
1517
1518		/*
1519		 * size is always non-zero, but address might be zero, causing
1520		 * addr_alignment to be negative. As the casting of the
1521		 * argument in __ffs(address) to long might trim the high bits
1522		 * of the address on x86-32, cast to long when doing the check.
1523		 */
1524		if (likely((unsigned long)address != 0))
1525			min_alignment = min(addr_alignment, size_alignment);
1526		else
1527			min_alignment = size_alignment;
1528
1529		flush_size = 1ul << min_alignment;
1530
1531		__domain_flush_pages(domain, address, flush_size);
1532		address += flush_size;
1533		size -= flush_size;
1534	}
1535
1536	/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1537	domain_flush_complete(domain);
1538}
1539
1540/* Flush the whole IO/TLB for a given protection domain - including PDE */
1541static void amd_iommu_domain_flush_all(struct protection_domain *domain)
1542{
1543	amd_iommu_domain_flush_pages(domain, 0,
1544				     CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
1545}
1546
1547void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
1548				     ioasid_t pasid, u64 address, size_t size)
1549{
1550	struct iommu_cmd cmd;
1551	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1552
1553	build_inv_iommu_pages(&cmd, address, size,
1554			      dev_data->gcr3_info.domid, pasid, true);
1555	iommu_queue_command(iommu, &cmd);
1556
1557	if (dev_data->ats_enabled)
1558		device_flush_iotlb(dev_data, address, size, pasid, true);
1559
1560	iommu_completion_wait(iommu);
1561}
1562
1563static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
1564				ioasid_t pasid)
1565{
1566	amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0,
1567					CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
1568}
1569
1570/* Flush the not present cache if it exists */
1571static void domain_flush_np_cache(struct protection_domain *domain,
1572		dma_addr_t iova, size_t size)
1573{
1574	if (unlikely(amd_iommu_np_cache)) {
1575		unsigned long flags;
1576
1577		spin_lock_irqsave(&domain->lock, flags);
1578		amd_iommu_domain_flush_pages(domain, iova, size);
1579		spin_unlock_irqrestore(&domain->lock, flags);
1580	}
1581}
1582
1583
1584/*
1585 * This function flushes the DTEs for all devices in the domain.
1586 */
1587void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
1588{
1589	struct iommu_dev_data *dev_data;
1590
1591	lockdep_assert_held(&domain->lock);
1592
1593	list_for_each_entry(dev_data, &domain->dev_list, list) {
1594		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
1595
1596		set_dte_entry(iommu, dev_data);
1597		clone_aliases(iommu, dev_data->dev);
1598	}
1599
1600	list_for_each_entry(dev_data, &domain->dev_list, list)
1601		device_flush_dte(dev_data);
1602
1603	domain_flush_complete(domain);
1604}
1605
1606int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
1607{
1608	struct iommu_dev_data *dev_data;
1609	struct amd_iommu *iommu;
1610	struct iommu_cmd cmd;
1611
1612	dev_data = dev_iommu_priv_get(dev);
1613	iommu    = get_amd_iommu_from_dev(dev);
1614
1615	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
1616			   tag, dev_data->pri_tlp);
1617
1618	return iommu_queue_command(iommu, &cmd);
1619}
1620
1621/****************************************************************************
1622 *
1623 * The next functions belong to the domain allocation. A domain is
1624 * allocated for every IOMMU as the default domain. If device isolation
1625 * is enabled, every device gets its own domain. The most important thing
1626 * about domains is the page table mapping the DMA address space they
1627 * contain.
1628 *
1629 ****************************************************************************/
1630
1631static int pdom_id_alloc(void)
1632{
1633	return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC);
1634}
1635
1636static void pdom_id_free(int id)
1637{
1638	ida_free(&pdom_ids, id);
1639}
1640
1641static void free_gcr3_tbl_level1(u64 *tbl)
1642{
1643	u64 *ptr;
1644	int i;
1645
1646	for (i = 0; i < 512; ++i) {
1647		if (!(tbl[i] & GCR3_VALID))
1648			continue;
1649
1650		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1651
1652		iommu_free_page(ptr);
1653	}
1654}
1655
1656static void free_gcr3_tbl_level2(u64 *tbl)
1657{
1658	u64 *ptr;
1659	int i;
1660
1661	for (i = 0; i < 512; ++i) {
1662		if (!(tbl[i] & GCR3_VALID))
1663			continue;
1664
1665		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1666
1667		free_gcr3_tbl_level1(ptr);
1668	}
1669}
1670
1671static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info)
1672{
1673	if (gcr3_info->glx == 2)
1674		free_gcr3_tbl_level2(gcr3_info->gcr3_tbl);
1675	else if (gcr3_info->glx == 1)
1676		free_gcr3_tbl_level1(gcr3_info->gcr3_tbl);
1677	else
1678		WARN_ON_ONCE(gcr3_info->glx != 0);
1679
1680	gcr3_info->glx = 0;
1681
1682	/* Free per device domain ID */
1683	pdom_id_free(gcr3_info->domid);
1684
1685	iommu_free_page(gcr3_info->gcr3_tbl);
1686	gcr3_info->gcr3_tbl = NULL;
1687}
1688
1689/*
1690 * Number of GCR3 table levels required. Each level is a 4-Kbyte
1691 * page and can contain up to 512 entries.
1692 */
1693static int get_gcr3_levels(int pasids)
1694{
1695	int levels;
1696
1697	if (pasids == -1)
1698		return amd_iommu_max_glx_val;
1699
1700	levels = get_count_order(pasids);
1701
1702	return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels;
1703}
1704
1705static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
1706			    struct amd_iommu *iommu, int pasids)
1707{
1708	int levels = get_gcr3_levels(pasids);
1709	int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
1710	int domid;
1711
1712	if (levels > amd_iommu_max_glx_val)
1713		return -EINVAL;
1714
1715	if (gcr3_info->gcr3_tbl)
1716		return -EBUSY;
1717
1718	/* Allocate per device domain ID */
1719	domid = pdom_id_alloc();
1720	if (domid <= 0)
1721		return -ENOSPC;
1722	gcr3_info->domid = domid;
1723
1724	gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC);
1725	if (gcr3_info->gcr3_tbl == NULL) {
1726		pdom_id_free(domid);
1727		return -ENOMEM;
1728	}
1729
1730	gcr3_info->glx = levels;
1731
1732	return 0;
1733}
1734
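/*
 * Walk the GCR3 table and return a pointer to the entry for @pasid, allocating
 * missing intermediate levels when @alloc is true.
 */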
1735static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info,
1736			   ioasid_t pasid, bool alloc)
1737{
1738	int index;
1739	u64 *pte;
1740	u64 *root = gcr3_info->gcr3_tbl;
1741	int level = gcr3_info->glx;
1742
1743	while (true) {
1744
1745		index = (pasid >> (9 * level)) & 0x1ff;
1746		pte   = &root[index];
1747
1748		if (level == 0)
1749			break;
1750
1751		if (!(*pte & GCR3_VALID)) {
1752			if (!alloc)
1753				return NULL;
1754
1755			root = (void *)get_zeroed_page(GFP_ATOMIC);
1756			if (root == NULL)
1757				return NULL;
1758
1759			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
1760		}
1761
1762		root = iommu_phys_to_virt(*pte & PAGE_MASK);
1763
1764		level -= 1;
1765	}
1766
1767	return pte;
1768}
1769
1770static int update_gcr3(struct iommu_dev_data *dev_data,
1771		       ioasid_t pasid, unsigned long gcr3, bool set)
1772{
1773	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1774	u64 *pte;
1775
1776	pte = __get_gcr3_pte(gcr3_info, pasid, true);
1777	if (pte == NULL)
1778		return -ENOMEM;
1779
1780	if (set)
1781		*pte = (gcr3 & PAGE_MASK) | GCR3_VALID;
1782	else
1783		*pte = 0;
1784
1785	dev_flush_pasid_all(dev_data, pasid);
1786	return 0;
1787}
1788
1789int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid,
1790		       unsigned long gcr3)
1791{
1792	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1793	int ret;
1794
1795	iommu_group_mutex_assert(dev_data->dev);
1796
1797	ret = update_gcr3(dev_data, pasid, gcr3, true);
1798	if (ret)
1799		return ret;
1800
1801	gcr3_info->pasid_cnt++;
1802	return ret;
1803}
1804
1805int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
1806{
1807	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1808	int ret;
1809
1810	iommu_group_mutex_assert(dev_data->dev);
1811
1812	ret = update_gcr3(dev_data, pasid, 0, false);
1813	if (ret)
1814		return ret;
1815
1816	gcr3_info->pasid_cnt--;
1817	return ret;
1818}
1819
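/*
 * Build the device table entry for a device: host page table root and mode,
 * domain ID, GCR3 table (if any) and the various feature bits, then flush the
 * translation cache for the old domain ID if one is being overwritten.
 */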
1820static void set_dte_entry(struct amd_iommu *iommu,
1821			  struct iommu_dev_data *dev_data)
1822{
1823	u64 pte_root = 0;
1824	u64 flags = 0;
1825	u32 old_domid;
1826	u16 devid = dev_data->devid;
1827	u16 domid;
1828	struct protection_domain *domain = dev_data->domain;
1829	struct dev_table_entry *dev_table = get_dev_table(iommu);
1830	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1831
1832	if (gcr3_info && gcr3_info->gcr3_tbl)
1833		domid = dev_data->gcr3_info.domid;
1834	else
1835		domid = domain->id;
1836
1837	if (domain->iop.mode != PAGE_MODE_NONE)
1838		pte_root = iommu_virt_to_phys(domain->iop.root);
1839
1840	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
1841		    << DEV_ENTRY_MODE_SHIFT;
1842
1843	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
1844
1845	/*
1846	 * When SNP is enabled, only set the TV bit when IOMMU
1847	 * page translation is in use.
1848	 */
1849	if (!amd_iommu_snp_en || (domid != 0))
1850		pte_root |= DTE_FLAG_TV;
1851
1852	flags = dev_table[devid].data[1];
1853
1854	if (dev_data->ats_enabled)
1855		flags |= DTE_FLAG_IOTLB;
1856
1857	if (dev_data->ppr)
1858		pte_root |= 1ULL << DEV_ENTRY_PPR;
1859
1860	if (domain->dirty_tracking)
1861		pte_root |= DTE_FLAG_HAD;
1862
1863	if (gcr3_info && gcr3_info->gcr3_tbl) {
1864		u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
1865		u64 glx  = gcr3_info->glx;
1866		u64 tmp;
1867
1868		pte_root |= DTE_FLAG_GV;
1869		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
1870
1871		/* First mask out possible old values for GCR3 table */
1872		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
1873		flags    &= ~tmp;
1874
1875		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
1876		flags    &= ~tmp;
1877
1878		/* Encode GCR3 table into DTE */
1879		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
1880		pte_root |= tmp;
1881
1882		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
1883		flags    |= tmp;
1884
1885		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
1886		flags    |= tmp;
1887
1888		if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
1889			dev_table[devid].data[2] |=
1890				((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
1891		}
1892
1893		/* GIOV is supported with V2 page table mode only */
1894		if (pdom_is_v2_pgtbl_mode(domain))
1895			pte_root |= DTE_FLAG_GIOV;
1896	}
1897
1898	flags &= ~DEV_DOMID_MASK;
1899	flags |= domid;
1900
1901	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
1902	dev_table[devid].data[1]  = flags;
1903	dev_table[devid].data[0]  = pte_root;
1904
1905	/*
1906	 * A kdump kernel might be replacing a domain ID that was copied from
1907	 * the previous kernel. If so, it needs to flush the translation cache
1908	 * entries for the old domain ID that is being overwritten.
1909	 */
1910	if (old_domid) {
1911		amd_iommu_flush_tlb_domid(iommu, old_domid);
1912	}
1913}
1914
1915static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
1916{
1917	struct dev_table_entry *dev_table = get_dev_table(iommu);
1918
1919	/* remove entry from the device table seen by the hardware */
1920	dev_table[devid].data[0]  = DTE_FLAG_V;
1921
1922	if (!amd_iommu_snp_en)
1923		dev_table[devid].data[0] |= DTE_FLAG_TV;
1924
1925	dev_table[devid].data[1] &= DTE_FLAG_MASK;
1926
1927	amd_iommu_apply_erratum_63(iommu, devid);
1928}
1929
1930/* Update and flush DTE for the given device */
1931static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
1932{
1933	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1934
1935	if (set)
1936		set_dte_entry(iommu, dev_data);
1937	else
1938		clear_dte_entry(iommu, dev_data->devid);
1939
1940	clone_aliases(iommu, dev_data->dev);
1941	device_flush_dte(dev_data);
1942	iommu_completion_wait(iommu);
1943}
1944
1945/*
1946 * If the domain is SVA capable then initialize the GCR3 table. Also, if the
1947 * domain is in v2 page table mode then update GCR3[0].
1948 */
1949static int init_gcr3_table(struct iommu_dev_data *dev_data,
1950			   struct protection_domain *pdom)
1951{
1952	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1953	int max_pasids = dev_data->max_pasids;
1954	int ret = 0;
1955
1956	/*
1957	 * If the domain is in pt mode then set up the GCR3 table only if
1958	 * the device is PASID capable.
1959	 */
1960	if (pdom_is_in_pt_mode(pdom) && !pdev_pasid_supported(dev_data))
1961		return ret;
1962
1963	/*
1964	 * By default, set up the GCR3 table to support the maximum number
1965	 * of PASIDs supported by the device/IOMMU.
1966	 */
1967	ret = setup_gcr3_table(&dev_data->gcr3_info, iommu,
1968			       max_pasids > 0 ?  max_pasids : 1);
1969	if (ret)
1970		return ret;
1971
1972	/* Setup GCR3[0] only if domain is setup with v2 page table mode */
1973	if (!pdom_is_v2_pgtbl_mode(pdom))
1974		return ret;
1975
1976	ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
1977	if (ret)
1978		free_gcr3_table(&dev_data->gcr3_info);
1979
1980	return ret;
1981}
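/*
 * Summary of how the GCR3 helpers above combine: a PASID-capable device
 * attaching to a passthrough domain gets a GCR3 table sized for the smaller
 * of the device's and the IOMMU's PASID limits, but GCR3[0] stays clear;
 * attaching to a v2 page-table domain additionally points GCR3[0] at the
 * domain's page table, and destroy_gcr3_table() below undoes both steps.
 */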
1982
1983static void destroy_gcr3_table(struct iommu_dev_data *dev_data,
1984			       struct protection_domain *pdom)
1985{
1986	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1987
1988	if (pdom_is_v2_pgtbl_mode(pdom))
1989		update_gcr3(dev_data, 0, 0, false);
1990
1991	if (gcr3_info->gcr3_tbl == NULL)
1992		return;
1993
1994	free_gcr3_table(gcr3_info);
1995}
1996
1997static int pdom_attach_iommu(struct amd_iommu *iommu,
1998			     struct protection_domain *pdom)
1999{
2000	struct pdom_iommu_info *pdom_iommu_info, *curr;
2001	struct io_pgtable_cfg *cfg = &pdom->iop.pgtbl.cfg;
2002	unsigned long flags;
2003	int ret = 0;
2004
2005	spin_lock_irqsave(&pdom->lock, flags);
2006
2007	pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
2008	if (pdom_iommu_info) {
2009		pdom_iommu_info->refcnt++;
2010		goto out_unlock;
2011	}
2012
2013	pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC);
2014	if (!pdom_iommu_info) {
2015		ret = -ENOMEM;
2016		goto out_unlock;
2017	}
2018
2019	pdom_iommu_info->iommu = iommu;
2020	pdom_iommu_info->refcnt = 1;
2021
2022	curr = xa_cmpxchg(&pdom->iommu_array, iommu->index,
2023			  NULL, pdom_iommu_info, GFP_ATOMIC);
2024	if (curr) {
2025		kfree(pdom_iommu_info);
2026		ret = -ENOSPC;
2027		goto out_unlock;
2028	}
2029
2030	/* Update NUMA Node ID */
2031	if (cfg->amd.nid == NUMA_NO_NODE)
2032		cfg->amd.nid = dev_to_node(&iommu->dev->dev);
2033
2034out_unlock:
2035	spin_unlock_irqrestore(&pdom->lock, flags);
2036	return ret;
2037}
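/*
 * Worked example for the refcounting above: two devices behind the same
 * IOMMU attaching to one domain share a single pdom_iommu_info entry with
 * refcnt == 2; pdom_detach_iommu() below drops the count and only erases the
 * xarray slot and frees the entry when the last such device detaches.
 */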
2038
2039static void pdom_detach_iommu(struct amd_iommu *iommu,
2040			      struct protection_domain *pdom)
2041{
2042	struct pdom_iommu_info *pdom_iommu_info;
2043	unsigned long flags;
2044
2045	spin_lock_irqsave(&pdom->lock, flags);
2046
2047	pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
2048	if (!pdom_iommu_info) {
2049		spin_unlock_irqrestore(&pdom->lock, flags);
2050		return;
2051	}
2052
2053	pdom_iommu_info->refcnt--;
2054	if (pdom_iommu_info->refcnt == 0) {
2055		xa_erase(&pdom->iommu_array, iommu->index);
2056		kfree(pdom_iommu_info);
2057	}
2058
2059	spin_unlock_irqrestore(&pdom->lock, flags);
2060}
2061
2062/*
2063 * If a device is not yet associated with a domain, this function makes the
2064 * device visible in the domain.
2065 */
2066static int attach_device(struct device *dev,
2067			 struct protection_domain *domain)
2068{
2069	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2070	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2071	struct pci_dev *pdev;
2072	unsigned long flags;
2073	int ret = 0;
2074
2075	mutex_lock(&dev_data->mutex);
2076
2077	if (dev_data->domain != NULL) {
2078		ret = -EBUSY;
2079		goto out;
2080	}
2081
2082	/* Do reference counting */
2083	ret = pdom_attach_iommu(iommu, domain);
2084	if (ret)
2085		goto out;
2086
2087	/* Setup GCR3 table */
2088	if (pdom_is_sva_capable(domain)) {
2089		ret = init_gcr3_table(dev_data, domain);
2090		if (ret) {
2091			pdom_detach_iommu(iommu, domain);
2092			goto out;
2093		}
2094	}
2095
2096	pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
2097	if (pdev && pdom_is_sva_capable(domain)) {
2098		pdev_enable_caps(pdev);
2099
2100		/*
2101		 * The device can continue to function even if IOPF
2102		 * enablement fails, so on the error path just disable
2103		 * the device's PRI support.
2104		 */
2105		if (amd_iommu_iopf_add_device(iommu, dev_data))
2106			pdev_disable_cap_pri(pdev);
2107	} else if (pdev) {
2108		pdev_enable_cap_ats(pdev);
2109	}
2110
2111	/* Update data structures */
2112	dev_data->domain = domain;
2113	spin_lock_irqsave(&domain->lock, flags);
2114	list_add(&dev_data->list, &domain->dev_list);
2115	spin_unlock_irqrestore(&domain->lock, flags);
2116
2117	/* Update device table */
2118	dev_update_dte(dev_data, true);
2119
2120out:
2121	mutex_unlock(&dev_data->mutex);
2122
2123	return ret;
2124}
2125
2126/*
2127 * Removes a device from a protection domain (with devtable_lock held)
2128 */
2129static void detach_device(struct device *dev)
2130{
2131	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2132	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2133	struct protection_domain *domain = dev_data->domain;
2134	unsigned long flags;
2135
2136	mutex_lock(&dev_data->mutex);
2137
2138	/*
2139	 * First check if the device is still attached. It might already
2140	 * be detached from its domain because the generic
2141	 * iommu_detach_group code detached it and we try again here in
2142	 * our alias handling.
2143	 */
2144	if (WARN_ON(!dev_data->domain))
2145		goto out;
2146
2147	/* Remove IOPF handler */
2148	if (dev_data->ppr) {
2149		iopf_queue_flush_dev(dev);
2150		amd_iommu_iopf_remove_device(iommu, dev_data);
2151	}
2152
2153	if (dev_is_pci(dev))
2154		pdev_disable_caps(to_pci_dev(dev));
2155
2156	/* Clear DTE and flush the entry */
2157	dev_update_dte(dev_data, false);
2158
2159	/* Flush IOTLB and wait for the flushes to finish */
2160	spin_lock_irqsave(&domain->lock, flags);
2161	amd_iommu_domain_flush_all(domain);
2162	list_del(&dev_data->list);
2163	spin_unlock_irqrestore(&domain->lock, flags);
2164
2165	/* Clear GCR3 table */
2166	if (pdom_is_sva_capable(domain))
2167		destroy_gcr3_table(dev_data, domain);
2168
2169	/* Update data structures */
2170	dev_data->domain = NULL;
2171
2172	/* decrease reference counters - needs to happen after the flushes */
2173	pdom_detach_iommu(iommu, domain);
2174
2175out:
2176	mutex_unlock(&dev_data->mutex);
2177}
2178
2179static struct iommu_device *amd_iommu_probe_device(struct device *dev)
2180{
2181	struct iommu_device *iommu_dev;
2182	struct amd_iommu *iommu;
2183	struct iommu_dev_data *dev_data;
2184	int ret;
2185
2186	if (!check_device(dev))
2187		return ERR_PTR(-ENODEV);
2188
2189	iommu = rlookup_amd_iommu(dev);
2190	if (!iommu)
2191		return ERR_PTR(-ENODEV);
2192
2193	/* Not registered yet? */
2194	if (!iommu->iommu.ops)
2195		return ERR_PTR(-ENODEV);
2196
2197	if (dev_iommu_priv_get(dev))
2198		return &iommu->iommu;
2199
2200	ret = iommu_init_device(iommu, dev);
2201	if (ret) {
2202		dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
2203		iommu_dev = ERR_PTR(ret);
2204		iommu_ignore_device(iommu, dev);
2205		goto out_err;
2206	}
2207
2208	amd_iommu_set_pci_msi_domain(dev, iommu);
2209	iommu_dev = &iommu->iommu;
2210
2211	/*
2212	 * If both the IOMMU and the device support PASID, max_pasids will
2213	 * hold the maximum number of supported PASIDs; otherwise it is zero.
2214	 */
2215	dev_data = dev_iommu_priv_get(dev);
2216	if (amd_iommu_pasid_supported() && dev_is_pci(dev) &&
2217	    pdev_pasid_supported(dev_data)) {
2218		dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids,
2219					     pci_max_pasids(to_pci_dev(dev)));
2220	}
2221
2222out_err:
2223	iommu_completion_wait(iommu);
2224
2225	if (dev_is_pci(dev))
2226		pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT);
2227
2228	return iommu_dev;
2229}
2230
2231static void amd_iommu_release_device(struct device *dev)
2232{
2233	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2234
2235	WARN_ON(dev_data->domain);
2236
2237	/*
2238	 * We keep dev_data around for unplugged devices and reuse it when the
2239	 * device is re-plugged - not doing so would introduce a ton of races.
2240	 */
2241}
2242
2243static struct iommu_group *amd_iommu_device_group(struct device *dev)
2244{
2245	if (dev_is_pci(dev))
2246		return pci_device_group(dev);
2247
2248	return acpihid_device_group(dev);
2249}
2250
2251/*****************************************************************************
2252 *
2253 * The following functions belong to the exported interface of AMD IOMMU
2254 *
2255 * This interface allows access to lower level functions of the IOMMU
2256 * like protection domain handling and assignment of devices to domains,
2257 * which is not possible with the dma_ops interface.
2258 *
2259 *****************************************************************************/
2260
2261void protection_domain_free(struct protection_domain *domain)
2262{
2263	WARN_ON(!list_empty(&domain->dev_list));
2264	if (domain->domain.type & __IOMMU_DOMAIN_PAGING)
2265		free_io_pgtable_ops(&domain->iop.pgtbl.ops);
2266	pdom_id_free(domain->id);
2267	kfree(domain);
2268}
2269
2270static void protection_domain_init(struct protection_domain *domain, int nid)
2271{
2272	spin_lock_init(&domain->lock);
2273	INIT_LIST_HEAD(&domain->dev_list);
2274	INIT_LIST_HEAD(&domain->dev_data_list);
2275	xa_init(&domain->iommu_array);
2276	domain->iop.pgtbl.cfg.amd.nid = nid;
2277}
2278
2279struct protection_domain *protection_domain_alloc(int nid)
2280{
2281	struct protection_domain *domain;
2282	int domid;
2283
2284	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2285	if (!domain)
2286		return NULL;
2287
2288	domid = pdom_id_alloc();
2289	if (domid <= 0) {
2290		kfree(domain);
2291		return NULL;
2292	}
2293	domain->id = domid;
2294
2295	protection_domain_init(domain, nid);
2296
2297	return domain;
2298}
2299
2300static int pdom_setup_pgtable(struct protection_domain *domain)
2301{
2302	struct io_pgtable_ops *pgtbl_ops;
2303	enum io_pgtable_fmt fmt;
2304
2305	switch (domain->pd_mode) {
2306	case PD_MODE_V1:
2307		fmt = AMD_IOMMU_V1;
2308		break;
2309	case PD_MODE_V2:
2310		fmt = AMD_IOMMU_V2;
2311		break;
2312	}
2313
2314	pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain);
2315	if (!pgtbl_ops)
2316		return -ENOMEM;
2317
2318	return 0;
2319}
2320
2321static inline u64 dma_max_address(enum protection_domain_mode pgtable)
2322{
2323	if (pgtable == PD_MODE_V1)
2324		return ~0ULL;
2325
2326	/* V2 with 4/5 level page table */
2327	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
2328}
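/*
 * Worked example for dma_max_address(), assuming PM_LEVEL_SHIFT(l) expands
 * to (12 + 9 * l) as defined in amd_iommu_types.h: a 4-level v2 guest table
 * yields an aperture end of (1ULL << 48) - 1 (48-bit IOVA) and a 5-level
 * table yields (1ULL << 57) - 1, while v1 mode reports a full 64-bit
 * aperture.
 */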
2329
2330static bool amd_iommu_hd_support(struct amd_iommu *iommu)
2331{
2332	return iommu && (iommu->features & FEATURE_HDSUP);
2333}
2334
2335static struct iommu_domain *
2336do_iommu_domain_alloc(struct device *dev, u32 flags,
2337		      enum protection_domain_mode pgtable)
2338{
2339	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
2340	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2341	struct protection_domain *domain;
2342	int ret;
2343
2344	domain = protection_domain_alloc(dev_to_node(dev));
2345	if (!domain)
2346		return ERR_PTR(-ENOMEM);
2347
2348	domain->pd_mode = pgtable;
2349	ret = pdom_setup_pgtable(domain);
2350	if (ret) {
2351		pdom_id_free(domain->id);
2352		kfree(domain);
2353		return ERR_PTR(ret);
2354	}
2355
2356	domain->domain.geometry.aperture_start = 0;
2357	domain->domain.geometry.aperture_end   = dma_max_address(pgtable);
2358	domain->domain.geometry.force_aperture = true;
2359	domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap;
2360
2361	domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
2362	domain->domain.ops = iommu->iommu.ops->default_domain_ops;
2363
2364	if (dirty_tracking)
2365		domain->domain.dirty_ops = &amd_dirty_ops;
2366
2367	return &domain->domain;
2368}
2369
2370static struct iommu_domain *
2371amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
2372				    const struct iommu_user_data *user_data)
2373
2374{
2375	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2376	const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
2377						IOMMU_HWPT_ALLOC_PASID;
2378
2379	if ((flags & ~supported_flags) || user_data)
2380		return ERR_PTR(-EOPNOTSUPP);
2381
2382	switch (flags & supported_flags) {
2383	case IOMMU_HWPT_ALLOC_DIRTY_TRACKING:
2384		/* Allocate domain with v1 page table for dirty tracking */
2385		if (!amd_iommu_hd_support(iommu))
2386			break;
2387		return do_iommu_domain_alloc(dev, flags, PD_MODE_V1);
2388	case IOMMU_HWPT_ALLOC_PASID:
2389		/* Allocate domain with v2 page table if IOMMU supports PASID. */
2390		if (!amd_iommu_pasid_supported())
2391			break;
2392		return do_iommu_domain_alloc(dev, flags, PD_MODE_V2);
2393	case 0:
2394		/* If nothing specific is required use the kernel commandline default */
2395		return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable);
2396	default:
2397		break;
2398	}
2399	return ERR_PTR(-EOPNOTSUPP);
2400}
2401
2402void amd_iommu_domain_free(struct iommu_domain *dom)
2403{
2404	struct protection_domain *domain = to_pdomain(dom);
2405
2406	protection_domain_free(domain);
2407}
2408
2409static int blocked_domain_attach_device(struct iommu_domain *domain,
2410					struct device *dev)
2411{
2412	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2413
2414	if (dev_data->domain)
2415		detach_device(dev);
2416
2417	/* Clear DTE and flush the entry */
2418	mutex_lock(&dev_data->mutex);
2419	dev_update_dte(dev_data, false);
2420	mutex_unlock(&dev_data->mutex);
2421
2422	return 0;
2423}
2424
2425static struct iommu_domain blocked_domain = {
2426	.type = IOMMU_DOMAIN_BLOCKED,
2427	.ops = &(const struct iommu_domain_ops) {
2428		.attach_dev     = blocked_domain_attach_device,
2429	}
2430};
2431
2432static struct protection_domain identity_domain;
2433
2434static const struct iommu_domain_ops identity_domain_ops = {
2435	.attach_dev = amd_iommu_attach_device,
2436};
2437
2438void amd_iommu_init_identity_domain(void)
2439{
2440	struct iommu_domain *domain = &identity_domain.domain;
2441
2442	domain->type = IOMMU_DOMAIN_IDENTITY;
2443	domain->ops = &identity_domain_ops;
2444	domain->owner = &amd_iommu_ops;
2445
2446	identity_domain.id = pdom_id_alloc();
2447
2448	protection_domain_init(&identity_domain, NUMA_NO_NODE);
2449}
2450
2451/* Same as blocked domain except it supports only ops->attach_dev() */
2452static struct iommu_domain release_domain = {
2453	.type = IOMMU_DOMAIN_BLOCKED,
2454	.ops = &(const struct iommu_domain_ops) {
2455		.attach_dev     = blocked_domain_attach_device,
2456	}
2457};
2458
2459static int amd_iommu_attach_device(struct iommu_domain *dom,
2460				   struct device *dev)
2461{
2462	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2463	struct protection_domain *domain = to_pdomain(dom);
2464	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2465	int ret;
2466
2467	/*
2468	 * Skip attaching the device to the domain if the new domain is
2469	 * the same as the device's current domain.
2470	 */
2471	if (dev_data->domain == domain)
2472		return 0;
2473
2474	dev_data->defer_attach = false;
2475
2476	/*
2477	 * Restrict to devices with compatible IOMMU hardware support
2478	 * when enforcement of dirty tracking is enabled.
2479	 */
2480	if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
2481		return -EINVAL;
2482
2483	if (dev_data->domain)
2484		detach_device(dev);
2485
2486	ret = attach_device(dev, domain);
2487
2488#ifdef CONFIG_IRQ_REMAP
2489	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
2490		if (dom->type == IOMMU_DOMAIN_UNMANAGED)
2491			dev_data->use_vapic = 1;
2492		else
2493			dev_data->use_vapic = 0;
2494	}
2495#endif
2496
2497	return ret;
2498}
2499
2500static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
2501				    unsigned long iova, size_t size)
2502{
2503	struct protection_domain *domain = to_pdomain(dom);
2504	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
2505
2506	if (ops->map_pages)
2507		domain_flush_np_cache(domain, iova, size);
2508	return 0;
2509}
2510
2511static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
2512			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
2513			       int iommu_prot, gfp_t gfp, size_t *mapped)
2514{
2515	struct protection_domain *domain = to_pdomain(dom);
2516	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
2517	int prot = 0;
2518	int ret = -EINVAL;
2519
2520	if ((domain->pd_mode == PD_MODE_V1) &&
2521	    (domain->iop.mode == PAGE_MODE_NONE))
2522		return -EINVAL;
2523
2524	if (iommu_prot & IOMMU_READ)
2525		prot |= IOMMU_PROT_IR;
2526	if (iommu_prot & IOMMU_WRITE)
2527		prot |= IOMMU_PROT_IW;
2528
2529	if (ops->map_pages) {
2530		ret = ops->map_pages(ops, iova, paddr, pgsize,
2531				     pgcount, prot, gfp, mapped);
2532	}
2533
2534	return ret;
2535}
2536
2537static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
2538					    struct iommu_iotlb_gather *gather,
2539					    unsigned long iova, size_t size)
2540{
2541	/*
2542	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
2543	 * Unless we run in a virtual machine, which can be inferred from
2544	 * whether the "non-present cache" is on, it is probably best to prefer
2545	 * (potentially) too extensive TLB flushing (i.e., more misses) over
2546	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
2547	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
2548	 * the guest, and the trade-off is different: unnecessary TLB flushes
2549	 * should be avoided.
2550	 */
2551	if (amd_iommu_np_cache &&
2552	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
2553		iommu_iotlb_sync(domain, gather);
2554
2555	iommu_iotlb_gather_add_range(gather, iova, size);
2556}
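/*
 * Minimal caller-side sketch of the gather pattern described above, using
 * only the generic <linux/iommu.h> helpers (hypothetical usage, not code
 * from this driver; dom, iova1/size1 and iova2/size2 are placeholders):
 *
 *	struct iommu_iotlb_gather gather;
 *
 *	iommu_iotlb_gather_init(&gather);
 *	iommu_unmap_fast(dom, iova1, size1, &gather);
 *	iommu_unmap_fast(dom, iova2, size2, &gather);
 *	iommu_iotlb_sync(dom, &gather);
 *
 * The ranges are merged into one flush unless amd_iommu_np_cache forces an
 * early sync for disjoint ranges, as implemented above.
 */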
2557
2558static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
2559				    size_t pgsize, size_t pgcount,
2560				    struct iommu_iotlb_gather *gather)
2561{
2562	struct protection_domain *domain = to_pdomain(dom);
2563	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
2564	size_t r;
2565
2566	if ((domain->pd_mode == PD_MODE_V1) &&
2567	    (domain->iop.mode == PAGE_MODE_NONE))
2568		return 0;
2569
2570	r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
2571
2572	if (r)
2573		amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
2574
2575	return r;
2576}
2577
2578static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2579					  dma_addr_t iova)
2580{
2581	struct protection_domain *domain = to_pdomain(dom);
2582	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
2583
2584	return ops->iova_to_phys(ops, iova);
2585}
2586
2587static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
2588{
2589	switch (cap) {
2590	case IOMMU_CAP_CACHE_COHERENCY:
2591		return true;
2592	case IOMMU_CAP_NOEXEC:
2593		return false;
2594	case IOMMU_CAP_PRE_BOOT_PROTECTION:
2595		return amdr_ivrs_remap_support;
2596	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
2597		return true;
2598	case IOMMU_CAP_DEFERRED_FLUSH:
2599		return true;
2600	case IOMMU_CAP_DIRTY_TRACKING: {
2601		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2602
2603		return amd_iommu_hd_support(iommu);
2604	}
2605	default:
2606		break;
2607	}
2608
2609	return false;
2610}
2611
2612static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
2613					bool enable)
2614{
2615	struct protection_domain *pdomain = to_pdomain(domain);
2616	struct dev_table_entry *dev_table;
2617	struct iommu_dev_data *dev_data;
2618	bool domain_flush = false;
2619	struct amd_iommu *iommu;
2620	unsigned long flags;
2621	u64 pte_root;
2622
2623	spin_lock_irqsave(&pdomain->lock, flags);
2624	if (!(pdomain->dirty_tracking ^ enable)) {
2625		spin_unlock_irqrestore(&pdomain->lock, flags);
2626		return 0;
2627	}
2628
2629	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
2630		iommu = get_amd_iommu_from_dev_data(dev_data);
2631
2632		dev_table = get_dev_table(iommu);
2633		pte_root = dev_table[dev_data->devid].data[0];
2634
2635		pte_root = (enable ? pte_root | DTE_FLAG_HAD :
2636				     pte_root & ~DTE_FLAG_HAD);
2637
2638		/* Flush device DTE */
2639		dev_table[dev_data->devid].data[0] = pte_root;
2640		device_flush_dte(dev_data);
2641		domain_flush = true;
2642	}
2643
2644	/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
2645	if (domain_flush)
2646		amd_iommu_domain_flush_all(pdomain);
2647
2648	pdomain->dirty_tracking = enable;
2649	spin_unlock_irqrestore(&pdomain->lock, flags);
2650
2651	return 0;
2652}
2653
2654static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
2655					  unsigned long iova, size_t size,
2656					  unsigned long flags,
2657					  struct iommu_dirty_bitmap *dirty)
2658{
2659	struct protection_domain *pdomain = to_pdomain(domain);
2660	struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops;
2661	unsigned long lflags;
2662
2663	if (!ops || !ops->read_and_clear_dirty)
2664		return -EOPNOTSUPP;
2665
2666	spin_lock_irqsave(&pdomain->lock, lflags);
2667	if (!pdomain->dirty_tracking && dirty->bitmap) {
2668		spin_unlock_irqrestore(&pdomain->lock, lflags);
2669		return -EINVAL;
2670	}
2671	spin_unlock_irqrestore(&pdomain->lock, lflags);
2672
2673	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
2674}
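/*
 * Hypothetical sketch of how a user of amd_dirty_ops drives the two
 * callbacks above, assuming the generic iommu_dirty_bitmap helpers from
 * <linux/iommu.h> (not code from this driver; iova_bitmap, iova and size
 * are placeholders):
 *
 *	struct iommu_iotlb_gather gather;
 *	struct iommu_dirty_bitmap dirty;
 *
 *	iommu_iotlb_gather_init(&gather);
 *	iommu_dirty_bitmap_init(&dirty, iova_bitmap, &gather);
 *	dom->dirty_ops->set_dirty_tracking(dom, true);
 *	dom->dirty_ops->read_and_clear_dirty(dom, iova, size, 0, &dirty);
 *	iommu_iotlb_sync(dom, &gather);
 */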
2675
2676static void amd_iommu_get_resv_regions(struct device *dev,
2677				       struct list_head *head)
2678{
2679	struct iommu_resv_region *region;
2680	struct unity_map_entry *entry;
2681	struct amd_iommu *iommu;
2682	struct amd_iommu_pci_seg *pci_seg;
2683	int devid, sbdf;
2684
2685	sbdf = get_device_sbdf_id(dev);
2686	if (sbdf < 0)
2687		return;
2688
2689	devid = PCI_SBDF_TO_DEVID(sbdf);
2690	iommu = get_amd_iommu_from_dev(dev);
2691	pci_seg = iommu->pci_seg;
2692
2693	list_for_each_entry(entry, &pci_seg->unity_map, list) {
2694		int type, prot = 0;
2695		size_t length;
2696
2697		if (devid < entry->devid_start || devid > entry->devid_end)
2698			continue;
2699
2700		type   = IOMMU_RESV_DIRECT;
2701		length = entry->address_end - entry->address_start;
2702		if (entry->prot & IOMMU_PROT_IR)
2703			prot |= IOMMU_READ;
2704		if (entry->prot & IOMMU_PROT_IW)
2705			prot |= IOMMU_WRITE;
2706		if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
2707			/* Exclusion range */
2708			type = IOMMU_RESV_RESERVED;
2709
2710		region = iommu_alloc_resv_region(entry->address_start,
2711						 length, prot, type,
2712						 GFP_KERNEL);
2713		if (!region) {
2714			dev_err(dev, "Out of memory allocating dm-regions\n");
2715			return;
2716		}
2717		list_add_tail(&region->list, head);
2718	}
2719
2720	region = iommu_alloc_resv_region(MSI_RANGE_START,
2721					 MSI_RANGE_END - MSI_RANGE_START + 1,
2722					 0, IOMMU_RESV_MSI, GFP_KERNEL);
2723	if (!region)
2724		return;
2725	list_add_tail(&region->list, head);
2726
2727	region = iommu_alloc_resv_region(HT_RANGE_START,
2728					 HT_RANGE_END - HT_RANGE_START + 1,
2729					 0, IOMMU_RESV_RESERVED, GFP_KERNEL);
2730	if (!region)
2731		return;
2732	list_add_tail(&region->list, head);
2733}
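/*
 * Size check for the two fixed reservations above: the MSI window spans
 * 0xfeefffff - 0xfee00000 + 1 = 0x100000 bytes (1 MiB), and the
 * HyperTransport window spans 0xffffffffff - 0xfd00000000 + 1 =
 * 0x300000000 bytes (12 GiB).
 */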
2734
2735static bool amd_iommu_is_attach_deferred(struct device *dev)
2736{
2737	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2738
2739	return dev_data->defer_attach;
2740}
2741
2742static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
2743{
2744	struct protection_domain *dom = to_pdomain(domain);
2745	unsigned long flags;
2746
2747	spin_lock_irqsave(&dom->lock, flags);
2748	amd_iommu_domain_flush_all(dom);
2749	spin_unlock_irqrestore(&dom->lock, flags);
2750}
2751
2752static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
2753				 struct iommu_iotlb_gather *gather)
2754{
2755	struct protection_domain *dom = to_pdomain(domain);
2756	unsigned long flags;
2757
2758	spin_lock_irqsave(&dom->lock, flags);
2759	amd_iommu_domain_flush_pages(dom, gather->start,
2760				     gather->end - gather->start + 1);
2761	spin_unlock_irqrestore(&dom->lock, flags);
2762}
2763
2764static int amd_iommu_def_domain_type(struct device *dev)
2765{
2766	struct iommu_dev_data *dev_data;
2767
2768	dev_data = dev_iommu_priv_get(dev);
2769	if (!dev_data)
2770		return 0;
2771
2772	/* Always use DMA domain for untrusted device */
2773	if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted)
2774		return IOMMU_DOMAIN_DMA;
2775
2776	/*
2777	 * Do not identity map IOMMUv2 capable devices when:
2778	 *  - memory encryption is active, because some of those devices
2779	 *    (AMD GPUs) don't have the encryption bit in their DMA-mask
2780	 *    and require remapping.
2781	 *  - SNP is enabled, because it prohibits DTE[Mode]=0.
2782	 */
2783	if (pdev_pasid_supported(dev_data) &&
2784	    !cc_platform_has(CC_ATTR_MEM_ENCRYPT) &&
2785	    !amd_iommu_snp_en) {
2786		return IOMMU_DOMAIN_IDENTITY;
2787	}
2788
2789	return 0;
2790}
2791
2792static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
2793{
2794	/* IOMMU_PTE_FC is always set */
2795	return true;
2796}
2797
2798static const struct iommu_dirty_ops amd_dirty_ops = {
2799	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
2800	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
2801};
2802
2803static int amd_iommu_dev_enable_feature(struct device *dev,
2804					enum iommu_dev_features feat)
2805{
2806	int ret = 0;
2807
2808	switch (feat) {
2809	case IOMMU_DEV_FEAT_IOPF:
2810	case IOMMU_DEV_FEAT_SVA:
2811		break;
2812	default:
2813		ret = -EINVAL;
2814		break;
2815	}
2816	return ret;
2817}
2818
2819static int amd_iommu_dev_disable_feature(struct device *dev,
2820					 enum iommu_dev_features feat)
2821{
2822	int ret = 0;
2823
2824	switch (feat) {
2825	case IOMMU_DEV_FEAT_IOPF:
2826	case IOMMU_DEV_FEAT_SVA:
2827		break;
2828	default:
2829		ret = -EINVAL;
2830		break;
2831	}
2832	return ret;
2833}
2834
2835const struct iommu_ops amd_iommu_ops = {
2836	.capable = amd_iommu_capable,
2837	.blocked_domain = &blocked_domain,
2838	.release_domain = &release_domain,
2839	.identity_domain = &identity_domain.domain,
2840	.domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags,
2841	.domain_alloc_sva = amd_iommu_domain_alloc_sva,
2842	.probe_device = amd_iommu_probe_device,
2843	.release_device = amd_iommu_release_device,
2844	.device_group = amd_iommu_device_group,
2845	.get_resv_regions = amd_iommu_get_resv_regions,
2846	.is_attach_deferred = amd_iommu_is_attach_deferred,
2847	.def_domain_type = amd_iommu_def_domain_type,
2848	.dev_enable_feat = amd_iommu_dev_enable_feature,
2849	.dev_disable_feat = amd_iommu_dev_disable_feature,
2850	.remove_dev_pasid = amd_iommu_remove_dev_pasid,
2851	.page_response = amd_iommu_page_response,
2852	.default_domain_ops = &(const struct iommu_domain_ops) {
2853		.attach_dev	= amd_iommu_attach_device,
2854		.map_pages	= amd_iommu_map_pages,
2855		.unmap_pages	= amd_iommu_unmap_pages,
2856		.iotlb_sync_map	= amd_iommu_iotlb_sync_map,
2857		.iova_to_phys	= amd_iommu_iova_to_phys,
2858		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
2859		.iotlb_sync	= amd_iommu_iotlb_sync,
2860		.free		= amd_iommu_domain_free,
2861		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
2862	}
2863};
2864
2865#ifdef CONFIG_IRQ_REMAP
2866
2867/*****************************************************************************
2868 *
2869 * Interrupt Remapping Implementation
2870 *
2871 *****************************************************************************/
2872
2873static struct irq_chip amd_ir_chip;
2874static DEFINE_SPINLOCK(iommu_table_lock);
2875
2876static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
2877{
2878	int ret;
2879	u64 data;
2880	unsigned long flags;
2881	struct iommu_cmd cmd, cmd2;
2882
2883	if (iommu->irtcachedis_enabled)
2884		return;
2885
2886	build_inv_irt(&cmd, devid);
2887	data = atomic64_inc_return(&iommu->cmd_sem_val);
2888	build_completion_wait(&cmd2, iommu, data);
2889
2890	raw_spin_lock_irqsave(&iommu->lock, flags);
2891	ret = __iommu_queue_command_sync(iommu, &cmd, true);
2892	if (ret)
2893		goto out;
2894	ret = __iommu_queue_command_sync(iommu, &cmd2, false);
2895	if (ret)
2896		goto out;
2897	wait_on_sem(iommu, data);
2898out:
2899	raw_spin_unlock_irqrestore(&iommu->lock, flags);
2900}
2901
2902static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
2903			      struct irq_remap_table *table)
2904{
2905	u64 dte;
2906	struct dev_table_entry *dev_table = get_dev_table(iommu);
2907
2908	dte	= dev_table[devid].data[2];
2909	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
2910	dte	|= iommu_virt_to_phys(table->table);
2911	dte	|= DTE_IRQ_REMAP_INTCTL;
2912	dte	|= DTE_INTTABLEN;
2913	dte	|= DTE_IRQ_REMAP_ENABLE;
2914
2915	dev_table[devid].data[2] = dte;
2916}
2917
2918static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
2919{
2920	struct irq_remap_table *table;
2921	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2922
2923	if (WARN_ONCE(!pci_seg->rlookup_table[devid],
2924		      "%s: no iommu for devid %x:%x\n",
2925		      __func__, pci_seg->id, devid))
2926		return NULL;
2927
2928	table = pci_seg->irq_lookup_table[devid];
2929	if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
2930		      __func__, pci_seg->id, devid))
2931		return NULL;
2932
2933	return table;
2934}
2935
2936static struct irq_remap_table *__alloc_irq_table(void)
2937{
2938	struct irq_remap_table *table;
2939
2940	table = kzalloc(sizeof(*table), GFP_KERNEL);
2941	if (!table)
2942		return NULL;
2943
2944	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
2945	if (!table->table) {
2946		kfree(table);
2947		return NULL;
2948	}
2949	raw_spin_lock_init(&table->lock);
2950
2951	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
2952		memset(table->table, 0,
2953		       MAX_IRQS_PER_TABLE * sizeof(u32));
2954	else
2955		memset(table->table, 0,
2956		       (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
2957	return table;
2958}
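/*
 * Rough sizing for the allocation above, assuming MAX_IRQS_PER_TABLE is 512:
 * legacy 32-bit IRTEs need 512 * 4 = 2 KiB per table, while 128-bit GA IRTEs
 * (sizeof(u64) * 2 per entry) need 512 * 16 = 8 KiB, which is why the memset
 * length differs between the two branches.
 */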
2959
2960static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
2961				  struct irq_remap_table *table)
2962{
2963	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2964
2965	pci_seg->irq_lookup_table[devid] = table;
2966	set_dte_irq_entry(iommu, devid, table);
2967	iommu_flush_dte(iommu, devid);
2968}
2969
2970static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
2971				       void *data)
2972{
2973	struct irq_remap_table *table = data;
2974	struct amd_iommu_pci_seg *pci_seg;
2975	struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev);
2976
2977	if (!iommu)
2978		return -EINVAL;
2979
2980	pci_seg = iommu->pci_seg;
2981	pci_seg->irq_lookup_table[alias] = table;
2982	set_dte_irq_entry(iommu, alias, table);
2983	iommu_flush_dte(pci_seg->rlookup_table[alias], alias);
2984
2985	return 0;
2986}
2987
2988static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
2989					       u16 devid, struct pci_dev *pdev)
2990{
2991	struct irq_remap_table *table = NULL;
2992	struct irq_remap_table *new_table = NULL;
2993	struct amd_iommu_pci_seg *pci_seg;
2994	unsigned long flags;
2995	u16 alias;
2996
2997	spin_lock_irqsave(&iommu_table_lock, flags);
2998
2999	pci_seg = iommu->pci_seg;
3000	table = pci_seg->irq_lookup_table[devid];
3001	if (table)
3002		goto out_unlock;
3003
3004	alias = pci_seg->alias_table[devid];
3005	table = pci_seg->irq_lookup_table[alias];
3006	if (table) {
3007		set_remap_table_entry(iommu, devid, table);
3008		goto out_wait;
3009	}
3010	spin_unlock_irqrestore(&iommu_table_lock, flags);
3011
3012	/* Nothing there yet, allocate new irq remapping table */
3013	new_table = __alloc_irq_table();
3014	if (!new_table)
3015		return NULL;
3016
3017	spin_lock_irqsave(&iommu_table_lock, flags);
3018
3019	table = pci_seg->irq_lookup_table[devid];
3020	if (table)
3021		goto out_unlock;
3022
3023	table = pci_seg->irq_lookup_table[alias];
3024	if (table) {
3025		set_remap_table_entry(iommu, devid, table);
3026		goto out_wait;
3027	}
3028
3029	table = new_table;
3030	new_table = NULL;
3031
3032	if (pdev)
3033		pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
3034				       table);
3035	else
3036		set_remap_table_entry(iommu, devid, table);
3037
3038	if (devid != alias)
3039		set_remap_table_entry(iommu, alias, table);
3040
3041out_wait:
3042	iommu_completion_wait(iommu);
3043
3044out_unlock:
3045	spin_unlock_irqrestore(&iommu_table_lock, flags);
3046
3047	if (new_table) {
3048		kmem_cache_free(amd_iommu_irq_cache, new_table->table);
3049		kfree(new_table);
3050	}
3051	return table;
3052}
3053
3054static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
3055			   bool align, struct pci_dev *pdev)
3056{
3057	struct irq_remap_table *table;
3058	int index, c, alignment = 1;
3059	unsigned long flags;
3060
3061	table = alloc_irq_table(iommu, devid, pdev);
3062	if (!table)
3063		return -ENODEV;
3064
3065	if (align)
3066		alignment = roundup_pow_of_two(count);
3067
3068	raw_spin_lock_irqsave(&table->lock, flags);
3069
3070	/* Scan table for free entries */
3071	for (index = ALIGN(table->min_index, alignment), c = 0;
3072	     index < MAX_IRQS_PER_TABLE;) {
3073		if (!iommu->irte_ops->is_allocated(table, index)) {
3074			c += 1;
3075		} else {
3076			c     = 0;
3077			index = ALIGN(index + 1, alignment);
3078			continue;
3079		}
3080
3081		if (c == count)	{
3082			for (; c != 0; --c)
3083				iommu->irte_ops->set_allocated(table, index - c + 1);
3084
3085			index -= count - 1;
3086			goto out;
3087		}
3088
3089		index++;
3090	}
3091
3092	index = -ENOSPC;
3093
3094out:
3095	raw_spin_unlock_irqrestore(&table->lock, flags);
3096
3097	return index;
3098}
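/*
 * Worked example for the scan above: a 4-vector multi-MSI allocation asks
 * for count = 4 with align = true, so alignment = 4. Starting at
 * ALIGN(table->min_index, 4), the loop counts consecutive free slots and
 * restarts at the next aligned index whenever it hits an allocated one,
 * returning the first index of the 4-entry block or -ENOSPC if none fits.
 */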
3099
3100static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3101			    struct irte_ga *irte)
3102{
3103	struct irq_remap_table *table;
3104	struct irte_ga *entry;
3105	unsigned long flags;
3106	u128 old;
3107
3108	table = get_irq_table(iommu, devid);
3109	if (!table)
3110		return -ENOMEM;
3111
3112	raw_spin_lock_irqsave(&table->lock, flags);
3113
3114	entry = (struct irte_ga *)table->table;
3115	entry = &entry[index];
3116
3117	/*
3118	 * We use cmpxchg16 to atomically update the 128-bit IRTE,
3119	 * and it cannot be updated by the hardware or other processors
3120	 * behind us, so the return value of cmpxchg16 should be the
3121	 * same as the old value.
3122	 */
3123	old = entry->irte;
3124	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
3125
3126	raw_spin_unlock_irqrestore(&table->lock, flags);
3127
3128	return 0;
3129}
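/*
 * Note on the update above: struct irte_ga is two u64 halves (lo/hi) that
 * the IOMMU reads as a single 128-bit entry, so both halves must be swapped
 * in one try_cmpxchg128() to keep the hardware from ever observing a torn
 * IRTE.
 */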
3130
3131static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3132			  struct irte_ga *irte)
3133{
3134	int ret;
3135
3136	ret = __modify_irte_ga(iommu, devid, index, irte);
3137	if (ret)
3138		return ret;
3139
3140	iommu_flush_irt_and_complete(iommu, devid);
3141
3142	return 0;
3143}
3144
3145static int modify_irte(struct amd_iommu *iommu,
3146		       u16 devid, int index, union irte *irte)
3147{
3148	struct irq_remap_table *table;
3149	unsigned long flags;
3150
3151	table = get_irq_table(iommu, devid);
3152	if (!table)
3153		return -ENOMEM;
3154
3155	raw_spin_lock_irqsave(&table->lock, flags);
3156	table->table[index] = irte->val;
3157	raw_spin_unlock_irqrestore(&table->lock, flags);
3158
3159	iommu_flush_irt_and_complete(iommu, devid);
3160
3161	return 0;
3162}
3163
3164static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
3165{
3166	struct irq_remap_table *table;
3167	unsigned long flags;
3168
3169	table = get_irq_table(iommu, devid);
3170	if (!table)
3171		return;
3172
3173	raw_spin_lock_irqsave(&table->lock, flags);
3174	iommu->irte_ops->clear_allocated(table, index);
3175	raw_spin_unlock_irqrestore(&table->lock, flags);
3176
3177	iommu_flush_irt_and_complete(iommu, devid);
3178}
3179
3180static void irte_prepare(void *entry,
3181			 u32 delivery_mode, bool dest_mode,
3182			 u8 vector, u32 dest_apicid, int devid)
3183{
3184	union irte *irte = (union irte *) entry;
3185
3186	irte->val                = 0;
3187	irte->fields.vector      = vector;
3188	irte->fields.int_type    = delivery_mode;
3189	irte->fields.destination = dest_apicid;
3190	irte->fields.dm          = dest_mode;
3191	irte->fields.valid       = 1;
3192}
3193
3194static void irte_ga_prepare(void *entry,
3195			    u32 delivery_mode, bool dest_mode,
3196			    u8 vector, u32 dest_apicid, int devid)
3197{
3198	struct irte_ga *irte = (struct irte_ga *) entry;
3199
3200	irte->lo.val                      = 0;
3201	irte->hi.val                      = 0;
3202	irte->lo.fields_remap.int_type    = delivery_mode;
3203	irte->lo.fields_remap.dm          = dest_mode;
3204	irte->hi.fields.vector            = vector;
3205	irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
3206	irte->hi.fields.destination       = APICID_TO_IRTE_DEST_HI(dest_apicid);
3207	irte->lo.fields_remap.valid       = 1;
3208}
3209
3210static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3211{
3212	union irte *irte = (union irte *) entry;
3213
3214	irte->fields.valid = 1;
3215	modify_irte(iommu, devid, index, irte);
3216}
3217
3218static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3219{
3220	struct irte_ga *irte = (struct irte_ga *) entry;
3221
3222	irte->lo.fields_remap.valid = 1;
3223	modify_irte_ga(iommu, devid, index, irte);
3224}
3225
3226static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3227{
3228	union irte *irte = (union irte *) entry;
3229
3230	irte->fields.valid = 0;
3231	modify_irte(iommu, devid, index, irte);
3232}
3233
3234static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3235{
3236	struct irte_ga *irte = (struct irte_ga *) entry;
3237
3238	irte->lo.fields_remap.valid = 0;
3239	modify_irte_ga(iommu, devid, index, irte);
3240}
3241
3242static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3243			      u8 vector, u32 dest_apicid)
3244{
3245	union irte *irte = (union irte *) entry;
3246
3247	irte->fields.vector = vector;
3248	irte->fields.destination = dest_apicid;
3249	modify_irte(iommu, devid, index, irte);
3250}
3251
3252static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3253				 u8 vector, u32 dest_apicid)
3254{
3255	struct irte_ga *irte = (struct irte_ga *) entry;
3256
3257	if (!irte->lo.fields_remap.guest_mode) {
3258		irte->hi.fields.vector = vector;
3259		irte->lo.fields_remap.destination =
3260					APICID_TO_IRTE_DEST_LO(dest_apicid);
3261		irte->hi.fields.destination =
3262					APICID_TO_IRTE_DEST_HI(dest_apicid);
3263		modify_irte_ga(iommu, devid, index, irte);
3264	}
3265}
3266
3267#define IRTE_ALLOCATED (~1U)
3268static void irte_set_allocated(struct irq_remap_table *table, int index)
3269{
3270	table->table[index] = IRTE_ALLOCATED;
3271}
3272
3273static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
3274{
3275	struct irte_ga *ptr = (struct irte_ga *)table->table;
3276	struct irte_ga *irte = &ptr[index];
3277
3278	memset(&irte->lo.val, 0, sizeof(u64));
3279	memset(&irte->hi.val, 0, sizeof(u64));
3280	irte->hi.fields.vector = 0xff;
3281}
3282
3283static bool irte_is_allocated(struct irq_remap_table *table, int index)
3284{
3285	union irte *ptr = (union irte *)table->table;
3286	union irte *irte = &ptr[index];
3287
3288	return irte->val != 0;
3289}
3290
3291static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
3292{
3293	struct irte_ga *ptr = (struct irte_ga *)table->table;
3294	struct irte_ga *irte = &ptr[index];
3295
3296	return irte->hi.fields.vector != 0;
3297}
3298
3299static void irte_clear_allocated(struct irq_remap_table *table, int index)
3300{
3301	table->table[index] = 0;
3302}
3303
3304static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
3305{
3306	struct irte_ga *ptr = (struct irte_ga *)table->table;
3307	struct irte_ga *irte = &ptr[index];
3308
3309	memset(&irte->lo.val, 0, sizeof(u64));
3310	memset(&irte->hi.val, 0, sizeof(u64));
3311}
3312
3313static int get_devid(struct irq_alloc_info *info)
3314{
3315	switch (info->type) {
3316	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3317		return get_ioapic_devid(info->devid);
3318	case X86_IRQ_ALLOC_TYPE_HPET:
3319		return get_hpet_devid(info->devid);
3320	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3321	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3322		return get_device_sbdf_id(msi_desc_to_dev(info->desc));
3323	default:
3324		WARN_ON_ONCE(1);
3325		return -1;
3326	}
3327}
3328
3329struct irq_remap_ops amd_iommu_irq_ops = {
3330	.prepare		= amd_iommu_prepare,
3331	.enable			= amd_iommu_enable,
3332	.disable		= amd_iommu_disable,
3333	.reenable		= amd_iommu_reenable,
3334	.enable_faulting	= amd_iommu_enable_faulting,
3335};
3336
3337static void fill_msi_msg(struct msi_msg *msg, u32 index)
3338{
3339	msg->data = index;
3340	msg->address_lo = 0;
3341	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
3342	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
3343}
3344
3345static void irq_remapping_prepare_irte(struct amd_ir_data *data,
3346				       struct irq_cfg *irq_cfg,
3347				       struct irq_alloc_info *info,
3348				       int devid, int index, int sub_handle)
3349{
3350	struct irq_2_irte *irte_info = &data->irq_2_irte;
3351	struct amd_iommu *iommu = data->iommu;
3352
3353	if (!iommu)
3354		return;
3355
3356	data->irq_2_irte.devid = devid;
3357	data->irq_2_irte.index = index + sub_handle;
3358	iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED,
3359				 apic->dest_mode_logical, irq_cfg->vector,
3360				 irq_cfg->dest_apicid, devid);
3361
3362	switch (info->type) {
3363	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3364	case X86_IRQ_ALLOC_TYPE_HPET:
3365	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3366	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3367		fill_msi_msg(&data->msi_entry, irte_info->index);
3368		break;
3369
3370	default:
3371		BUG_ON(1);
3372		break;
3373	}
3374}
3375
3376struct amd_irte_ops irte_32_ops = {
3377	.prepare = irte_prepare,
3378	.activate = irte_activate,
3379	.deactivate = irte_deactivate,
3380	.set_affinity = irte_set_affinity,
3381	.set_allocated = irte_set_allocated,
3382	.is_allocated = irte_is_allocated,
3383	.clear_allocated = irte_clear_allocated,
3384};
3385
3386struct amd_irte_ops irte_128_ops = {
3387	.prepare = irte_ga_prepare,
3388	.activate = irte_ga_activate,
3389	.deactivate = irte_ga_deactivate,
3390	.set_affinity = irte_ga_set_affinity,
3391	.set_allocated = irte_ga_set_allocated,
3392	.is_allocated = irte_ga_is_allocated,
3393	.clear_allocated = irte_ga_clear_allocated,
3394};
3395
3396static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
3397			       unsigned int nr_irqs, void *arg)
3398{
3399	struct irq_alloc_info *info = arg;
3400	struct irq_data *irq_data;
3401	struct amd_ir_data *data = NULL;
3402	struct amd_iommu *iommu;
3403	struct irq_cfg *cfg;
3404	int i, ret, devid, seg, sbdf;
3405	int index;
3406
3407	if (!info)
3408		return -EINVAL;
3409	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI)
3410		return -EINVAL;
3411
3412	sbdf = get_devid(info);
3413	if (sbdf < 0)
3414		return -EINVAL;
3415
3416	seg = PCI_SBDF_TO_SEGID(sbdf);
3417	devid = PCI_SBDF_TO_DEVID(sbdf);
3418	iommu = __rlookup_amd_iommu(seg, devid);
3419	if (!iommu)
3420		return -EINVAL;
3421
3422	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
3423	if (ret < 0)
3424		return ret;
3425
3426	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
3427		struct irq_remap_table *table;
3428
3429		table = alloc_irq_table(iommu, devid, NULL);
3430		if (table) {
3431			if (!table->min_index) {
3432				/*
3433				 * Keep the first 32 indexes free for IOAPIC
3434				 * interrupts.
3435				 */
3436				table->min_index = 32;
3437				for (i = 0; i < 32; ++i)
3438					iommu->irte_ops->set_allocated(table, i);
3439			}
3440			WARN_ON(table->min_index != 32);
3441			index = info->ioapic.pin;
3442		} else {
3443			index = -ENOMEM;
3444		}
3445	} else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
3446		   info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
3447		bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
3448
3449		index = alloc_irq_index(iommu, devid, nr_irqs, align,
3450					msi_desc_to_pci_dev(info->desc));
3451	} else {
3452		index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
3453	}
3454
3455	if (index < 0) {
3456		pr_warn("Failed to allocate IRTE\n");
3457		ret = index;
3458		goto out_free_parent;
3459	}
3460
3461	for (i = 0; i < nr_irqs; i++) {
3462		irq_data = irq_domain_get_irq_data(domain, virq + i);
3463		cfg = irq_data ? irqd_cfg(irq_data) : NULL;
3464		if (!cfg) {
3465			ret = -EINVAL;
3466			goto out_free_data;
3467		}
3468
3469		ret = -ENOMEM;
3470		data = kzalloc(sizeof(*data), GFP_KERNEL);
3471		if (!data)
3472			goto out_free_data;
3473
3474		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
3475			data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
3476		else
3477			data->entry = kzalloc(sizeof(struct irte_ga),
3478						     GFP_KERNEL);
3479		if (!data->entry) {
3480			kfree(data);
3481			goto out_free_data;
3482		}
3483
3484		data->iommu = iommu;
3485		irq_data->hwirq = (devid << 16) + i;
3486		irq_data->chip_data = data;
3487		irq_data->chip = &amd_ir_chip;
3488		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
3489		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
3490	}
3491
3492	return 0;
3493
3494out_free_data:
3495	for (i--; i >= 0; i--) {
3496		irq_data = irq_domain_get_irq_data(domain, virq + i);
3497		if (irq_data)
3498			kfree(irq_data->chip_data);
3499	}
3500	for (i = 0; i < nr_irqs; i++)
3501		free_irte(iommu, devid, index + i);
3502out_free_parent:
3503	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3504	return ret;
3505}
3506
3507static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
3508			       unsigned int nr_irqs)
3509{
3510	struct irq_2_irte *irte_info;
3511	struct irq_data *irq_data;
3512	struct amd_ir_data *data;
3513	int i;
3514
3515	for (i = 0; i < nr_irqs; i++) {
3516		irq_data = irq_domain_get_irq_data(domain, virq  + i);
3517		if (irq_data && irq_data->chip_data) {
3518			data = irq_data->chip_data;
3519			irte_info = &data->irq_2_irte;
3520			free_irte(data->iommu, irte_info->devid, irte_info->index);
3521			kfree(data->entry);
3522			kfree(data);
3523		}
3524	}
3525	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3526}
3527
3528static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3529			       struct amd_ir_data *ir_data,
3530			       struct irq_2_irte *irte_info,
3531			       struct irq_cfg *cfg);
3532
3533static int irq_remapping_activate(struct irq_domain *domain,
3534				  struct irq_data *irq_data, bool reserve)
3535{
3536	struct amd_ir_data *data = irq_data->chip_data;
3537	struct irq_2_irte *irte_info = &data->irq_2_irte;
3538	struct amd_iommu *iommu = data->iommu;
3539	struct irq_cfg *cfg = irqd_cfg(irq_data);
3540
3541	if (!iommu)
3542		return 0;
3543
3544	iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
3545				  irte_info->index);
3546	amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
3547	return 0;
3548}
3549
3550static void irq_remapping_deactivate(struct irq_domain *domain,
3551				     struct irq_data *irq_data)
3552{
3553	struct amd_ir_data *data = irq_data->chip_data;
3554	struct irq_2_irte *irte_info = &data->irq_2_irte;
3555	struct amd_iommu *iommu = data->iommu;
3556
3557	if (iommu)
3558		iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
3559					    irte_info->index);
3560}
3561
3562static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
3563				enum irq_domain_bus_token bus_token)
3564{
3565	struct amd_iommu *iommu;
3566	int devid = -1;
3567
3568	if (!amd_iommu_irq_remap)
3569		return 0;
3570
3571	if (x86_fwspec_is_ioapic(fwspec))
3572		devid = get_ioapic_devid(fwspec->param[0]);
3573	else if (x86_fwspec_is_hpet(fwspec))
3574		devid = get_hpet_devid(fwspec->param[0]);
3575
3576	if (devid < 0)
3577		return 0;
3578	iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff));
3579
3580	return iommu && iommu->ir_domain == d;
3581}
3582
3583static const struct irq_domain_ops amd_ir_domain_ops = {
3584	.select = irq_remapping_select,
3585	.alloc = irq_remapping_alloc,
3586	.free = irq_remapping_free,
3587	.activate = irq_remapping_activate,
3588	.deactivate = irq_remapping_deactivate,
3589};
3590
3591int amd_iommu_activate_guest_mode(void *data)
3592{
3593	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3594	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3595	u64 valid;
3596
3597	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
3598		return 0;
3599
3600	valid = entry->lo.fields_vapic.valid;
3601
3602	entry->lo.val = 0;
3603	entry->hi.val = 0;
3604
3605	entry->lo.fields_vapic.valid       = valid;
3606	entry->lo.fields_vapic.guest_mode  = 1;
3607	entry->lo.fields_vapic.ga_log_intr = 1;
3608	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
3609	entry->hi.fields.vector            = ir_data->ga_vector;
3610	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
3611
3612	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3613			      ir_data->irq_2_irte.index, entry);
3614}
3615EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
3616
3617int amd_iommu_deactivate_guest_mode(void *data)
3618{
3619	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3620	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3621	struct irq_cfg *cfg = ir_data->cfg;
3622	u64 valid;
3623
3624	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3625	    !entry || !entry->lo.fields_vapic.guest_mode)
3626		return 0;
3627
3628	valid = entry->lo.fields_remap.valid;
3629
3630	entry->lo.val = 0;
3631	entry->hi.val = 0;
3632
3633	entry->lo.fields_remap.valid       = valid;
3634	entry->lo.fields_remap.dm          = apic->dest_mode_logical;
3635	entry->lo.fields_remap.int_type    = APIC_DELIVERY_MODE_FIXED;
3636	entry->hi.fields.vector            = cfg->vector;
3637	entry->lo.fields_remap.destination =
3638				APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
3639	entry->hi.fields.destination =
3640				APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
3641
3642	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3643			      ir_data->irq_2_irte.index, entry);
3644}
3645EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
3646
3647static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
3648{
3649	int ret;
3650	struct amd_iommu_pi_data *pi_data = vcpu_info;
3651	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
3652	struct amd_ir_data *ir_data = data->chip_data;
3653	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3654	struct iommu_dev_data *dev_data;
3655
3656	if (ir_data->iommu == NULL)
3657		return -EINVAL;
3658
3659	dev_data = search_dev_data(ir_data->iommu, irte_info->devid);
3660
3661	/* Note:
3662	 * This device has never been set up for guest mode,
3663	 * so we should not modify the IRTE.
3664	 */
3665	if (!dev_data || !dev_data->use_vapic)
3666		return 0;
3667
3668	ir_data->cfg = irqd_cfg(data);
3669	pi_data->ir_data = ir_data;
3670
3671	/* Note:
3672	 * SVM tries to set up for VAPIC mode, but we are in
3673	 * legacy mode. So, we force legacy mode instead.
3674	 */
3675	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
3676		pr_debug("%s: Fall back to using intr legacy remap\n",
3677			 __func__);
3678		pi_data->is_guest_mode = false;
3679	}
3680
3681	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
3682	if (pi_data->is_guest_mode) {
3683		ir_data->ga_root_ptr = (pi_data->base >> 12);
3684		ir_data->ga_vector = vcpu_pi_info->vector;
3685		ir_data->ga_tag = pi_data->ga_tag;
3686		ret = amd_iommu_activate_guest_mode(ir_data);
3687		if (!ret)
3688			ir_data->cached_ga_tag = pi_data->ga_tag;
3689	} else {
3690		ret = amd_iommu_deactivate_guest_mode(ir_data);
3691
3692		/*
3693		 * This communicates the ga_tag back to the caller
3694		 * so that it can do all the necessary clean up.
3695		 */
3696		if (!ret)
3697			ir_data->cached_ga_tag = 0;
3698	}
3699
3700	return ret;
3701}
3702
3703
3704static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3705			       struct amd_ir_data *ir_data,
3706			       struct irq_2_irte *irte_info,
3707			       struct irq_cfg *cfg)
3708{
3709
3710	/*
3711	 * Atomically updates the IRTE with the new destination, vector
3712	 * and flushes the interrupt entry cache.
3713	 */
3714	iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
3715				      irte_info->index, cfg->vector,
3716				      cfg->dest_apicid);
3717}
3718
3719static int amd_ir_set_affinity(struct irq_data *data,
3720			       const struct cpumask *mask, bool force)
3721{
3722	struct amd_ir_data *ir_data = data->chip_data;
3723	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3724	struct irq_cfg *cfg = irqd_cfg(data);
3725	struct irq_data *parent = data->parent_data;
3726	struct amd_iommu *iommu = ir_data->iommu;
3727	int ret;
3728
3729	if (!iommu)
3730		return -ENODEV;
3731
3732	ret = parent->chip->irq_set_affinity(parent, mask, force);
3733	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
3734		return ret;
3735
3736	amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
3737	/*
3738	 * After this point, all the interrupts will start arriving
3739	 * at the new destination. So, time to cleanup the previous
3740	 * vector allocation.
3741	 */
3742	vector_schedule_cleanup(cfg);
3743
3744	return IRQ_SET_MASK_OK_DONE;
3745}
3746
3747static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
3748{
3749	struct amd_ir_data *ir_data = irq_data->chip_data;
3750
3751	*msg = ir_data->msi_entry;
3752}
3753
3754static struct irq_chip amd_ir_chip = {
3755	.name			= "AMD-IR",
3756	.irq_ack		= apic_ack_irq,
3757	.irq_set_affinity	= amd_ir_set_affinity,
3758	.irq_set_vcpu_affinity	= amd_ir_set_vcpu_affinity,
3759	.irq_compose_msi_msg	= ir_compose_msi_msg,
3760};
3761
3762static const struct msi_parent_ops amdvi_msi_parent_ops = {
3763	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI,
3764	.prefix			= "IR-",
3765	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3766};
3767
3768int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
3769{
3770	struct fwnode_handle *fn;
3771
3772	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
3773	if (!fn)
3774		return -ENOMEM;
3775	iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0,
3776						       fn, &amd_ir_domain_ops, iommu);
3777	if (!iommu->ir_domain) {
3778		irq_domain_free_fwnode(fn);
3779		return -ENOMEM;
3780	}
3781
3782	irq_domain_update_bus_token(iommu->ir_domain,  DOMAIN_BUS_AMDVI);
3783	iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
3784				   IRQ_DOMAIN_FLAG_ISOLATED_MSI;
3785	iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops;
3786
3787	return 0;
3788}
3789
3790int amd_iommu_update_ga(int cpu, bool is_run, void *data)
3791{
3792	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3793	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3794
3795	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3796	    !entry || !entry->lo.fields_vapic.guest_mode)
3797		return 0;
3798
3799	if (!ir_data->iommu)
3800		return -ENODEV;
3801
3802	if (cpu >= 0) {
3803		entry->lo.fields_vapic.destination =
3804					APICID_TO_IRTE_DEST_LO(cpu);
3805		entry->hi.fields.destination =
3806					APICID_TO_IRTE_DEST_HI(cpu);
3807	}
3808	entry->lo.fields_vapic.is_run = is_run;
3809
3810	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3811				ir_data->irq_2_irte.index, entry);
3812}
3813EXPORT_SYMBOL(amd_iommu_update_ga);
3814#endif
v6.8
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   4 * Author: Joerg Roedel <jroedel@suse.de>
   5 *         Leo Duran <leo.duran@amd.com>
   6 */
   7
   8#define pr_fmt(fmt)     "AMD-Vi: " fmt
   9#define dev_fmt(fmt)    pr_fmt(fmt)
  10
  11#include <linux/ratelimit.h>
  12#include <linux/pci.h>
  13#include <linux/acpi.h>
  14#include <linux/pci-ats.h>
  15#include <linux/bitmap.h>
  16#include <linux/slab.h>
  17#include <linux/debugfs.h>
  18#include <linux/scatterlist.h>
  19#include <linux/dma-map-ops.h>
  20#include <linux/dma-direct.h>
 
  21#include <linux/iommu-helper.h>
  22#include <linux/delay.h>
  23#include <linux/amd-iommu.h>
  24#include <linux/notifier.h>
  25#include <linux/export.h>
  26#include <linux/irq.h>
  27#include <linux/msi.h>
  28#include <linux/irqdomain.h>
  29#include <linux/percpu.h>
  30#include <linux/io-pgtable.h>
  31#include <linux/cc_platform.h>
  32#include <asm/irq_remapping.h>
  33#include <asm/io_apic.h>
  34#include <asm/apic.h>
  35#include <asm/hw_irq.h>
  36#include <asm/proto.h>
  37#include <asm/iommu.h>
  38#include <asm/gart.h>
  39#include <asm/dma.h>
  40#include <uapi/linux/iommufd.h>
  41
  42#include "amd_iommu.h"
  43#include "../dma-iommu.h"
  44#include "../irq_remapping.h"
  45
  46#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
  47
  48/* IO virtual address start page frame number */
  49#define IOVA_START_PFN		(1)
  50#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
  51
  52/* Reserved IOVA ranges */
  53#define MSI_RANGE_START		(0xfee00000)
  54#define MSI_RANGE_END		(0xfeefffff)
  55#define HT_RANGE_START		(0xfd00000000ULL)
  56#define HT_RANGE_END		(0xffffffffffULL)
  57
  58#define DEFAULT_PGTABLE_LEVEL	PAGE_MODE_3_LEVEL
  59
  60static DEFINE_SPINLOCK(pd_bitmap_lock);
  61
  62LIST_HEAD(ioapic_map);
  63LIST_HEAD(hpet_map);
  64LIST_HEAD(acpihid_map);
  65
  66const struct iommu_ops amd_iommu_ops;
  67static const struct iommu_dirty_ops amd_dirty_ops;
  68
  69int amd_iommu_max_glx_val = -1;
  70
  71/*
  72 * general struct to manage commands sent to an IOMMU
  73 */
  74struct iommu_cmd {
  75	u32 data[4];
  76};
  77
  78struct kmem_cache *amd_iommu_irq_cache;
  79
  80static void detach_device(struct device *dev);
  81
  82/****************************************************************************
  83 *
  84 * Helper functions
  85 *
  86 ****************************************************************************/
  87
  88static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
  89{
  90	return (pdom && (pdom->flags & PD_IOMMUV2_MASK));
  91}
  92
  93static inline int get_acpihid_device_id(struct device *dev,
  94					struct acpihid_map_entry **entry)
  95{
  96	struct acpi_device *adev = ACPI_COMPANION(dev);
  97	struct acpihid_map_entry *p;
  98
  99	if (!adev)
 100		return -ENODEV;
 101
 102	list_for_each_entry(p, &acpihid_map, list) {
 103		if (acpi_dev_hid_uid_match(adev, p->hid,
 104					   p->uid[0] ? p->uid : NULL)) {
 105			if (entry)
 106				*entry = p;
 107			return p->devid;
 108		}
 109	}
 110	return -EINVAL;
 111}
 112
 113static inline int get_device_sbdf_id(struct device *dev)
 114{
 115	int sbdf;
 116
 117	if (dev_is_pci(dev))
 118		sbdf = get_pci_sbdf_id(to_pci_dev(dev));
 119	else
 120		sbdf = get_acpihid_device_id(dev, NULL);
 121
 122	return sbdf;
 123}
 124
 125struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
 126{
 127	struct dev_table_entry *dev_table;
 128	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 129
 130	BUG_ON(pci_seg == NULL);
 131	dev_table = pci_seg->dev_table;
 132	BUG_ON(dev_table == NULL);
 133
 134	return dev_table;
 135}
 136
 137static inline u16 get_device_segment(struct device *dev)
 138{
 139	u16 seg;
 140
 141	if (dev_is_pci(dev)) {
 142		struct pci_dev *pdev = to_pci_dev(dev);
 143
 144		seg = pci_domain_nr(pdev->bus);
 145	} else {
 146		u32 devid = get_acpihid_device_id(dev, NULL);
 147
 148		seg = PCI_SBDF_TO_SEGID(devid);
 149	}
 150
 151	return seg;
 152}
 153
 154/* Writes the specific IOMMU for a device into the PCI segment rlookup table */
 155void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
 156{
 157	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 158
 159	pci_seg->rlookup_table[devid] = iommu;
 160}
 161
 162static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
 163{
 164	struct amd_iommu_pci_seg *pci_seg;
 165
 166	for_each_pci_segment(pci_seg) {
 167		if (pci_seg->id == seg)
 168			return pci_seg->rlookup_table[devid];
 169	}
 170	return NULL;
 171}
 172
 173static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
 174{
 175	u16 seg = get_device_segment(dev);
 176	int devid = get_device_sbdf_id(dev);
 177
 178	if (devid < 0)
 179		return NULL;
 180	return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
 181}
 182
 183static struct protection_domain *to_pdomain(struct iommu_domain *dom)
 184{
 185	return container_of(dom, struct protection_domain, domain);
 186}
 187
 188static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
 189{
 190	struct iommu_dev_data *dev_data;
 191	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 192
 193	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
 194	if (!dev_data)
 195		return NULL;
 196
 197	spin_lock_init(&dev_data->lock);
 198	dev_data->devid = devid;
 199	ratelimit_default_init(&dev_data->rs);
 200
 201	llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
 202	return dev_data;
 203}
 204
 205static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
 206{
 207	struct iommu_dev_data *dev_data;
 208	struct llist_node *node;
 209	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 210
 211	if (llist_empty(&pci_seg->dev_data_list))
 212		return NULL;
 213
 214	node = pci_seg->dev_data_list.first;
 215	llist_for_each_entry(dev_data, node, dev_data_list) {
 216		if (dev_data->devid == devid)
 217			return dev_data;
 218	}
 219
 220	return NULL;
 221}
 222
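/*
 * Copy the DTE of the original device to the alias devid so requests
 * issued under the alias are translated with the same settings.
 */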
 223static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
 224{
 225	struct amd_iommu *iommu;
 226	struct dev_table_entry *dev_table;
 227	u16 devid = pci_dev_id(pdev);
 228
 229	if (devid == alias)
 230		return 0;
 231
 232	iommu = rlookup_amd_iommu(&pdev->dev);
 233	if (!iommu)
 234		return 0;
 235
 236	amd_iommu_set_rlookup_table(iommu, alias);
 237	dev_table = get_dev_table(iommu);
 238	memcpy(dev_table[alias].data,
 239	       dev_table[devid].data,
 240	       sizeof(dev_table[alias].data));
 241
 242	return 0;
 243}
 244
 245static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
 246{
 247	struct pci_dev *pdev;
 248
 249	if (!dev_is_pci(dev))
 250		return;
 251	pdev = to_pci_dev(dev);
 252
 253	/*
 254	 * The IVRS alias stored in the alias table may not be
 255	 * part of the PCI DMA aliases if its bus differs
 256	 * from the original device.
 257	 */
 258	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);
 259
 260	pci_for_each_dma_alias(pdev, clone_alias, NULL);
 261}
 262
 263static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
 264{
 265	struct pci_dev *pdev = to_pci_dev(dev);
 266	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 267	u16 ivrs_alias;
 268
 269	/* For ACPI HID devices, there are no aliases */
 270	if (!dev_is_pci(dev))
 271		return;
 272
 273	/*
 274	 * Add the IVRS alias to the pci aliases if it is on the same
 275	 * bus. The IVRS table may know about a quirk that we don't.
 276	 */
 277	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
 278	if (ivrs_alias != pci_dev_id(pdev) &&
 279	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
 280		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
 281
 282	clone_aliases(iommu, dev);
 283}
 284
 285static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
 286{
 287	struct iommu_dev_data *dev_data;
 288
 289	dev_data = search_dev_data(iommu, devid);
 290
 291	if (dev_data == NULL) {
 292		dev_data = alloc_dev_data(iommu, devid);
 293		if (!dev_data)
 294			return NULL;
 295
 296		if (translation_pre_enabled(iommu))
 297			dev_data->defer_attach = true;
 298	}
 299
 300	return dev_data;
 301}
 302
 303/*
 304* Find or create an IOMMU group for an acpihid device.
 305*/
 306static struct iommu_group *acpihid_device_group(struct device *dev)
 307{
 308	struct acpihid_map_entry *p, *entry = NULL;
 309	int devid;
 310
 311	devid = get_acpihid_device_id(dev, &entry);
 312	if (devid < 0)
 313		return ERR_PTR(devid);
 314
 315	list_for_each_entry(p, &acpihid_map, list) {
 316		if ((devid == p->devid) && p->group)
 317			entry->group = p->group;
 318	}
 319
 320	if (!entry->group)
 321		entry->group = generic_device_group(dev);
 322	else
 323		iommu_group_ref_get(entry->group);
 324
 325	return entry->group;
 326}
 327
 328static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data)
 329{
 330	return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP);
 331}
 332
 333static u32 pdev_get_caps(struct pci_dev *pdev)
 334{
 335	int features;
 336	u32 flags = 0;
 337
 338	if (pci_ats_supported(pdev))
 339		flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
 340
 341	if (pci_pri_supported(pdev))
 342		flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
 343
 344	features = pci_pasid_features(pdev);
 345	if (features >= 0) {
 346		flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
 347
 348		if (features & PCI_PASID_CAP_EXEC)
 349			flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
 350
 351		if (features & PCI_PASID_CAP_PRIV)
 352			flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
 353	}
 354
 355	return flags;
 356}
 357
 358static inline int pdev_enable_cap_ats(struct pci_dev *pdev)
 359{
 360	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 361	int ret = -EINVAL;
 362
 363	if (dev_data->ats_enabled)
 364		return 0;
 365
 366	if (amd_iommu_iotlb_sup &&
 367	    (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) {
 368		ret = pci_enable_ats(pdev, PAGE_SHIFT);
 369		if (!ret) {
 370			dev_data->ats_enabled = 1;
 371			dev_data->ats_qdep    = pci_ats_queue_depth(pdev);
 372		}
 373	}
 374
 375	return ret;
 376}
 377
 378static inline void pdev_disable_cap_ats(struct pci_dev *pdev)
 379{
 380	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 381
 382	if (dev_data->ats_enabled) {
 383		pci_disable_ats(pdev);
 384		dev_data->ats_enabled = 0;
 385	}
 386}
 387
 388int amd_iommu_pdev_enable_cap_pri(struct pci_dev *pdev)
 389{
 390	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 391	int ret = -EINVAL;
 392
 393	if (dev_data->pri_enabled)
 394		return 0;
 395
 396	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) {
 397		/*
 398		 * First reset the PRI state of the device.
 399		 * FIXME: Hardcode number of outstanding requests for now
 400		 */
 401		if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) {
 402			dev_data->pri_enabled = 1;
 403			dev_data->pri_tlp     = pci_prg_resp_pasid_required(pdev);
 404
 405			ret = 0;
 406		}
 407	}
 408
 409	return ret;
 410}
 411
 412void amd_iommu_pdev_disable_cap_pri(struct pci_dev *pdev)
 413{
 414	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 415
 416	if (dev_data->pri_enabled) {
 417		pci_disable_pri(pdev);
 418		dev_data->pri_enabled = 0;
 419	}
 420}
 421
 422static inline int pdev_enable_cap_pasid(struct pci_dev *pdev)
 423{
 424	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 425	int ret = -EINVAL;
 426
 427	if (dev_data->pasid_enabled)
 428		return 0;
 429
 430	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) {
 431		/* Only allow access to user-accessible pages */
 432		ret = pci_enable_pasid(pdev, 0);
 433		if (!ret)
 434			dev_data->pasid_enabled = 1;
 435	}
 436
 437	return ret;
 438}
 439
 440static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
 441{
 442	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 443
 444	if (dev_data->pasid_enabled) {
 445		pci_disable_pasid(pdev);
 446		dev_data->pasid_enabled = 0;
 447	}
 448}
 449
 450static void pdev_enable_caps(struct pci_dev *pdev)
 451{
 452	pdev_enable_cap_ats(pdev);
 453	pdev_enable_cap_pasid(pdev);
 454	amd_iommu_pdev_enable_cap_pri(pdev);
 455
 456}
 457
 458static void pdev_disable_caps(struct pci_dev *pdev)
 459{
 460	pdev_disable_cap_ats(pdev);
 461	pdev_disable_cap_pasid(pdev);
 462	amd_iommu_pdev_disable_cap_pri(pdev);
 463}
 464
 465/*
 466 * This function checks if the driver got a valid device from the caller to
 467 * avoid dereferencing invalid pointers.
 468 */
 469static bool check_device(struct device *dev)
 470{
 471	struct amd_iommu_pci_seg *pci_seg;
 472	struct amd_iommu *iommu;
 473	int devid, sbdf;
 474
 475	if (!dev)
 476		return false;
 477
 478	sbdf = get_device_sbdf_id(dev);
 479	if (sbdf < 0)
 480		return false;
 481	devid = PCI_SBDF_TO_DEVID(sbdf);
 482
 483	iommu = rlookup_amd_iommu(dev);
 484	if (!iommu)
 485		return false;
 486
 487	/* Out of our scope? */
 488	pci_seg = iommu->pci_seg;
 489	if (devid > pci_seg->last_bdf)
 490		return false;
 491
 492	return true;
 493}
 494
 495static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
 496{
 497	struct iommu_dev_data *dev_data;
 498	int devid, sbdf;
 499
 500	if (dev_iommu_priv_get(dev))
 501		return 0;
 502
 503	sbdf = get_device_sbdf_id(dev);
 504	if (sbdf < 0)
 505		return sbdf;
 506
 507	devid = PCI_SBDF_TO_DEVID(sbdf);
 508	dev_data = find_dev_data(iommu, devid);
 509	if (!dev_data)
 510		return -ENOMEM;
 511
 512	dev_data->dev = dev;
 513	setup_aliases(iommu, dev);
 514
 515	/*
 516	 * By default we use passthrough mode for IOMMUv2-capable devices.
 517	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
 518	 * invalid address), we ignore the capability for the device so
 519	 * it'll be forced to go into translation mode.
 520	 */
 521	if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
 522	    dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) {
 523		dev_data->flags = pdev_get_caps(to_pci_dev(dev));
 524	}
 525
 526	dev_iommu_priv_set(dev, dev_data);
 527
 528	return 0;
 529}
 530
 531static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
 532{
 533	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 534	struct dev_table_entry *dev_table = get_dev_table(iommu);
 535	int devid, sbdf;
 536
 537	sbdf = get_device_sbdf_id(dev);
 538	if (sbdf < 0)
 539		return;
 540
 541	devid = PCI_SBDF_TO_DEVID(sbdf);
 542	pci_seg->rlookup_table[devid] = NULL;
 543	memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
 544
 545	setup_aliases(iommu, dev);
 546}
 547
 548static void amd_iommu_uninit_device(struct device *dev)
 549{
 550	struct iommu_dev_data *dev_data;
 551
 552	dev_data = dev_iommu_priv_get(dev);
 553	if (!dev_data)
 554		return;
 555
 556	if (dev_data->domain)
 557		detach_device(dev);
 558
 559	/*
 560	 * We keep dev_data around for unplugged devices and reuse it when the
 561	 * device is re-plugged - not doing so would introduce a ton of races.
 562	 */
 563}
 564
 565/****************************************************************************
 566 *
 567 * Interrupt handling functions
 568 *
 569 ****************************************************************************/
 570
 571static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
 572{
 573	int i;
 574	struct dev_table_entry *dev_table = get_dev_table(iommu);
 575
 576	for (i = 0; i < 4; ++i)
 577		pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
 578}
 579
 580static void dump_command(unsigned long phys_addr)
 581{
 582	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
 583	int i;
 584
 585	for (i = 0; i < 4; ++i)
 586		pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
 587}
 588
 589static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
 590{
 591	struct iommu_dev_data *dev_data = NULL;
 592	int devid, vmg_tag, flags;
 593	struct pci_dev *pdev;
 594	u64 spa;
 595
 596	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
 597	vmg_tag = (event[1]) & 0xFFFF;
 598	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 599	spa     = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
 600
 601	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
 602					   devid & 0xff);
 603	if (pdev)
 604		dev_data = dev_iommu_priv_get(&pdev->dev);
 605
 606	if (dev_data) {
 607		if (__ratelimit(&dev_data->rs)) {
 608			pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
 609				vmg_tag, spa, flags);
 610		}
 611	} else {
 612		pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
 613			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 614			vmg_tag, spa, flags);
 615	}
 616
 617	if (pdev)
 618		pci_dev_put(pdev);
 619}
 620
 621static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
 622{
 623	struct iommu_dev_data *dev_data = NULL;
 624	int devid, flags_rmp, vmg_tag, flags;
 625	struct pci_dev *pdev;
 626	u64 gpa;
 627
 628	devid     = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
 629	flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
 630	vmg_tag   = (event[1]) & 0xFFFF;
 631	flags     = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 632	gpa       = ((u64)event[3] << 32) | event[2];
 633
 634	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
 635					   devid & 0xff);
 636	if (pdev)
 637		dev_data = dev_iommu_priv_get(&pdev->dev);
 638
 639	if (dev_data) {
 640		if (__ratelimit(&dev_data->rs)) {
 641			pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
 642				vmg_tag, gpa, flags_rmp, flags);
 643		}
 644	} else {
 645		pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
 646			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 647			vmg_tag, gpa, flags_rmp, flags);
 648	}
 649
 650	if (pdev)
 651		pci_dev_put(pdev);
 652}
 653
 654#define IS_IOMMU_MEM_TRANSACTION(flags)		\
 655	(((flags) & EVENT_FLAG_I) == 0)
 656
 657#define IS_WRITE_REQUEST(flags)			\
 658	((flags) & EVENT_FLAG_RW)
 659
 660static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
 661					u16 devid, u16 domain_id,
 662					u64 address, int flags)
 663{
 664	struct iommu_dev_data *dev_data = NULL;
 665	struct pci_dev *pdev;
 666
 667	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
 668					   devid & 0xff);
 669	if (pdev)
 670		dev_data = dev_iommu_priv_get(&pdev->dev);
 671
 672	if (dev_data) {
 673		/*
 674		 * If this is a DMA fault (for which the I(nterrupt)
 675		 * bit will be unset), allow report_iommu_fault() to
 676		 * prevent logging it.
 677		 */
 678		if (IS_IOMMU_MEM_TRANSACTION(flags)) {
 679			/* Device not attached to domain properly */
 680			if (dev_data->domain == NULL) {
 681				pr_err_ratelimited("Event logged [Device not attached to domain properly]\n");
 682				pr_err_ratelimited("  device=%04x:%02x:%02x.%x domain=0x%04x\n",
 683						   iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
 684						   PCI_FUNC(devid), domain_id);
 685				goto out;
 686			}
 687
 688			if (!report_iommu_fault(&dev_data->domain->domain,
 689						&pdev->dev, address,
 690						IS_WRITE_REQUEST(flags) ?
 691							IOMMU_FAULT_WRITE :
 692							IOMMU_FAULT_READ))
 693				goto out;
 694		}
 695
 696		if (__ratelimit(&dev_data->rs)) {
 697			pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
 698				domain_id, address, flags);
 699		}
 700	} else {
 701		pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
 702			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 703			domain_id, address, flags);
 704	}
 705
 706out:
 707	if (pdev)
 708		pci_dev_put(pdev);
 709}
 710
 711static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
 712{
 713	struct device *dev = iommu->iommu.dev;
 714	int type, devid, flags, tag;
 715	volatile u32 *event = __evt;
 716	int count = 0;
 717	u64 address;
 718	u32 pasid;
 719
 720retry:
 721	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
 722	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
 723	pasid   = (event[0] & EVENT_DOMID_MASK_HI) |
 724		  (event[1] & EVENT_DOMID_MASK_LO);
 725	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 726	address = (u64)(((u64)event[3]) << 32) | event[2];
 727
 728	if (type == 0) {
 729		/* Did we hit the erratum? */
 730		if (++count == LOOP_TIMEOUT) {
 731			pr_err("No event written to event log\n");
 732			return;
 733		}
 734		udelay(1);
 735		goto retry;
 736	}
 737
 738	if (type == EVENT_TYPE_IO_FAULT) {
 739		amd_iommu_report_page_fault(iommu, devid, pasid, address, flags);
 740		return;
 741	}
 742
 743	switch (type) {
 744	case EVENT_TYPE_ILL_DEV:
 745		dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
 746			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 747			pasid, address, flags);
 748		dump_dte_entry(iommu, devid);
 749		break;
 750	case EVENT_TYPE_DEV_TAB_ERR:
 751		dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
 752			"address=0x%llx flags=0x%04x]\n",
 753			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 754			address, flags);
 755		break;
 756	case EVENT_TYPE_PAGE_TAB_ERR:
 757		dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
 758			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 759			pasid, address, flags);
 760		break;
 761	case EVENT_TYPE_ILL_CMD:
 762		dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
 763		dump_command(address);
 764		break;
 765	case EVENT_TYPE_CMD_HARD_ERR:
 766		dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
 767			address, flags);
 768		break;
 769	case EVENT_TYPE_IOTLB_INV_TO:
 770		dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
 771			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 772			address);
 773		break;
 774	case EVENT_TYPE_INV_DEV_REQ:
 775		dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
 776			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 777			pasid, address, flags);
 778		break;
 779	case EVENT_TYPE_RMP_FAULT:
 780		amd_iommu_report_rmp_fault(iommu, event);
 781		break;
 782	case EVENT_TYPE_RMP_HW_ERR:
 783		amd_iommu_report_rmp_hw_error(iommu, event);
 784		break;
 785	case EVENT_TYPE_INV_PPR_REQ:
 786		pasid = PPR_PASID(*((u64 *)__evt));
 787		tag = event[1] & 0x03FF;
 788		dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
 789			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 790			pasid, address, flags, tag);
 791		break;
 792	default:
 793		dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
 794			event[0], event[1], event[2], event[3]);
 795	}
 796
 797	/*
 798	 * To detect hardware erratum 732 we need to clear the
 799	 * entry back to zero. This issue does not exist on
 800	 * SNP-enabled systems; there the buffer is also not
 801	 * writable.
 802	 */
 803	if (!amd_iommu_snp_en)
 804		memset(__evt, 0, 4 * sizeof(u32));
 805}
 806
 807static void iommu_poll_events(struct amd_iommu *iommu)
 808{
 809	u32 head, tail;
 810
 811	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
 812	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
 813
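	/* Consume all pending entries between head and tail, then publish the new head to the hardware. */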
 814	while (head != tail) {
 815		iommu_print_event(iommu, iommu->evt_buf + head);
 816		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
 817	}
 818
 819	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
 820}
 821
 822static void iommu_poll_ppr_log(struct amd_iommu *iommu)
 823{
 824	u32 head, tail;
 825
 826	if (iommu->ppr_log == NULL)
 827		return;
 828
 829	head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
 830	tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
 831
 832	while (head != tail) {
 833		volatile u64 *raw;
 834		u64 entry[2];
 835		int i;
 836
 837		raw = (u64 *)(iommu->ppr_log + head);
 838
 839		/*
 840		 * Hardware bug: Interrupt may arrive before the entry is
 841		 * written to memory. If this happens we need to wait for the
 842		 * entry to arrive.
 843		 */
 844		for (i = 0; i < LOOP_TIMEOUT; ++i) {
 845			if (PPR_REQ_TYPE(raw[0]) != 0)
 846				break;
 847			udelay(1);
 848		}
 849
 850		/* Avoid memcpy function-call overhead */
 851		entry[0] = raw[0];
 852		entry[1] = raw[1];
 853
 854		/*
 855		 * To detect hardware erratum 733 we need to clear the
 856		 * entry back to zero. This issue does not exist on
 857		 * SNP-enabled systems; there the buffer is also not
 858		 * writable.
 859		 */
 860		if (!amd_iommu_snp_en)
 861			raw[0] = raw[1] = 0UL;
 862
 863		/* Update head pointer of hardware ring-buffer */
 864		head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
 865		writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
 866
 867		/* TODO: PPR Handler will be added when we add IOPF support */
 868
 869		/* Refresh ring-buffer information */
 870		head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
 871		tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
 872	}
 873}
 874
 875#ifdef CONFIG_IRQ_REMAP
 876static int (*iommu_ga_log_notifier)(u32);
 877
 878int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
 879{
 880	iommu_ga_log_notifier = notifier;
 881
 882	return 0;
 883}
 884EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
 885
 886static void iommu_poll_ga_log(struct amd_iommu *iommu)
 887{
 888	u32 head, tail;
 889
 890	if (iommu->ga_log == NULL)
 891		return;
 892
 893	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
 894	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
 895
 896	while (head != tail) {
 897		volatile u64 *raw;
 898		u64 log_entry;
 899
 900		raw = (u64 *)(iommu->ga_log + head);
 901
 902		/* Avoid memcpy function-call overhead */
 903		log_entry = *raw;
 904
 905		/* Update head pointer of hardware ring-buffer */
 906		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
 907		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
 908
 909		/* Handle GA entry */
 910		switch (GA_REQ_TYPE(log_entry)) {
 911		case GA_GUEST_NR:
 912			if (!iommu_ga_log_notifier)
 913				break;
 914
 915			pr_debug("%s: devid=%#x, ga_tag=%#x\n",
 916				 __func__, GA_DEVID(log_entry),
 917				 GA_TAG(log_entry));
 918
 919			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
 920				pr_err("GA log notifier failed.\n");
 921			break;
 922		default:
 923			break;
 924		}
 925	}
 926}
 927
 928static void
 929amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
 930{
 931	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
 932	    !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
 933		return;
 934
 935	dev_set_msi_domain(dev, iommu->ir_domain);
 936}
 937
 938#else /* CONFIG_IRQ_REMAP */
 939static inline void
 940amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
 941#endif /* !CONFIG_IRQ_REMAP */
 942
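/*
 * Common bottom half for the event/PPR/GA log interrupts: acknowledge the
 * status bits, run the log handler (plus the overflow handler if the log
 * overflowed), then re-read the status to cope with erratum ERBT1312.
 */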
 943static void amd_iommu_handle_irq(void *data, const char *evt_type,
 944				 u32 int_mask, u32 overflow_mask,
 945				 void (*int_handler)(struct amd_iommu *),
 946				 void (*overflow_handler)(struct amd_iommu *))
 947{
 948	struct amd_iommu *iommu = (struct amd_iommu *) data;
 949	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
 950	u32 mask = int_mask | overflow_mask;
 951
 952	while (status & mask) {
 953		/* Enable interrupt sources again */
 954		writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET);
 955
 956		if (int_handler) {
 957			pr_devel("Processing IOMMU (ivhd%d) %s Log\n",
 958				 iommu->index, evt_type);
 959			int_handler(iommu);
 960		}
 961
 962		if ((status & overflow_mask) && overflow_handler)
 963			overflow_handler(iommu);
 964
 965		/*
 966		 * Hardware bug: ERBT1312
 967		 * When re-enabling the interrupt (by writing 1
 968		 * to clear the bit), the hardware might also try to set
 969		 * the interrupt bit in the event status register.
 970		 * In this scenario, the bit will be set again and will
 971		 * disable subsequent interrupts.
 972		 *
 973		 * Workaround: The IOMMU driver should read back the
 974		 * status register and check if the interrupt bits are cleared.
 975		 * If not, the driver needs to go through the interrupt handler
 976		 * again and re-clear the bits.
 977		 */
 978		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
 979	}
 980}
 981
 982irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
 983{
 984	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
 985			     MMIO_STATUS_EVT_OVERFLOW_MASK,
 986			     iommu_poll_events, amd_iommu_restart_event_logging);
 987
 988	return IRQ_HANDLED;
 989}
 990
 991irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
 992{
 993	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
 994			     MMIO_STATUS_PPR_OVERFLOW_MASK,
 995			     iommu_poll_ppr_log, amd_iommu_restart_ppr_log);
 996
 997	return IRQ_HANDLED;
 998}
 999
1000irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
1001{
1002#ifdef CONFIG_IRQ_REMAP
1003	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
1004			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
1005			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
1006#endif
1007
1008	return IRQ_HANDLED;
1009}
1010
1011irqreturn_t amd_iommu_int_thread(int irq, void *data)
1012{
1013	amd_iommu_int_thread_evtlog(irq, data);
1014	amd_iommu_int_thread_pprlog(irq, data);
1015	amd_iommu_int_thread_galog(irq, data);
1016
1017	return IRQ_HANDLED;
1018}
1019
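/* Hard interrupt handler: all log processing is deferred to the threaded handlers above. */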
1020irqreturn_t amd_iommu_int_handler(int irq, void *data)
1021{
1022	return IRQ_WAKE_THREAD;
1023}
1024
1025/****************************************************************************
1026 *
1027 * IOMMU command queuing functions
1028 *
1029 ****************************************************************************/
1030
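/*
 * Busy-wait until the IOMMU has written the expected completion-wait
 * value to cmd_sem (see build_completion_wait() below).
 */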
1031static int wait_on_sem(struct amd_iommu *iommu, u64 data)
1032{
1033	int i = 0;
1034
1035	while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
1036		udelay(1);
1037		i += 1;
1038	}
1039
1040	if (i == LOOP_TIMEOUT) {
1041		pr_alert("Completion-Wait loop timed out\n");
1042		return -EIO;
1043	}
1044
1045	return 0;
1046}
1047
1048static void copy_cmd_to_buffer(struct amd_iommu *iommu,
1049			       struct iommu_cmd *cmd)
1050{
1051	u8 *target;
1052	u32 tail;
1053
1054	/* Copy command to buffer */
1055	tail = iommu->cmd_buf_tail;
1056	target = iommu->cmd_buf + tail;
1057	memcpy(target, cmd, sizeof(*cmd));
1058
1059	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1060	iommu->cmd_buf_tail = tail;
1061
1062	/* Tell the IOMMU about it */
1063	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
1064}
1065
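/*
 * Build a COMPLETION_WAIT command that makes the IOMMU store 'data' to the
 * cmd_sem location once all previously queued commands have completed;
 * wait_on_sem() polls for that value.
 */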
1066static void build_completion_wait(struct iommu_cmd *cmd,
1067				  struct amd_iommu *iommu,
1068				  u64 data)
1069{
1070	u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
1071
1072	memset(cmd, 0, sizeof(*cmd));
1073	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
1074	cmd->data[1] = upper_32_bits(paddr);
1075	cmd->data[2] = lower_32_bits(data);
1076	cmd->data[3] = upper_32_bits(data);
1077	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
1078}
1079
1080static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
1081{
1082	memset(cmd, 0, sizeof(*cmd));
1083	cmd->data[0] = devid;
1084	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
1085}
1086
1087/*
1088 * Builds an invalidation address which is suitable for one page or multiple
1089 * pages. Sets the size bit (S) as needed if more than one page is flushed.
1090 */
1091static inline u64 build_inv_address(u64 address, size_t size)
1092{
1093	u64 pages, end, msb_diff;
1094
1095	pages = iommu_num_pages(address, size, PAGE_SIZE);
1096
1097	if (pages == 1)
1098		return address & PAGE_MASK;
1099
1100	end = address + size - 1;
1101
1102	/*
1103	 * msb_diff would hold the index of the most significant bit that
1104	 * flipped between the start and end.
1105	 */
1106	msb_diff = fls64(end ^ address) - 1;
1107
1108	/*
1109	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
1110	 * between the start and the end, invalidate everything.
1111	 */
1112	if (unlikely(msb_diff > 51)) {
1113		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
1114	} else {
1115		/*
1116		 * The msb-bit must be clear on the address. Just set all the
1117		 * lower bits.
1118		 */
1119		address |= (1ull << msb_diff) - 1;
1120	}
1121
1122	/* Clear bits 11:0 */
1123	address &= PAGE_MASK;
1124
1125	/* Set the size bit - we flush more than one 4kb page */
1126	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
1127}
1128
1129static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
1130				  size_t size, u16 domid,
1131				  ioasid_t pasid, bool gn)
1132{
1133	u64 inv_address = build_inv_address(address, size);
1134
1135	memset(cmd, 0, sizeof(*cmd));
1136
1137	cmd->data[1] |= domid;
1138	cmd->data[2]  = lower_32_bits(inv_address);
1139	cmd->data[3]  = upper_32_bits(inv_address);
1140	/* PDE bit - we want to flush everything, not only the PTEs */
1141	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1142	if (gn) {
1143		cmd->data[0] |= pasid;
1144		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1145	}
1146	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1147}
1148
1149static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
1150				  u64 address, size_t size,
1151				  ioasid_t pasid, bool gn)
1152{
1153	u64 inv_address = build_inv_address(address, size);
1154
1155	memset(cmd, 0, sizeof(*cmd));
1156
1157	cmd->data[0]  = devid;
1158	cmd->data[0] |= (qdep & 0xff) << 24;
1159	cmd->data[1]  = devid;
1160	cmd->data[2]  = lower_32_bits(inv_address);
1161	cmd->data[3]  = upper_32_bits(inv_address);
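	/* For PASID-tagged invalidations, PASID[15:8] goes into data[0] and PASID[7:0] into data[1]. */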
1162	if (gn) {
1163		cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
1164		cmd->data[1] |= (pasid & 0xff) << 16;
1165		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1166	}
1167
1168	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1169}
1170
1171static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1172			       int status, int tag, u8 gn)
1173{
1174	memset(cmd, 0, sizeof(*cmd));
1175
1176	cmd->data[0]  = devid;
1177	if (gn) {
1178		cmd->data[1]  = pasid;
1179		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
1180	}
1181	cmd->data[3]  = tag & 0x1ff;
1182	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
1183
1184	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
1185}
1186
1187static void build_inv_all(struct iommu_cmd *cmd)
1188{
1189	memset(cmd, 0, sizeof(*cmd));
1190	CMD_SET_TYPE(cmd, CMD_INV_ALL);
1191}
1192
1193static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
1194{
1195	memset(cmd, 0, sizeof(*cmd));
1196	cmd->data[0] = devid;
1197	CMD_SET_TYPE(cmd, CMD_INV_IRT);
1198}
1199
1200/*
1201 * Writes the command to the IOMMU's command buffer and informs the
1202 * hardware about the new command.
1203 */
1204static int __iommu_queue_command_sync(struct amd_iommu *iommu,
1205				      struct iommu_cmd *cmd,
1206				      bool sync)
1207{
1208	unsigned int count = 0;
1209	u32 left, next_tail;
1210
1211	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1212again:
1213	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
1214
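	/* Keep headroom: with two or fewer 16-byte command slots (0x20 bytes) left, wait for the hardware to make room. */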
1215	if (left <= 0x20) {
1216		/* Skip udelay() the first time around */
1217		if (count++) {
1218			if (count == LOOP_TIMEOUT) {
1219				pr_err("Command buffer timeout\n");
1220				return -EIO;
1221			}
1222
1223			udelay(1);
1224		}
1225
1226		/* Update head and recheck remaining space */
1227		iommu->cmd_buf_head = readl(iommu->mmio_base +
1228					    MMIO_CMD_HEAD_OFFSET);
1229
1230		goto again;
1231	}
1232
1233	copy_cmd_to_buffer(iommu, cmd);
1234
1235	/* Do we need to make sure all commands are processed? */
1236	iommu->need_sync = sync;
1237
1238	return 0;
1239}
1240
1241static int iommu_queue_command_sync(struct amd_iommu *iommu,
1242				    struct iommu_cmd *cmd,
1243				    bool sync)
1244{
1245	unsigned long flags;
1246	int ret;
1247
1248	raw_spin_lock_irqsave(&iommu->lock, flags);
1249	ret = __iommu_queue_command_sync(iommu, cmd, sync);
1250	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1251
1252	return ret;
1253}
1254
1255static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1256{
1257	return iommu_queue_command_sync(iommu, cmd, true);
1258}
1259
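/*
 * Illustrative usage pattern (as seen elsewhere in this file): queue one
 * or more commands and then synchronize, e.g.
 *
 *	build_inv_dte(&cmd, devid);
 *	iommu_queue_command(iommu, &cmd);
 *	iommu_completion_wait(iommu);
 */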
1260/*
1261 * This function queues a completion wait command into the command
1262 * buffer of an IOMMU
1263 */
1264static int iommu_completion_wait(struct amd_iommu *iommu)
1265{
1266	struct iommu_cmd cmd;
1267	unsigned long flags;
1268	int ret;
1269	u64 data;
1270
1271	if (!iommu->need_sync)
1272		return 0;
1273
1274	data = atomic64_add_return(1, &iommu->cmd_sem_val);
1275	build_completion_wait(&cmd, iommu, data);
1276
1277	raw_spin_lock_irqsave(&iommu->lock, flags);
1278
1279	ret = __iommu_queue_command_sync(iommu, &cmd, false);
1280	if (ret)
1281		goto out_unlock;
1282
1283	ret = wait_on_sem(iommu, data);
1284
1285out_unlock:
1286	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1287
1288	return ret;
1289}
1290
1291static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
1292{
1293	struct iommu_cmd cmd;
1294
1295	build_inv_dte(&cmd, devid);
1296
1297	return iommu_queue_command(iommu, &cmd);
1298}
1299
1300static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
1301{
1302	u32 devid;
1303	u16 last_bdf = iommu->pci_seg->last_bdf;
1304
1305	for (devid = 0; devid <= last_bdf; ++devid)
1306		iommu_flush_dte(iommu, devid);
1307
1308	iommu_completion_wait(iommu);
1309}
1310
1311/*
1312 * This function uses heavy locking and may disable irqs for some time. But
1313 * this is no issue because it is only called during resume.
1314 */
1315static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
1316{
1317	u32 dom_id;
1318	u16 last_bdf = iommu->pci_seg->last_bdf;
1319
1320	for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
1321		struct iommu_cmd cmd;
1322		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1323				      dom_id, IOMMU_NO_PASID, false);
1324		iommu_queue_command(iommu, &cmd);
1325	}
1326
1327	iommu_completion_wait(iommu);
1328}
1329
1330static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
1331{
1332	struct iommu_cmd cmd;
1333
1334	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1335			      dom_id, IOMMU_NO_PASID, false);
1336	iommu_queue_command(iommu, &cmd);
1337
1338	iommu_completion_wait(iommu);
1339}
1340
1341static void amd_iommu_flush_all(struct amd_iommu *iommu)
1342{
1343	struct iommu_cmd cmd;
1344
1345	build_inv_all(&cmd);
1346
1347	iommu_queue_command(iommu, &cmd);
1348	iommu_completion_wait(iommu);
1349}
1350
1351static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1352{
1353	struct iommu_cmd cmd;
1354
1355	build_inv_irt(&cmd, devid);
1356
1357	iommu_queue_command(iommu, &cmd);
1358}
1359
1360static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
1361{
1362	u32 devid;
1363	u16 last_bdf = iommu->pci_seg->last_bdf;
1364
1365	if (iommu->irtcachedis_enabled)
1366		return;
1367
1368	for (devid = 0; devid <= last_bdf; devid++)
1369		iommu_flush_irt(iommu, devid);
1370
1371	iommu_completion_wait(iommu);
1372}
1373
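/*
 * Flush everything the IOMMU caches: a single INVALIDATE_ALL command when
 * the hardware supports it (FEATURE_IA), otherwise flush device table
 * entries, interrupt remapping tables and TLBs individually.
 */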
1374void amd_iommu_flush_all_caches(struct amd_iommu *iommu)
1375{
1376	if (check_feature(FEATURE_IA)) {
1377		amd_iommu_flush_all(iommu);
1378	} else {
1379		amd_iommu_flush_dte_all(iommu);
1380		amd_iommu_flush_irt_all(iommu);
1381		amd_iommu_flush_tlb_all(iommu);
1382	}
1383}
1384
1385/*
1386 * Command send function for flushing on-device TLB
1387 */
1388static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address,
1389			      size_t size, ioasid_t pasid, bool gn)
1390{
1391	struct amd_iommu *iommu;
1392	struct iommu_cmd cmd;
1393	int qdep;
1394
1395	qdep     = dev_data->ats_qdep;
1396	iommu    = rlookup_amd_iommu(dev_data->dev);
1397	if (!iommu)
1398		return -EINVAL;
1399
1400	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address,
1401			      size, pasid, gn);
1402
1403	return iommu_queue_command(iommu, &cmd);
1404}
1405
1406static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
1407{
1408	struct amd_iommu *iommu = data;
1409
1410	return iommu_flush_dte(iommu, alias);
1411}
1412
1413/*
1414 * Command send function for invalidating a device table entry
1415 */
1416static int device_flush_dte(struct iommu_dev_data *dev_data)
1417{
1418	struct amd_iommu *iommu;
1419	struct pci_dev *pdev = NULL;
1420	struct amd_iommu_pci_seg *pci_seg;
1421	u16 alias;
1422	int ret;
1423
1424	iommu = rlookup_amd_iommu(dev_data->dev);
1425	if (!iommu)
1426		return -EINVAL;
1427
1428	if (dev_is_pci(dev_data->dev))
1429		pdev = to_pci_dev(dev_data->dev);
1430
1431	if (pdev)
1432		ret = pci_for_each_dma_alias(pdev,
1433					     device_flush_dte_alias, iommu);
1434	else
1435		ret = iommu_flush_dte(iommu, dev_data->devid);
1436	if (ret)
1437		return ret;
1438
1439	pci_seg = iommu->pci_seg;
1440	alias = pci_seg->alias_table[dev_data->devid];
1441	if (alias != dev_data->devid) {
1442		ret = iommu_flush_dte(iommu, alias);
1443		if (ret)
1444			return ret;
1445	}
1446
1447	if (dev_data->ats_enabled) {
1448		/* Invalidate the entire contents of an IOTLB */
1449		ret = device_flush_iotlb(dev_data, 0, ~0UL,
1450					 IOMMU_NO_PASID, false);
1451	}
1452
1453	return ret;
1454}
1455
1456/*
1457 * TLB invalidation function which is called from the mapping functions.
1458 * It invalidates a single PTE if the range to flush is within a single
1459 * page. Otherwise it flushes the whole TLB of the IOMMU.
1460 */
1461static void __domain_flush_pages(struct protection_domain *domain,
1462				 u64 address, size_t size)
1463{
1464	struct iommu_dev_data *dev_data;
1465	struct iommu_cmd cmd;
1466	int ret = 0, i;
1467	ioasid_t pasid = IOMMU_NO_PASID;
1468	bool gn = false;
1469
1470	if (pdom_is_v2_pgtbl_mode(domain))
1471		gn = true;
1472
1473	build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, gn);
1474
1475	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1476		if (!domain->dev_iommu[i])
1477			continue;
1478
1479		/*
1480		 * Devices of this domain are behind this IOMMU
1481		 * We need a TLB flush
1482		 */
1483		ret |= iommu_queue_command(amd_iommus[i], &cmd);
1484	}
1485
1486	list_for_each_entry(dev_data, &domain->dev_list, list) {
1487
1488		if (!dev_data->ats_enabled)
1489			continue;
1490
1491		ret |= device_flush_iotlb(dev_data, address, size, pasid, gn);
1492	}
1493
1494	WARN_ON(ret);
1495}
1496
1497void amd_iommu_domain_flush_pages(struct protection_domain *domain,
1498				  u64 address, size_t size)
1499{
1500	if (likely(!amd_iommu_np_cache)) {
1501		__domain_flush_pages(domain, address, size);
1502
1503		/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1504		amd_iommu_domain_flush_complete(domain);
1505
1506		return;
1507	}
1508
1509	/*
1510	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
1511	 * In such setups it is best to avoid flushes of ranges which are not
1512	 * naturally aligned, since it would lead to flushes of unmodified
1513	 * PTEs. Such flushes would require the hypervisor to do more work than
1514	 * necessary. Therefore, perform repeated flushes of aligned ranges
1515	 * until you cover the range. Each iteration flushes the smaller
1516	 * between the natural alignment of the address that we flush and the
1517	 * greatest naturally aligned region that fits in the range.
1518	 */
1519	while (size != 0) {
1520		int addr_alignment = __ffs(address);
1521		int size_alignment = __fls(size);
1522		int min_alignment;
1523		size_t flush_size;
1524
1525		/*
1526		 * size is always non-zero, but address might be zero, causing
1527		 * addr_alignment to be negative. As the casting of the
1528		 * argument in __ffs(address) to long might trim the high bits
1529		 * of the address on x86-32, cast to long when doing the check.
1530		 */
1531		if (likely((unsigned long)address != 0))
1532			min_alignment = min(addr_alignment, size_alignment);
1533		else
1534			min_alignment = size_alignment;
1535
1536		flush_size = 1ul << min_alignment;
1537
1538		__domain_flush_pages(domain, address, flush_size);
1539		address += flush_size;
1540		size -= flush_size;
1541	}
1542
1543	/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1544	amd_iommu_domain_flush_complete(domain);
1545}
1546
1547/* Flush the whole IO/TLB for a given protection domain - including PDE */
1548static void amd_iommu_domain_flush_all(struct protection_domain *domain)
1549{
1550	amd_iommu_domain_flush_pages(domain, 0,
1551				     CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
1552}
1553
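/* Wait on every IOMMU that has devices in @domain (or on all IOMMUs if @domain is NULL). */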
1554void amd_iommu_domain_flush_complete(struct protection_domain *domain)
1555{
1556	int i;
1557
1558	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1559		if (domain && !domain->dev_iommu[i])
1560			continue;
1561
1562		/*
1563		 * Devices of this domain are behind this IOMMU
1564		 * We need to wait for completion of all commands.
1565		 */
1566		iommu_completion_wait(amd_iommus[i]);
1567	}
1568}
1569
1570/* Flush the not present cache if it exists */
1571static void domain_flush_np_cache(struct protection_domain *domain,
1572		dma_addr_t iova, size_t size)
1573{
1574	if (unlikely(amd_iommu_np_cache)) {
1575		unsigned long flags;
1576
1577		spin_lock_irqsave(&domain->lock, flags);
1578		amd_iommu_domain_flush_pages(domain, iova, size);
1579		spin_unlock_irqrestore(&domain->lock, flags);
1580	}
1581}
1582
1583
1584/*
1585 * This function flushes the DTEs for all devices in the domain
1586 */
1587static void domain_flush_devices(struct protection_domain *domain)
1588{
1589	struct iommu_dev_data *dev_data;
1590
1591	list_for_each_entry(dev_data, &domain->dev_list, list)
1592		device_flush_dte(dev_data);
1593}
1594
1595/****************************************************************************
1596 *
1597 * The next functions belong to the domain allocation. A domain is
1598 * allocated for every IOMMU as the default domain. If device isolation
1599 * is enabled, every device gets its own domain. The most important thing
1600 * about domains is the page table mapping the DMA address space they
1601 * contain.
1602 *
1603 ****************************************************************************/
1604
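/* Protection domain IDs come from a global bitmap; ID 0 is reserved and never handed out. */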
1605static u16 domain_id_alloc(void)
1606{
1607	int id;
1608
1609	spin_lock(&pd_bitmap_lock);
1610	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1611	BUG_ON(id == 0);
1612	if (id > 0 && id < MAX_DOMAIN_ID)
1613		__set_bit(id, amd_iommu_pd_alloc_bitmap);
1614	else
1615		id = 0;
1616	spin_unlock(&pd_bitmap_lock);
1617
1618	return id;
1619}
1620
1621static void domain_id_free(int id)
1622{
1623	spin_lock(&pd_bitmap_lock);
1624	if (id > 0 && id < MAX_DOMAIN_ID)
1625		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
1626	spin_unlock(&pd_bitmap_lock);
1627}
1628
1629static void free_gcr3_tbl_level1(u64 *tbl)
1630{
1631	u64 *ptr;
1632	int i;
1633
1634	for (i = 0; i < 512; ++i) {
1635		if (!(tbl[i] & GCR3_VALID))
1636			continue;
1637
1638		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1639
1640		free_page((unsigned long)ptr);
1641	}
1642}
1643
1644static void free_gcr3_tbl_level2(u64 *tbl)
1645{
1646	u64 *ptr;
1647	int i;
1648
1649	for (i = 0; i < 512; ++i) {
1650		if (!(tbl[i] & GCR3_VALID))
1651			continue;
1652
1653		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1654
1655		free_gcr3_tbl_level1(ptr);
1656	}
1657}
1658
1659static void free_gcr3_table(struct protection_domain *domain)
1660{
1661	if (domain->glx == 2)
1662		free_gcr3_tbl_level2(domain->gcr3_tbl);
1663	else if (domain->glx == 1)
1664		free_gcr3_tbl_level1(domain->gcr3_tbl);
1665	else
1666		BUG_ON(domain->glx != 0);
1667
1668	free_page((unsigned long)domain->gcr3_tbl);
1669}
1670
1671/*
1672 * Number of GCR3 table levels required. Each level is a 4-Kbyte
1673 * page and can contain up to 512 entries.
1674 */
1675static int get_gcr3_levels(int pasids)
1676{
1677	int levels;
1678
1679	if (pasids == -1)
1680		return amd_iommu_max_glx_val;
1681
1682	levels = get_count_order(pasids);
1683
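	/* e.g. pasids == 1 << 16: get_count_order() == 16, so DIV_ROUND_UP(16, 9) - 1 == 1 extra level */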
1684	return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels;
1685}
1686
1687/* Note: This function expects iommu_domain->lock to be held prior to calling the function. */
1688static int setup_gcr3_table(struct protection_domain *domain, int pasids)
1689{
1690	int levels = get_gcr3_levels(pasids);
1691
1692	if (levels > amd_iommu_max_glx_val)
1693		return -EINVAL;
1694
1695	domain->gcr3_tbl = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
1696	if (domain->gcr3_tbl == NULL)
1697		return -ENOMEM;
1698
1699	domain->glx      = levels;
1700	domain->flags   |= PD_IOMMUV2_MASK;
1701
1702	amd_iommu_domain_update(domain);
1703
1704	return 0;
1705}
1706
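/*
 * Write the device table entry for @devid: host page table root and mode,
 * domain ID and, for V2/GIOV domains, the GCR3 table used for PASID
 * translation.
 */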
1707static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
1708			  struct protection_domain *domain, bool ats, bool ppr)
1709{
1710	u64 pte_root = 0;
1711	u64 flags = 0;
1712	u32 old_domid;
1713	struct dev_table_entry *dev_table = get_dev_table(iommu);
1714
1715	if (domain->iop.mode != PAGE_MODE_NONE)
1716		pte_root = iommu_virt_to_phys(domain->iop.root);
1717
1718	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
1719		    << DEV_ENTRY_MODE_SHIFT;
1720
1721	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
1722
1723	/*
1724	 * When SNP is enabled, only set the TV bit when IOMMU
1725	 * page translation is in use.
1726	 */
1727	if (!amd_iommu_snp_en || (domain->id != 0))
1728		pte_root |= DTE_FLAG_TV;
1729
1730	flags = dev_table[devid].data[1];
1731
1732	if (ats)
1733		flags |= DTE_FLAG_IOTLB;
1734
1735	if (ppr)
1736		pte_root |= 1ULL << DEV_ENTRY_PPR;
1737
1738	if (domain->dirty_tracking)
1739		pte_root |= DTE_FLAG_HAD;
1740
1741	if (domain->flags & PD_IOMMUV2_MASK) {
1742		u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
1743		u64 glx  = domain->glx;
1744		u64 tmp;
1745
1746		pte_root |= DTE_FLAG_GV;
1747		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
1748
1749		/* First mask out possible old values for GCR3 table */
1750		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
1751		flags    &= ~tmp;
1752
1753		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
1754		flags    &= ~tmp;
1755
1756		/* Encode GCR3 table into DTE */
1757		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
1758		pte_root |= tmp;
1759
1760		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
1761		flags    |= tmp;
1762
1763		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
1764		flags    |= tmp;
1765
1766		if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
1767			dev_table[devid].data[2] |=
1768				((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
1769		}
1770
1771		if (domain->flags & PD_GIOV_MASK)
1772			pte_root |= DTE_FLAG_GIOV;
1773	}
1774
1775	flags &= ~DEV_DOMID_MASK;
1776	flags |= domain->id;
1777
1778	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
1779	dev_table[devid].data[1]  = flags;
1780	dev_table[devid].data[0]  = pte_root;
1781
1782	/*
1783	 * A kdump kernel might be replacing a domain ID that was copied from
1784	 * the previous kernel--if so, it needs to flush the translation cache
1785	 * entries for the old domain ID that is being overwritten
1786	 */
1787	if (old_domid) {
1788		amd_iommu_flush_tlb_domid(iommu, old_domid);
1789	}
1790}
1791
1792static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
1793{
1794	struct dev_table_entry *dev_table = get_dev_table(iommu);
1795
1796	/* remove entry from the device table seen by the hardware */
1797	dev_table[devid].data[0]  = DTE_FLAG_V;
1798
1799	if (!amd_iommu_snp_en)
1800		dev_table[devid].data[0] |= DTE_FLAG_TV;
1801
1802	dev_table[devid].data[1] &= DTE_FLAG_MASK;
1803
1804	amd_iommu_apply_erratum_63(iommu, devid);
1805}
1806
1807static void do_attach(struct iommu_dev_data *dev_data,
1808		      struct protection_domain *domain)
1809{
1810	struct amd_iommu *iommu;
1811	bool ats;
1812
1813	iommu = rlookup_amd_iommu(dev_data->dev);
1814	if (!iommu)
1815		return;
1816	ats   = dev_data->ats_enabled;
1817
1818	/* Update data structures */
1819	dev_data->domain = domain;
1820	list_add(&dev_data->list, &domain->dev_list);
1821
1822	/* Update NUMA Node ID */
1823	if (domain->nid == NUMA_NO_NODE)
1824		domain->nid = dev_to_node(dev_data->dev);
1825
1826	/* Do reference counting */
1827	domain->dev_iommu[iommu->index] += 1;
1828	domain->dev_cnt                 += 1;
1829
1830	/* Update device table */
1831	set_dte_entry(iommu, dev_data->devid, domain,
1832		      ats, dev_data->ppr);
1833	clone_aliases(iommu, dev_data->dev);
1834
1835	device_flush_dte(dev_data);
1836}
1837
1838static void do_detach(struct iommu_dev_data *dev_data)
1839{
1840	struct protection_domain *domain = dev_data->domain;
1841	struct amd_iommu *iommu;
1842
1843	iommu = rlookup_amd_iommu(dev_data->dev);
1844	if (!iommu)
1845		return;
1846
1847	/* Update data structures */
1848	dev_data->domain = NULL;
1849	list_del(&dev_data->list);
1850	clear_dte_entry(iommu, dev_data->devid);
1851	clone_aliases(iommu, dev_data->dev);
1852
1853	/* Flush the DTE entry */
1854	device_flush_dte(dev_data);
1855
1856	/* Flush IOTLB and wait for the flushes to finish */
1857	amd_iommu_domain_flush_all(domain);
1858
1859	/* decrease reference counters - needs to happen after the flushes */
1860	domain->dev_iommu[iommu->index] -= 1;
1861	domain->dev_cnt                 -= 1;
1862}
1863
1864/*
1865 * If a device is not yet associated with a domain, this function makes the
1866 * device visible in the domain
1867 */
1868static int attach_device(struct device *dev,
1869			 struct protection_domain *domain)
1870{
1871	struct iommu_dev_data *dev_data;
1872	unsigned long flags;
1873	int ret = 0;
1874
1875	spin_lock_irqsave(&domain->lock, flags);
1876
1877	dev_data = dev_iommu_priv_get(dev);
1878
1879	spin_lock(&dev_data->lock);
1880
1881	if (dev_data->domain != NULL) {
1882		ret = -EBUSY;
1883		goto out;
1884	}
1885
1886	if (dev_is_pci(dev))
1887		pdev_enable_caps(to_pci_dev(dev));
1888
1889	do_attach(dev_data, domain);
1890
1891out:
1892	spin_unlock(&dev_data->lock);
1893
1894	spin_unlock_irqrestore(&domain->lock, flags);
1895
1896	return ret;
1897}
1898
1899/*
1900 * Removes a device from a protection domain (with devtable_lock held)
1901 */
1902static void detach_device(struct device *dev)
1903{
1904	struct protection_domain *domain;
1905	struct iommu_dev_data *dev_data;
1906	unsigned long flags;
1907
1908	dev_data = dev_iommu_priv_get(dev);
1909	domain   = dev_data->domain;
1910
1911	spin_lock_irqsave(&domain->lock, flags);
1912
1913	spin_lock(&dev_data->lock);
1914
1915	/*
1916	 * First check if the device is still attached. It might already
1917	 * be detached from its domain because the generic
1918	 * iommu_detach_group code detached it and we try again here in
1919	 * our alias handling.
1920	 */
1921	if (WARN_ON(!dev_data->domain))
1922		goto out;
1923
1924	do_detach(dev_data);
1925
1926	if (dev_is_pci(dev))
1927		pdev_disable_caps(to_pci_dev(dev));
1928
1929out:
1930	spin_unlock(&dev_data->lock);
1931
1932	spin_unlock_irqrestore(&domain->lock, flags);
1933}
1934
1935static struct iommu_device *amd_iommu_probe_device(struct device *dev)
1936{
1937	struct iommu_device *iommu_dev;
1938	struct amd_iommu *iommu;
1939	int ret;
1940
1941	if (!check_device(dev))
1942		return ERR_PTR(-ENODEV);
1943
1944	iommu = rlookup_amd_iommu(dev);
1945	if (!iommu)
1946		return ERR_PTR(-ENODEV);
1947
1948	/* Not registered yet? */
1949	if (!iommu->iommu.ops)
1950		return ERR_PTR(-ENODEV);
1951
1952	if (dev_iommu_priv_get(dev))
1953		return &iommu->iommu;
1954
1955	ret = iommu_init_device(iommu, dev);
1956	if (ret) {
1957		if (ret != -ENOTSUPP)
1958			dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
1959		iommu_dev = ERR_PTR(ret);
1960		iommu_ignore_device(iommu, dev);
1961	} else {
1962		amd_iommu_set_pci_msi_domain(dev, iommu);
1963		iommu_dev = &iommu->iommu;
1964	}
1965
1966	iommu_completion_wait(iommu);
1967
1968	return iommu_dev;
1969}
1970
1971static void amd_iommu_probe_finalize(struct device *dev)
1972{
1973	/* Domains are initialized for this device - have a look what we ended up with */
1974	set_dma_ops(dev, NULL);
1975	iommu_setup_dma_ops(dev, 0, U64_MAX);
1976}
1977
1978static void amd_iommu_release_device(struct device *dev)
1979{
1980	struct amd_iommu *iommu;
1981
1982	if (!check_device(dev))
1983		return;
1984
1985	iommu = rlookup_amd_iommu(dev);
1986	if (!iommu)
1987		return;
1988
1989	amd_iommu_uninit_device(dev);
1990	iommu_completion_wait(iommu);
1991}
1992
1993static struct iommu_group *amd_iommu_device_group(struct device *dev)
1994{
1995	if (dev_is_pci(dev))
1996		return pci_device_group(dev);
1997
1998	return acpihid_device_group(dev);
1999}
2000
2001/*****************************************************************************
2002 *
2003 * The next functions belong to the dma_ops mapping/unmapping code.
2004 *
2005 *****************************************************************************/
2006
2007static void update_device_table(struct protection_domain *domain)
2008{
2009	struct iommu_dev_data *dev_data;
2010
2011	list_for_each_entry(dev_data, &domain->dev_list, list) {
2012		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
2013
2014		if (!iommu)
2015			continue;
2016		set_dte_entry(iommu, dev_data->devid, domain,
2017			      dev_data->ats_enabled, dev_data->ppr);
2018		clone_aliases(iommu, dev_data->dev);
2019	}
2020}
2021
2022void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
2023{
2024	update_device_table(domain);
2025	domain_flush_devices(domain);
2026}
2027
2028void amd_iommu_domain_update(struct protection_domain *domain)
2029{
2030	/* Update device table */
2031	amd_iommu_update_and_flush_device_table(domain);
2032
2033	/* Flush domain TLB(s) and wait for completion */
2034	amd_iommu_domain_flush_all(domain);
2035}
2036
2037/*****************************************************************************
2038 *
2039 * The following functions belong to the exported interface of AMD IOMMU
2040 *
2041 * This interface allows access to lower level functions of the IOMMU
2042 * like protection domain handling and assignment of devices to domains
2043 * which is not possible with the dma_ops interface.
2044 *
2045 *****************************************************************************/
2046
2047static void cleanup_domain(struct protection_domain *domain)
2048{
2049	struct iommu_dev_data *entry;
2050
2051	lockdep_assert_held(&domain->lock);
2052
2053	if (!domain->dev_cnt)
2054		return;
2055
2056	while (!list_empty(&domain->dev_list)) {
2057		entry = list_first_entry(&domain->dev_list,
2058					 struct iommu_dev_data, list);
2059		BUG_ON(!entry->domain);
2060		do_detach(entry);
2061	}
2062	WARN_ON(domain->dev_cnt != 0);
2063}
2064
2065static void protection_domain_free(struct protection_domain *domain)
2066{
2067	if (!domain)
2068		return;
2069
2070	if (domain->iop.pgtbl_cfg.tlb)
2071		free_io_pgtable_ops(&domain->iop.iop.ops);
2072
2073	if (domain->flags & PD_IOMMUV2_MASK)
2074		free_gcr3_table(domain);
2075
2076	if (domain->iop.root)
2077		free_page((unsigned long)domain->iop.root);
2078
2079	if (domain->id)
2080		domain_id_free(domain->id);
2081
2082	kfree(domain);
2083}
2084
2085static int protection_domain_init_v1(struct protection_domain *domain, int mode)
2086{
2087	u64 *pt_root = NULL;
2088
2089	BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
2090
2091	if (mode != PAGE_MODE_NONE) {
2092		pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2093		if (!pt_root)
2094			return -ENOMEM;
2095	}
2096
2097	amd_iommu_domain_set_pgtable(domain, pt_root, mode);
2098
2099	return 0;
2100}
2101
2102static int protection_domain_init_v2(struct protection_domain *domain)
2103{
2104	domain->flags |= PD_GIOV_MASK;
2105
2106	domain->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
2107
2108	if (setup_gcr3_table(domain, 1))
2109		return -ENOMEM;
2110
2111	return 0;
2112}
2113
2114static struct protection_domain *protection_domain_alloc(unsigned int type)
2115{
2116	struct io_pgtable_ops *pgtbl_ops;
2117	struct protection_domain *domain;
2118	int pgtable;
2119	int ret;
2120
2121	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2122	if (!domain)
2123		return NULL;
2124
2125	domain->id = domain_id_alloc();
2126	if (!domain->id)
2127		goto out_err;
2128
2129	spin_lock_init(&domain->lock);
2130	INIT_LIST_HEAD(&domain->dev_list);
2131	domain->nid = NUMA_NO_NODE;
2132
2133	switch (type) {
2134	/* No need to allocate io pgtable ops in passthrough mode */
2135	case IOMMU_DOMAIN_IDENTITY:
2136		return domain;
2137	case IOMMU_DOMAIN_DMA:
2138		pgtable = amd_iommu_pgtable;
2139		break;
2140	/*
2141	 * Force IOMMU v1 page table when allocating
2142	 * domain for pass-through devices.
2143	 */
2144	case IOMMU_DOMAIN_UNMANAGED:
2145		pgtable = AMD_IOMMU_V1;
2146		break;
2147	default:
2148		goto out_err;
2149	}
2150
2151	switch (pgtable) {
2152	case AMD_IOMMU_V1:
2153		ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL);
2154		break;
2155	case AMD_IOMMU_V2:
2156		ret = protection_domain_init_v2(domain);
2157		break;
2158	default:
2159		ret = -EINVAL;
2160		break;
2161	}
2162
2163	if (ret)
2164		goto out_err;
2165
2166	pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain);
2167	if (!pgtbl_ops)
2168		goto out_err;
2169
2170	return domain;
2171out_err:
2172	protection_domain_free(domain);
2173	return NULL;
2174}
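/*
 * Illustrative note (not part of the original source): the page-table
 * flavour above is decided per domain type. Assuming the amd_iommu=pgtbl_v2
 * command-line option selects AMD_IOMMU_V2, a DMA-API domain would get a
 * v2 page table via protection_domain_init_v2(), while an unmanaged
 * (e.g. VFIO) domain is still forced to v1 by the first switch, and an
 * identity domain allocates no io-pgtable at all.
 */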
2175
2176static inline u64 dma_max_address(void)
2177{
2178	if (amd_iommu_pgtable == AMD_IOMMU_V1)
2179		return ~0ULL;
2180
2181	/* V2 with 4/5 level page table */
2182	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
2183}
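/*
 * Worked example (illustrative, assuming PM_LEVEL_SHIFT(l) expands to
 * 12 + 9 * l as elsewhere in this driver): with a 4-level v2 guest page
 * table, amd_iommu_gpt_level selects a 48-bit aperture,
 *	dma_max_address() == (1ULL << 48) - 1 == 0x0000ffffffffffffULL,
 * and a 5-level table extends this to 57 bits. V1 mode keeps ~0ULL.
 */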
2184
2185static bool amd_iommu_hd_support(struct amd_iommu *iommu)
2186{
2187	return iommu && (iommu->features & FEATURE_HDSUP);
2188}
2189
2190static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
2191						  struct device *dev, u32 flags)
2192{
2193	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
2194	struct protection_domain *domain;
2195	struct amd_iommu *iommu = NULL;
2196
2197	if (dev) {
2198		iommu = rlookup_amd_iommu(dev);
2199		if (!iommu)
2200			return ERR_PTR(-ENODEV);
2201	}
2202
2203	/*
2204	 * Since DTE[Mode]=0 is prohibited on SNP-enabled systems,
2205	 * default to use IOMMU_DOMAIN_DMA[_FQ].
2206	 */
2207	if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
2208		return ERR_PTR(-EINVAL);
2209
2210	if (dirty_tracking && !amd_iommu_hd_support(iommu))
2211		return ERR_PTR(-EOPNOTSUPP);
2212
2213	domain = protection_domain_alloc(type);
2214	if (!domain)
2215		return ERR_PTR(-ENOMEM);
2216
2217	domain->domain.geometry.aperture_start = 0;
2218	domain->domain.geometry.aperture_end   = dma_max_address();
2219	domain->domain.geometry.force_aperture = true;
2220
2221	if (iommu) {
2222		domain->domain.type = type;
2223		domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
2224		domain->domain.ops = iommu->iommu.ops->default_domain_ops;
2225
2226		if (dirty_tracking)
2227			domain->domain.dirty_ops = &amd_dirty_ops;
2228	}
2229
2230	return &domain->domain;
2231}
2232
2233static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
2234{
2235	struct iommu_domain *domain;
2236
2237	domain = do_iommu_domain_alloc(type, NULL, 0);
2238	if (IS_ERR(domain))
2239		return NULL;
2240
2241	return domain;
2242}
2243
2244static struct iommu_domain *
2245amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
2246			    struct iommu_domain *parent,
2247			    const struct iommu_user_data *user_data)
2248
2249{
2250	unsigned int type = IOMMU_DOMAIN_UNMANAGED;
2251
2252	if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
2253		return ERR_PTR(-EOPNOTSUPP);
2254
2255	return do_iommu_domain_alloc(type, dev, flags);
2256}
2257
2258static void amd_iommu_domain_free(struct iommu_domain *dom)
2259{
2260	struct protection_domain *domain;
2261	unsigned long flags;
2262
2263	if (!dom)
2264		return;
2265
2266	domain = to_pdomain(dom);
2267
2268	spin_lock_irqsave(&domain->lock, flags);
2269
2270	cleanup_domain(domain);
2271
2272	spin_unlock_irqrestore(&domain->lock, flags);
2273
2274	protection_domain_free(domain);
2275}
2276
2277static int amd_iommu_attach_device(struct iommu_domain *dom,
2278				   struct device *dev)
2279{
2280	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2281	struct protection_domain *domain = to_pdomain(dom);
2282	struct amd_iommu *iommu = rlookup_amd_iommu(dev);
2283	int ret;
2284
2285	/*
2286	 * Skip attaching the device to the domain if the new domain is the
2287	 * same as the device's current domain.
2288	 */
2289	if (dev_data->domain == domain)
2290		return 0;
2291
2292	dev_data->defer_attach = false;
2293
2294	/*
2295	 * Restrict to devices with compatible IOMMU hardware support
2296	 * when enforcement of dirty tracking is enabled.
2297	 */
2298	if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
2299		return -EINVAL;
2300
2301	if (dev_data->domain)
2302		detach_device(dev);
2303
2304	ret = attach_device(dev, domain);
2305
2306#ifdef CONFIG_IRQ_REMAP
2307	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
2308		if (dom->type == IOMMU_DOMAIN_UNMANAGED)
2309			dev_data->use_vapic = 1;
2310		else
2311			dev_data->use_vapic = 0;
2312	}
2313#endif
2314
2315	iommu_completion_wait(iommu);
2316
2317	return ret;
2318}
2319
2320static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
2321				    unsigned long iova, size_t size)
2322{
2323	struct protection_domain *domain = to_pdomain(dom);
2324	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2325
2326	if (ops->map_pages)
2327		domain_flush_np_cache(domain, iova, size);
2328	return 0;
2329}
2330
2331static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
2332			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
2333			       int iommu_prot, gfp_t gfp, size_t *mapped)
2334{
2335	struct protection_domain *domain = to_pdomain(dom);
2336	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2337	int prot = 0;
2338	int ret = -EINVAL;
2339
2340	if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
2341	    (domain->iop.mode == PAGE_MODE_NONE))
2342		return -EINVAL;
2343
2344	if (iommu_prot & IOMMU_READ)
2345		prot |= IOMMU_PROT_IR;
2346	if (iommu_prot & IOMMU_WRITE)
2347		prot |= IOMMU_PROT_IW;
2348
2349	if (ops->map_pages) {
2350		ret = ops->map_pages(ops, iova, paddr, pgsize,
2351				     pgcount, prot, gfp, mapped);
2352	}
2353
2354	return ret;
2355}
2356
2357static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
2358					    struct iommu_iotlb_gather *gather,
2359					    unsigned long iova, size_t size)
2360{
2361	/*
2362	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
2363	 * Unless we run in a virtual machine, which can be inferred according
2364	 * to whether "non-present cache" is on, it is probably best to prefer
2365	 * (potentially) too extensive TLB flushing (i.e., more misses) over
2366	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
2367	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
2368	 * the guest, and the trade-off is different: unnecessary TLB flushes
2369	 * should be avoided.
2370	 */
2371	if (amd_iommu_np_cache &&
2372	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
2373		iommu_iotlb_sync(domain, gather);
2374
2375	iommu_iotlb_gather_add_range(gather, iova, size);
2376}
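/*
 * Example of the heuristic above (illustrative, not from the original
 * source): with amd_iommu_np_cache set, gathering [0x1000, 0x2fff] and
 * then the disjoint range [0x100000, 0x100fff] syncs the first range
 * immediately before recording the second; on bare metal both ranges are
 * simply merged and flushed together at iotlb_sync time.
 */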
2377
2378static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
2379				    size_t pgsize, size_t pgcount,
2380				    struct iommu_iotlb_gather *gather)
2381{
2382	struct protection_domain *domain = to_pdomain(dom);
2383	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2384	size_t r;
2385
2386	if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
2387	    (domain->iop.mode == PAGE_MODE_NONE))
2388		return 0;
2389
2390	r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
2391
2392	if (r)
2393		amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
2394
2395	return r;
2396}
2397
2398static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2399					  dma_addr_t iova)
2400{
2401	struct protection_domain *domain = to_pdomain(dom);
2402	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2403
2404	return ops->iova_to_phys(ops, iova);
2405}
2406
2407static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
2408{
2409	switch (cap) {
2410	case IOMMU_CAP_CACHE_COHERENCY:
2411		return true;
2412	case IOMMU_CAP_NOEXEC:
2413		return false;
2414	case IOMMU_CAP_PRE_BOOT_PROTECTION:
2415		return amdr_ivrs_remap_support;
2416	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
2417		return true;
2418	case IOMMU_CAP_DEFERRED_FLUSH:
2419		return true;
2420	case IOMMU_CAP_DIRTY_TRACKING: {
2421		struct amd_iommu *iommu = rlookup_amd_iommu(dev);
2422
2423		return amd_iommu_hd_support(iommu);
2424	}
2425	default:
2426		break;
2427	}
2428
2429	return false;
2430}
2431
2432static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
2433					bool enable)
2434{
2435	struct protection_domain *pdomain = to_pdomain(domain);
2436	struct dev_table_entry *dev_table;
2437	struct iommu_dev_data *dev_data;
2438	bool domain_flush = false;
2439	struct amd_iommu *iommu;
2440	unsigned long flags;
2441	u64 pte_root;
2442
2443	spin_lock_irqsave(&pdomain->lock, flags);
2444	if (!(pdomain->dirty_tracking ^ enable)) {
2445		spin_unlock_irqrestore(&pdomain->lock, flags);
2446		return 0;
2447	}
2448
2449	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
2450		iommu = rlookup_amd_iommu(dev_data->dev);
2451		if (!iommu)
2452			continue;
2453
2454		dev_table = get_dev_table(iommu);
2455		pte_root = dev_table[dev_data->devid].data[0];
2456
2457		pte_root = (enable ? pte_root | DTE_FLAG_HAD :
2458				     pte_root & ~DTE_FLAG_HAD);
2459
2460		/* Flush device DTE */
2461		dev_table[dev_data->devid].data[0] = pte_root;
2462		device_flush_dte(dev_data);
2463		domain_flush = true;
2464	}
2465
2466	/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
2467	if (domain_flush)
2468		amd_iommu_domain_flush_all(pdomain);
2469
2470	pdomain->dirty_tracking = enable;
2471	spin_unlock_irqrestore(&pdomain->lock, flags);
2472
2473	return 0;
2474}
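/*
 * Usage sketch (illustrative): when a domain is allocated with
 * IOMMU_HWPT_ALLOC_DIRTY_TRACKING, domain->dirty_ops points at
 * amd_dirty_ops below, so the generic dirty-tracking path reaches this
 * function roughly as
 *	dom->dirty_ops->set_dirty_tracking(dom, true);
 * toggling DTE_FLAG_HAD in every attached device's DTE and flushing.
 */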
2475
2476static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
2477					  unsigned long iova, size_t size,
2478					  unsigned long flags,
2479					  struct iommu_dirty_bitmap *dirty)
2480{
2481	struct protection_domain *pdomain = to_pdomain(domain);
2482	struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
2483	unsigned long lflags;
2484
2485	if (!ops || !ops->read_and_clear_dirty)
2486		return -EOPNOTSUPP;
2487
2488	spin_lock_irqsave(&pdomain->lock, lflags);
2489	if (!pdomain->dirty_tracking && dirty->bitmap) {
2490		spin_unlock_irqrestore(&pdomain->lock, lflags);
2491		return -EINVAL;
2492	}
2493	spin_unlock_irqrestore(&pdomain->lock, lflags);
2494
2495	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
2496}
2497
2498static void amd_iommu_get_resv_regions(struct device *dev,
2499				       struct list_head *head)
2500{
2501	struct iommu_resv_region *region;
2502	struct unity_map_entry *entry;
2503	struct amd_iommu *iommu;
2504	struct amd_iommu_pci_seg *pci_seg;
2505	int devid, sbdf;
2506
2507	sbdf = get_device_sbdf_id(dev);
2508	if (sbdf < 0)
2509		return;
2510
2511	devid = PCI_SBDF_TO_DEVID(sbdf);
2512	iommu = rlookup_amd_iommu(dev);
2513	if (!iommu)
2514		return;
2515	pci_seg = iommu->pci_seg;
2516
2517	list_for_each_entry(entry, &pci_seg->unity_map, list) {
2518		int type, prot = 0;
2519		size_t length;
2520
2521		if (devid < entry->devid_start || devid > entry->devid_end)
2522			continue;
2523
2524		type   = IOMMU_RESV_DIRECT;
2525		length = entry->address_end - entry->address_start;
2526		if (entry->prot & IOMMU_PROT_IR)
2527			prot |= IOMMU_READ;
2528		if (entry->prot & IOMMU_PROT_IW)
2529			prot |= IOMMU_WRITE;
2530		if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
2531			/* Exclusion range */
2532			type = IOMMU_RESV_RESERVED;
2533
2534		region = iommu_alloc_resv_region(entry->address_start,
2535						 length, prot, type,
2536						 GFP_KERNEL);
2537		if (!region) {
2538			dev_err(dev, "Out of memory allocating dm-regions\n");
2539			return;
2540		}
2541		list_add_tail(&region->list, head);
2542	}
2543
2544	region = iommu_alloc_resv_region(MSI_RANGE_START,
2545					 MSI_RANGE_END - MSI_RANGE_START + 1,
2546					 0, IOMMU_RESV_MSI, GFP_KERNEL);
2547	if (!region)
2548		return;
2549	list_add_tail(&region->list, head);
2550
2551	region = iommu_alloc_resv_region(HT_RANGE_START,
2552					 HT_RANGE_END - HT_RANGE_START + 1,
2553					 0, IOMMU_RESV_RESERVED, GFP_KERNEL);
2554	if (!region)
2555		return;
2556	list_add_tail(&region->list, head);
2557}
2558
2559bool amd_iommu_is_attach_deferred(struct device *dev)
2560{
2561	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2562
2563	return dev_data->defer_attach;
2564}
2565
2566static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
2567{
2568	struct protection_domain *dom = to_pdomain(domain);
2569	unsigned long flags;
2570
2571	spin_lock_irqsave(&dom->lock, flags);
2572	amd_iommu_domain_flush_all(dom);
2573	spin_unlock_irqrestore(&dom->lock, flags);
2574}
2575
2576static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
2577				 struct iommu_iotlb_gather *gather)
2578{
2579	struct protection_domain *dom = to_pdomain(domain);
2580	unsigned long flags;
2581
2582	spin_lock_irqsave(&dom->lock, flags);
2583	amd_iommu_domain_flush_pages(dom, gather->start,
2584				     gather->end - gather->start + 1);
2585	spin_unlock_irqrestore(&dom->lock, flags);
2586}
2587
2588static int amd_iommu_def_domain_type(struct device *dev)
2589{
2590	struct iommu_dev_data *dev_data;
2591
2592	dev_data = dev_iommu_priv_get(dev);
2593	if (!dev_data)
2594		return 0;
2595
2596	/*
2597	 * Do not identity map IOMMUv2 capable devices when:
2598	 *  - memory encryption is active, because some of those devices
2599	 *    (AMD GPUs) don't have the encryption bit in their DMA-mask
2600	 *    and require remapping.
2601	 *  - SNP is enabled, because it prohibits DTE[Mode]=0.
2602	 */
2603	if (pdev_pasid_supported(dev_data) &&
2604	    !cc_platform_has(CC_ATTR_MEM_ENCRYPT) &&
2605	    !amd_iommu_snp_en) {
2606		return IOMMU_DOMAIN_IDENTITY;
2607	}
2608
2609	return 0;
2610}
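/*
 * Illustrative outcome (not in the original source): a PASID-capable
 * device on a host without memory encryption and without SNP gets
 * IOMMU_DOMAIN_IDENTITY as its default domain type here; with SME/SEV or
 * SNP active the function returns 0 and the core default is used instead.
 */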
2611
2612static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
2613{
2614	/* IOMMU_PTE_FC is always set */
2615	return true;
2616}
2617
2618static const struct iommu_dirty_ops amd_dirty_ops = {
2619	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
2620	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
2621};
2622
2623const struct iommu_ops amd_iommu_ops = {
2624	.capable = amd_iommu_capable,
2625	.domain_alloc = amd_iommu_domain_alloc,
2626	.domain_alloc_user = amd_iommu_domain_alloc_user,
2627	.probe_device = amd_iommu_probe_device,
2628	.release_device = amd_iommu_release_device,
2629	.probe_finalize = amd_iommu_probe_finalize,
2630	.device_group = amd_iommu_device_group,
2631	.get_resv_regions = amd_iommu_get_resv_regions,
2632	.is_attach_deferred = amd_iommu_is_attach_deferred,
2633	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
2634	.def_domain_type = amd_iommu_def_domain_type,
2635	.default_domain_ops = &(const struct iommu_domain_ops) {
2636		.attach_dev	= amd_iommu_attach_device,
2637		.map_pages	= amd_iommu_map_pages,
2638		.unmap_pages	= amd_iommu_unmap_pages,
2639		.iotlb_sync_map	= amd_iommu_iotlb_sync_map,
2640		.iova_to_phys	= amd_iommu_iova_to_phys,
2641		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
2642		.iotlb_sync	= amd_iommu_iotlb_sync,
2643		.free		= amd_iommu_domain_free,
2644		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
2645	}
2646};
2647
2648static int __flush_pasid(struct protection_domain *domain, u32 pasid,
2649			 u64 address, size_t size)
2650{
2651	struct iommu_dev_data *dev_data;
2652	struct iommu_cmd cmd;
2653	int i, ret;
2654
2655	if (!(domain->flags & PD_IOMMUV2_MASK))
2656		return -EINVAL;
2657
2658	build_inv_iommu_pages(&cmd, address, size, domain->id, pasid, true);
2659
2660	/*
2661	 * IOMMU TLB needs to be flushed before Device TLB to
2662	 * prevent device TLB refill from IOMMU TLB
2663	 */
2664	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
2665		if (domain->dev_iommu[i] == 0)
2666			continue;
2667
2668		ret = iommu_queue_command(amd_iommus[i], &cmd);
2669		if (ret != 0)
2670			goto out;
2671	}
2672
2673	/* Wait until IOMMU TLB flushes are complete */
2674	amd_iommu_domain_flush_complete(domain);
2675
2676	/* Now flush device TLBs */
2677	list_for_each_entry(dev_data, &domain->dev_list, list) {
2678		struct amd_iommu *iommu;
2679		int qdep;
2680
2681		/*
2682		 * There might be non-IOMMUv2 capable devices in an IOMMUv2
2683		 * domain.
2684		 */
2685		if (!dev_data->ats_enabled)
2686			continue;
2687
2688		qdep  = dev_data->ats_qdep;
2689		iommu = rlookup_amd_iommu(dev_data->dev);
2690		if (!iommu)
2691			continue;
2692		build_inv_iotlb_pages(&cmd, dev_data->devid, qdep,
2693				      address, size, pasid, true);
2694
2695		ret = iommu_queue_command(iommu, &cmd);
2696		if (ret != 0)
2697			goto out;
2698	}
2699
2700	/* Wait until all device TLBs are flushed */
2701	amd_iommu_domain_flush_complete(domain);
2702
2703	ret = 0;
2704
2705out:
2706
2707	return ret;
2708}
2709
2710static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid,
2711				  u64 address)
2712{
2713	return __flush_pasid(domain, pasid, address, PAGE_SIZE);
2714}
2715
2716int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid,
2717			 u64 address)
2718{
2719	struct protection_domain *domain = to_pdomain(dom);
2720	unsigned long flags;
2721	int ret;
2722
2723	spin_lock_irqsave(&domain->lock, flags);
2724	ret = __amd_iommu_flush_page(domain, pasid, address);
2725	spin_unlock_irqrestore(&domain->lock, flags);
2726
2727	return ret;
2728}
2729
2730static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid)
2731{
2732	return __flush_pasid(domain, pasid, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
2733}
2734
2735int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid)
2736{
2737	struct protection_domain *domain = to_pdomain(dom);
2738	unsigned long flags;
2739	int ret;
2740
2741	spin_lock_irqsave(&domain->lock, flags);
2742	ret = __amd_iommu_flush_tlb(domain, pasid);
2743	spin_unlock_irqrestore(&domain->lock, flags);
2744
2745	return ret;
2746}
2747
2748static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc)
2749{
2750	int index;
2751	u64 *pte;
2752
2753	while (true) {
2754
2755		index = (pasid >> (9 * level)) & 0x1ff;
2756		pte   = &root[index];
2757
2758		if (level == 0)
2759			break;
2760
2761		if (!(*pte & GCR3_VALID)) {
2762			if (!alloc)
2763				return NULL;
2764
2765			root = (void *)get_zeroed_page(GFP_ATOMIC);
2766			if (root == NULL)
2767				return NULL;
2768
2769			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
2770		}
2771
2772		root = iommu_phys_to_virt(*pte & PAGE_MASK);
2773
2774		level -= 1;
2775	}
2776
2777	return pte;
2778}
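/*
 * Walk sketch (illustrative): with a three-level GCR3 table (level == 2)
 * and pasid 0x12345, the per-level indices computed above are
 *	(0x12345 >> 18) & 0x1ff == 0x000,
 *	(0x12345 >>  9) & 0x1ff == 0x091,
 *	 0x12345        & 0x1ff == 0x145,
 * with missing intermediate levels allocated on demand when 'alloc' is set.
 */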
2779
2780static int __set_gcr3(struct protection_domain *domain, u32 pasid,
2781		      unsigned long cr3)
2782{
2783	u64 *pte;
2784
2785	if (domain->iop.mode != PAGE_MODE_NONE)
2786		return -EINVAL;
2787
2788	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
2789	if (pte == NULL)
2790		return -ENOMEM;
2791
2792	*pte = (cr3 & PAGE_MASK) | GCR3_VALID;
2793
2794	return __amd_iommu_flush_tlb(domain, pasid);
2795}
2796
2797static int __clear_gcr3(struct protection_domain *domain, u32 pasid)
2798{
2799	u64 *pte;
2800
2801	if (domain->iop.mode != PAGE_MODE_NONE)
2802		return -EINVAL;
2803
2804	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
2805	if (pte == NULL)
2806		return 0;
2807
2808	*pte = 0;
2809
2810	return __amd_iommu_flush_tlb(domain, pasid);
2811}
2812
2813int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid,
2814			      unsigned long cr3)
2815{
2816	struct protection_domain *domain = to_pdomain(dom);
2817	unsigned long flags;
2818	int ret;
2819
2820	spin_lock_irqsave(&domain->lock, flags);
2821	ret = __set_gcr3(domain, pasid, cr3);
2822	spin_unlock_irqrestore(&domain->lock, flags);
2823
2824	return ret;
2825}
2826
2827int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid)
2828{
2829	struct protection_domain *domain = to_pdomain(dom);
2830	unsigned long flags;
2831	int ret;
2832
2833	spin_lock_irqsave(&domain->lock, flags);
2834	ret = __clear_gcr3(domain, pasid);
2835	spin_unlock_irqrestore(&domain->lock, flags);
2836
2837	return ret;
2838}
2839
2840int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
2841			   int status, int tag)
2842{
2843	struct iommu_dev_data *dev_data;
2844	struct amd_iommu *iommu;
2845	struct iommu_cmd cmd;
2846
2847	dev_data = dev_iommu_priv_get(&pdev->dev);
2848	iommu    = rlookup_amd_iommu(&pdev->dev);
2849	if (!iommu)
2850		return -ENODEV;
2851
2852	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
2853			   tag, dev_data->pri_tlp);
2854
2855	return iommu_queue_command(iommu, &cmd);
2856}
2857
2858#ifdef CONFIG_IRQ_REMAP
2859
2860/*****************************************************************************
2861 *
2862 * Interrupt Remapping Implementation
2863 *
2864 *****************************************************************************/
2865
2866static struct irq_chip amd_ir_chip;
2867static DEFINE_SPINLOCK(iommu_table_lock);
2868
2869static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
2870{
2871	int ret;
2872	u64 data;
2873	unsigned long flags;
2874	struct iommu_cmd cmd, cmd2;
2875
2876	if (iommu->irtcachedis_enabled)
2877		return;
2878
2879	build_inv_irt(&cmd, devid);
2880	data = atomic64_add_return(1, &iommu->cmd_sem_val);
2881	build_completion_wait(&cmd2, iommu, data);
2882
2883	raw_spin_lock_irqsave(&iommu->lock, flags);
2884	ret = __iommu_queue_command_sync(iommu, &cmd, true);
2885	if (ret)
2886		goto out;
2887	ret = __iommu_queue_command_sync(iommu, &cmd2, false);
2888	if (ret)
2889		goto out;
2890	wait_on_sem(iommu, data);
2891out:
2892	raw_spin_unlock_irqrestore(&iommu->lock, flags);
2893}
2894
2895static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
2896			      struct irq_remap_table *table)
2897{
2898	u64 dte;
2899	struct dev_table_entry *dev_table = get_dev_table(iommu);
2900
2901	dte	= dev_table[devid].data[2];
2902	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
2903	dte	|= iommu_virt_to_phys(table->table);
2904	dte	|= DTE_IRQ_REMAP_INTCTL;
2905	dte	|= DTE_INTTABLEN;
2906	dte	|= DTE_IRQ_REMAP_ENABLE;
2907
2908	dev_table[devid].data[2] = dte;
2909}
2910
2911static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
2912{
2913	struct irq_remap_table *table;
2914	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2915
2916	if (WARN_ONCE(!pci_seg->rlookup_table[devid],
2917		      "%s: no iommu for devid %x:%x\n",
2918		      __func__, pci_seg->id, devid))
2919		return NULL;
2920
2921	table = pci_seg->irq_lookup_table[devid];
2922	if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
2923		      __func__, pci_seg->id, devid))
2924		return NULL;
2925
2926	return table;
2927}
2928
2929static struct irq_remap_table *__alloc_irq_table(void)
2930{
2931	struct irq_remap_table *table;
2932
2933	table = kzalloc(sizeof(*table), GFP_KERNEL);
2934	if (!table)
2935		return NULL;
2936
2937	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
2938	if (!table->table) {
2939		kfree(table);
2940		return NULL;
2941	}
2942	raw_spin_lock_init(&table->lock);
2943
2944	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
2945		memset(table->table, 0,
2946		       MAX_IRQS_PER_TABLE * sizeof(u32));
2947	else
2948		memset(table->table, 0,
2949		       (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
2950	return table;
2951}
2952
2953static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
2954				  struct irq_remap_table *table)
2955{
2956	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2957
2958	pci_seg->irq_lookup_table[devid] = table;
2959	set_dte_irq_entry(iommu, devid, table);
2960	iommu_flush_dte(iommu, devid);
2961}
2962
2963static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
2964				       void *data)
2965{
2966	struct irq_remap_table *table = data;
2967	struct amd_iommu_pci_seg *pci_seg;
2968	struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev);
2969
2970	if (!iommu)
2971		return -EINVAL;
2972
2973	pci_seg = iommu->pci_seg;
2974	pci_seg->irq_lookup_table[alias] = table;
2975	set_dte_irq_entry(iommu, alias, table);
2976	iommu_flush_dte(pci_seg->rlookup_table[alias], alias);
2977
2978	return 0;
2979}
2980
2981static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
2982					       u16 devid, struct pci_dev *pdev)
2983{
2984	struct irq_remap_table *table = NULL;
2985	struct irq_remap_table *new_table = NULL;
2986	struct amd_iommu_pci_seg *pci_seg;
2987	unsigned long flags;
2988	u16 alias;
2989
2990	spin_lock_irqsave(&iommu_table_lock, flags);
2991
2992	pci_seg = iommu->pci_seg;
2993	table = pci_seg->irq_lookup_table[devid];
2994	if (table)
2995		goto out_unlock;
2996
2997	alias = pci_seg->alias_table[devid];
2998	table = pci_seg->irq_lookup_table[alias];
2999	if (table) {
3000		set_remap_table_entry(iommu, devid, table);
3001		goto out_wait;
3002	}
3003	spin_unlock_irqrestore(&iommu_table_lock, flags);
3004
3005	/* Nothing there yet, allocate new irq remapping table */
3006	new_table = __alloc_irq_table();
3007	if (!new_table)
3008		return NULL;
3009
3010	spin_lock_irqsave(&iommu_table_lock, flags);
3011
3012	table = pci_seg->irq_lookup_table[devid];
3013	if (table)
3014		goto out_unlock;
3015
3016	table = pci_seg->irq_lookup_table[alias];
3017	if (table) {
3018		set_remap_table_entry(iommu, devid, table);
3019		goto out_wait;
3020	}
3021
3022	table = new_table;
3023	new_table = NULL;
3024
3025	if (pdev)
3026		pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
3027				       table);
3028	else
3029		set_remap_table_entry(iommu, devid, table);
3030
3031	if (devid != alias)
3032		set_remap_table_entry(iommu, alias, table);
3033
3034out_wait:
3035	iommu_completion_wait(iommu);
3036
3037out_unlock:
3038	spin_unlock_irqrestore(&iommu_table_lock, flags);
3039
3040	if (new_table) {
3041		kmem_cache_free(amd_iommu_irq_cache, new_table->table);
3042		kfree(new_table);
3043	}
3044	return table;
3045}
3046
3047static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
3048			   bool align, struct pci_dev *pdev)
3049{
3050	struct irq_remap_table *table;
3051	int index, c, alignment = 1;
3052	unsigned long flags;
3053
3054	table = alloc_irq_table(iommu, devid, pdev);
3055	if (!table)
3056		return -ENODEV;
3057
3058	if (align)
3059		alignment = roundup_pow_of_two(count);
3060
3061	raw_spin_lock_irqsave(&table->lock, flags);
3062
3063	/* Scan table for free entries */
3064	for (index = ALIGN(table->min_index, alignment), c = 0;
3065	     index < MAX_IRQS_PER_TABLE;) {
3066		if (!iommu->irte_ops->is_allocated(table, index)) {
3067			c += 1;
3068		} else {
3069			c     = 0;
3070			index = ALIGN(index + 1, alignment);
3071			continue;
3072		}
3073
3074		if (c == count)	{
3075			for (; c != 0; --c)
3076				iommu->irte_ops->set_allocated(table, index - c + 1);
3077
3078			index -= count - 1;
3079			goto out;
3080		}
3081
3082		index++;
3083	}
3084
3085	index = -ENOSPC;
3086
3087out:
3088	raw_spin_unlock_irqrestore(&table->lock, flags);
3089
3090	return index;
3091}
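/*
 * Allocation sketch (illustrative): a multi-MSI request of count == 4 with
 * align == true uses alignment == 4, so candidate blocks start only at
 * indexes that are multiples of 4 (from ALIGN(table->min_index, 4) up);
 * the first run of four consecutive free entries is marked allocated and
 * its first index is returned, or -ENOSPC if no such run exists.
 */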
3092
3093static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3094			    struct irte_ga *irte)
3095{
3096	struct irq_remap_table *table;
3097	struct irte_ga *entry;
3098	unsigned long flags;
3099	u128 old;
3100
3101	table = get_irq_table(iommu, devid);
3102	if (!table)
3103		return -ENOMEM;
3104
3105	raw_spin_lock_irqsave(&table->lock, flags);
3106
3107	entry = (struct irte_ga *)table->table;
3108	entry = &entry[index];
3109
3110	/*
3111	 * We use cmpxchg16 to atomically update the 128-bit IRTE,
3112	 * and it cannot be updated by the hardware or other processors
3113	 * behind us, so the return value of cmpxchg16 should be the
3114	 * same as the old value.
3115	 */
3116	old = entry->irte;
3117	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
3118
3119	raw_spin_unlock_irqrestore(&table->lock, flags);
3120
3121	return 0;
3122}
3123
3124static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3125			  struct irte_ga *irte)
3126{
3127	bool ret;
3128
3129	ret = __modify_irte_ga(iommu, devid, index, irte);
3130	if (ret)
3131		return ret;
3132
3133	iommu_flush_irt_and_complete(iommu, devid);
3134
3135	return 0;
3136}
3137
3138static int modify_irte(struct amd_iommu *iommu,
3139		       u16 devid, int index, union irte *irte)
3140{
3141	struct irq_remap_table *table;
3142	unsigned long flags;
3143
3144	table = get_irq_table(iommu, devid);
3145	if (!table)
3146		return -ENOMEM;
3147
3148	raw_spin_lock_irqsave(&table->lock, flags);
3149	table->table[index] = irte->val;
3150	raw_spin_unlock_irqrestore(&table->lock, flags);
3151
3152	iommu_flush_irt_and_complete(iommu, devid);
3153
3154	return 0;
3155}
3156
3157static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
3158{
3159	struct irq_remap_table *table;
3160	unsigned long flags;
3161
3162	table = get_irq_table(iommu, devid);
3163	if (!table)
3164		return;
3165
3166	raw_spin_lock_irqsave(&table->lock, flags);
3167	iommu->irte_ops->clear_allocated(table, index);
3168	raw_spin_unlock_irqrestore(&table->lock, flags);
3169
3170	iommu_flush_irt_and_complete(iommu, devid);
3171}
3172
3173static void irte_prepare(void *entry,
3174			 u32 delivery_mode, bool dest_mode,
3175			 u8 vector, u32 dest_apicid, int devid)
3176{
3177	union irte *irte = (union irte *) entry;
3178
3179	irte->val                = 0;
3180	irte->fields.vector      = vector;
3181	irte->fields.int_type    = delivery_mode;
3182	irte->fields.destination = dest_apicid;
3183	irte->fields.dm          = dest_mode;
3184	irte->fields.valid       = 1;
3185}
3186
3187static void irte_ga_prepare(void *entry,
3188			    u32 delivery_mode, bool dest_mode,
3189			    u8 vector, u32 dest_apicid, int devid)
3190{
3191	struct irte_ga *irte = (struct irte_ga *) entry;
3192
3193	irte->lo.val                      = 0;
3194	irte->hi.val                      = 0;
3195	irte->lo.fields_remap.int_type    = delivery_mode;
3196	irte->lo.fields_remap.dm          = dest_mode;
3197	irte->hi.fields.vector            = vector;
3198	irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
3199	irte->hi.fields.destination       = APICID_TO_IRTE_DEST_HI(dest_apicid);
3200	irte->lo.fields_remap.valid       = 1;
3201}
3202
3203static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3204{
3205	union irte *irte = (union irte *) entry;
3206
3207	irte->fields.valid = 1;
3208	modify_irte(iommu, devid, index, irte);
3209}
3210
3211static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3212{
3213	struct irte_ga *irte = (struct irte_ga *) entry;
3214
3215	irte->lo.fields_remap.valid = 1;
3216	modify_irte_ga(iommu, devid, index, irte);
3217}
3218
3219static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3220{
3221	union irte *irte = (union irte *) entry;
3222
3223	irte->fields.valid = 0;
3224	modify_irte(iommu, devid, index, irte);
3225}
3226
3227static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3228{
3229	struct irte_ga *irte = (struct irte_ga *) entry;
3230
3231	irte->lo.fields_remap.valid = 0;
3232	modify_irte_ga(iommu, devid, index, irte);
3233}
3234
3235static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3236			      u8 vector, u32 dest_apicid)
3237{
3238	union irte *irte = (union irte *) entry;
3239
3240	irte->fields.vector = vector;
3241	irte->fields.destination = dest_apicid;
3242	modify_irte(iommu, devid, index, irte);
3243}
3244
3245static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3246				 u8 vector, u32 dest_apicid)
3247{
3248	struct irte_ga *irte = (struct irte_ga *) entry;
3249
3250	if (!irte->lo.fields_remap.guest_mode) {
3251		irte->hi.fields.vector = vector;
3252		irte->lo.fields_remap.destination =
3253					APICID_TO_IRTE_DEST_LO(dest_apicid);
3254		irte->hi.fields.destination =
3255					APICID_TO_IRTE_DEST_HI(dest_apicid);
3256		modify_irte_ga(iommu, devid, index, irte);
3257	}
3258}
3259
3260#define IRTE_ALLOCATED (~1U)
3261static void irte_set_allocated(struct irq_remap_table *table, int index)
3262{
3263	table->table[index] = IRTE_ALLOCATED;
3264}
3265
3266static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
3267{
3268	struct irte_ga *ptr = (struct irte_ga *)table->table;
3269	struct irte_ga *irte = &ptr[index];
3270
3271	memset(&irte->lo.val, 0, sizeof(u64));
3272	memset(&irte->hi.val, 0, sizeof(u64));
3273	irte->hi.fields.vector = 0xff;
3274}
3275
3276static bool irte_is_allocated(struct irq_remap_table *table, int index)
3277{
3278	union irte *ptr = (union irte *)table->table;
3279	union irte *irte = &ptr[index];
3280
3281	return irte->val != 0;
3282}
3283
3284static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
3285{
3286	struct irte_ga *ptr = (struct irte_ga *)table->table;
3287	struct irte_ga *irte = &ptr[index];
3288
3289	return irte->hi.fields.vector != 0;
3290}
3291
3292static void irte_clear_allocated(struct irq_remap_table *table, int index)
3293{
3294	table->table[index] = 0;
3295}
3296
3297static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
3298{
3299	struct irte_ga *ptr = (struct irte_ga *)table->table;
3300	struct irte_ga *irte = &ptr[index];
3301
3302	memset(&irte->lo.val, 0, sizeof(u64));
3303	memset(&irte->hi.val, 0, sizeof(u64));
3304}
3305
3306static int get_devid(struct irq_alloc_info *info)
3307{
3308	switch (info->type) {
3309	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3310		return get_ioapic_devid(info->devid);
3311	case X86_IRQ_ALLOC_TYPE_HPET:
3312		return get_hpet_devid(info->devid);
3313	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3314	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3315		return get_device_sbdf_id(msi_desc_to_dev(info->desc));
3316	default:
3317		WARN_ON_ONCE(1);
3318		return -1;
3319	}
3320}
3321
3322struct irq_remap_ops amd_iommu_irq_ops = {
3323	.prepare		= amd_iommu_prepare,
3324	.enable			= amd_iommu_enable,
3325	.disable		= amd_iommu_disable,
3326	.reenable		= amd_iommu_reenable,
3327	.enable_faulting	= amd_iommu_enable_faulting,
3328};
3329
3330static void fill_msi_msg(struct msi_msg *msg, u32 index)
3331{
3332	msg->data = index;
3333	msg->address_lo = 0;
3334	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
3335	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
3336}
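/*
 * Note (illustrative, not from the original source): in remapped format
 * the MSI data field carries only the IRTE index and the address selects
 * the interrupt-remapping window; the actual vector and destination are
 * taken from the IRTE prepared in irq_remapping_prepare_irte() below.
 * For example, fill_msi_msg(&msg, 0x21) leaves msg.data == 0x21.
 */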
3337
3338static void irq_remapping_prepare_irte(struct amd_ir_data *data,
3339				       struct irq_cfg *irq_cfg,
3340				       struct irq_alloc_info *info,
3341				       int devid, int index, int sub_handle)
3342{
3343	struct irq_2_irte *irte_info = &data->irq_2_irte;
3344	struct amd_iommu *iommu = data->iommu;
3345
3346	if (!iommu)
3347		return;
3348
3349	data->irq_2_irte.devid = devid;
3350	data->irq_2_irte.index = index + sub_handle;
3351	iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED,
3352				 apic->dest_mode_logical, irq_cfg->vector,
3353				 irq_cfg->dest_apicid, devid);
3354
3355	switch (info->type) {
3356	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3357	case X86_IRQ_ALLOC_TYPE_HPET:
3358	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3359	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3360		fill_msi_msg(&data->msi_entry, irte_info->index);
3361		break;
3362
3363	default:
3364		BUG_ON(1);
3365		break;
3366	}
3367}
3368
3369struct amd_irte_ops irte_32_ops = {
3370	.prepare = irte_prepare,
3371	.activate = irte_activate,
3372	.deactivate = irte_deactivate,
3373	.set_affinity = irte_set_affinity,
3374	.set_allocated = irte_set_allocated,
3375	.is_allocated = irte_is_allocated,
3376	.clear_allocated = irte_clear_allocated,
3377};
3378
3379struct amd_irte_ops irte_128_ops = {
3380	.prepare = irte_ga_prepare,
3381	.activate = irte_ga_activate,
3382	.deactivate = irte_ga_deactivate,
3383	.set_affinity = irte_ga_set_affinity,
3384	.set_allocated = irte_ga_set_allocated,
3385	.is_allocated = irte_ga_is_allocated,
3386	.clear_allocated = irte_ga_clear_allocated,
3387};
3388
3389static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
3390			       unsigned int nr_irqs, void *arg)
3391{
3392	struct irq_alloc_info *info = arg;
3393	struct irq_data *irq_data;
3394	struct amd_ir_data *data = NULL;
3395	struct amd_iommu *iommu;
3396	struct irq_cfg *cfg;
3397	int i, ret, devid, seg, sbdf;
3398	int index;
3399
3400	if (!info)
3401		return -EINVAL;
3402	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI)
3403		return -EINVAL;
3404
3405	sbdf = get_devid(info);
3406	if (sbdf < 0)
3407		return -EINVAL;
3408
3409	seg = PCI_SBDF_TO_SEGID(sbdf);
3410	devid = PCI_SBDF_TO_DEVID(sbdf);
3411	iommu = __rlookup_amd_iommu(seg, devid);
3412	if (!iommu)
3413		return -EINVAL;
3414
3415	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
3416	if (ret < 0)
3417		return ret;
3418
3419	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
3420		struct irq_remap_table *table;
3421
3422		table = alloc_irq_table(iommu, devid, NULL);
3423		if (table) {
3424			if (!table->min_index) {
3425				/*
3426				 * Keep the first 32 indexes free for IOAPIC
3427				 * interrupts.
3428				 */
3429				table->min_index = 32;
3430				for (i = 0; i < 32; ++i)
3431					iommu->irte_ops->set_allocated(table, i);
3432			}
3433			WARN_ON(table->min_index != 32);
3434			index = info->ioapic.pin;
3435		} else {
3436			index = -ENOMEM;
3437		}
3438	} else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
3439		   info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
3440		bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
3441
3442		index = alloc_irq_index(iommu, devid, nr_irqs, align,
3443					msi_desc_to_pci_dev(info->desc));
3444	} else {
3445		index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
3446	}
3447
3448	if (index < 0) {
3449		pr_warn("Failed to allocate IRTE\n");
3450		ret = index;
3451		goto out_free_parent;
3452	}
3453
3454	for (i = 0; i < nr_irqs; i++) {
3455		irq_data = irq_domain_get_irq_data(domain, virq + i);
3456		cfg = irq_data ? irqd_cfg(irq_data) : NULL;
3457		if (!cfg) {
3458			ret = -EINVAL;
3459			goto out_free_data;
3460		}
3461
3462		ret = -ENOMEM;
3463		data = kzalloc(sizeof(*data), GFP_KERNEL);
3464		if (!data)
3465			goto out_free_data;
3466
3467		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
3468			data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
3469		else
3470			data->entry = kzalloc(sizeof(struct irte_ga),
3471						     GFP_KERNEL);
3472		if (!data->entry) {
3473			kfree(data);
3474			goto out_free_data;
3475		}
3476
3477		data->iommu = iommu;
3478		irq_data->hwirq = (devid << 16) + i;
3479		irq_data->chip_data = data;
3480		irq_data->chip = &amd_ir_chip;
3481		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
3482		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
3483	}
3484
3485	return 0;
3486
3487out_free_data:
3488	for (i--; i >= 0; i--) {
3489		irq_data = irq_domain_get_irq_data(domain, virq + i);
3490		if (irq_data)
3491			kfree(irq_data->chip_data);
3492	}
3493	for (i = 0; i < nr_irqs; i++)
3494		free_irte(iommu, devid, index + i);
3495out_free_parent:
3496	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3497	return ret;
3498}
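/*
 * hwirq encoding sketch (illustrative): irq_remapping_alloc() packs the
 * device id into the high half of hwirq, so devid 0x00a5 with sub-handle
 * 2 yields hwirq == (0x00a5 << 16) + 2 == 0x00a50002.
 */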
3499
3500static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
3501			       unsigned int nr_irqs)
3502{
3503	struct irq_2_irte *irte_info;
3504	struct irq_data *irq_data;
3505	struct amd_ir_data *data;
3506	int i;
3507
3508	for (i = 0; i < nr_irqs; i++) {
3509		irq_data = irq_domain_get_irq_data(domain, virq  + i);
3510		if (irq_data && irq_data->chip_data) {
3511			data = irq_data->chip_data;
3512			irte_info = &data->irq_2_irte;
3513			free_irte(data->iommu, irte_info->devid, irte_info->index);
3514			kfree(data->entry);
3515			kfree(data);
3516		}
3517	}
3518	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3519}
3520
3521static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3522			       struct amd_ir_data *ir_data,
3523			       struct irq_2_irte *irte_info,
3524			       struct irq_cfg *cfg);
3525
3526static int irq_remapping_activate(struct irq_domain *domain,
3527				  struct irq_data *irq_data, bool reserve)
3528{
3529	struct amd_ir_data *data = irq_data->chip_data;
3530	struct irq_2_irte *irte_info = &data->irq_2_irte;
3531	struct amd_iommu *iommu = data->iommu;
3532	struct irq_cfg *cfg = irqd_cfg(irq_data);
3533
3534	if (!iommu)
3535		return 0;
3536
3537	iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
3538				  irte_info->index);
3539	amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
3540	return 0;
3541}
3542
3543static void irq_remapping_deactivate(struct irq_domain *domain,
3544				     struct irq_data *irq_data)
3545{
3546	struct amd_ir_data *data = irq_data->chip_data;
3547	struct irq_2_irte *irte_info = &data->irq_2_irte;
3548	struct amd_iommu *iommu = data->iommu;
3549
3550	if (iommu)
3551		iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
3552					    irte_info->index);
3553}
3554
3555static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
3556				enum irq_domain_bus_token bus_token)
3557{
3558	struct amd_iommu *iommu;
3559	int devid = -1;
3560
3561	if (!amd_iommu_irq_remap)
3562		return 0;
3563
3564	if (x86_fwspec_is_ioapic(fwspec))
3565		devid = get_ioapic_devid(fwspec->param[0]);
3566	else if (x86_fwspec_is_hpet(fwspec))
3567		devid = get_hpet_devid(fwspec->param[0]);
3568
3569	if (devid < 0)
3570		return 0;
3571	iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff));
3572
3573	return iommu && iommu->ir_domain == d;
3574}
3575
3576static const struct irq_domain_ops amd_ir_domain_ops = {
3577	.select = irq_remapping_select,
3578	.alloc = irq_remapping_alloc,
3579	.free = irq_remapping_free,
3580	.activate = irq_remapping_activate,
3581	.deactivate = irq_remapping_deactivate,
3582};
3583
3584int amd_iommu_activate_guest_mode(void *data)
3585{
3586	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3587	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3588	u64 valid;
3589
3590	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
3591		return 0;
3592
3593	valid = entry->lo.fields_vapic.valid;
3594
3595	entry->lo.val = 0;
3596	entry->hi.val = 0;
3597
3598	entry->lo.fields_vapic.valid       = valid;
3599	entry->lo.fields_vapic.guest_mode  = 1;
3600	entry->lo.fields_vapic.ga_log_intr = 1;
3601	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
3602	entry->hi.fields.vector            = ir_data->ga_vector;
3603	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
3604
3605	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3606			      ir_data->irq_2_irte.index, entry);
3607}
3608EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
3609
3610int amd_iommu_deactivate_guest_mode(void *data)
3611{
3612	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3613	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3614	struct irq_cfg *cfg = ir_data->cfg;
3615	u64 valid;
3616
3617	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3618	    !entry || !entry->lo.fields_vapic.guest_mode)
3619		return 0;
3620
3621	valid = entry->lo.fields_remap.valid;
3622
3623	entry->lo.val = 0;
3624	entry->hi.val = 0;
3625
3626	entry->lo.fields_remap.valid       = valid;
3627	entry->lo.fields_remap.dm          = apic->dest_mode_logical;
3628	entry->lo.fields_remap.int_type    = APIC_DELIVERY_MODE_FIXED;
3629	entry->hi.fields.vector            = cfg->vector;
3630	entry->lo.fields_remap.destination =
3631				APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
3632	entry->hi.fields.destination =
3633				APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
3634
3635	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3636			      ir_data->irq_2_irte.index, entry);
3637}
3638EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
3639
3640static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
3641{
3642	int ret;
3643	struct amd_iommu_pi_data *pi_data = vcpu_info;
3644	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
3645	struct amd_ir_data *ir_data = data->chip_data;
3646	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3647	struct iommu_dev_data *dev_data;
3648
3649	if (ir_data->iommu == NULL)
3650		return -EINVAL;
3651
3652	dev_data = search_dev_data(ir_data->iommu, irte_info->devid);
3653
3654	/* Note:
3655	 * This device has never been set up for guest mode,
3656	 * so we should not modify the IRTE.
3657	 */
3658	if (!dev_data || !dev_data->use_vapic)
3659		return 0;
3660
3661	ir_data->cfg = irqd_cfg(data);
3662	pi_data->ir_data = ir_data;
3663
3664	/* Note:
3665	 * SVM tries to set up for VAPIC mode, but the IOMMU is in
3666	 * legacy interrupt-remapping mode. So, we force legacy mode instead.
3667	 */
3668	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
3669		pr_debug("%s: Fall back to using intr legacy remap\n",
3670			 __func__);
3671		pi_data->is_guest_mode = false;
3672	}
3673
3674	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
3675	if (pi_data->is_guest_mode) {
3676		ir_data->ga_root_ptr = (pi_data->base >> 12);
3677		ir_data->ga_vector = vcpu_pi_info->vector;
3678		ir_data->ga_tag = pi_data->ga_tag;
3679		ret = amd_iommu_activate_guest_mode(ir_data);
3680		if (!ret)
3681			ir_data->cached_ga_tag = pi_data->ga_tag;
3682	} else {
3683		ret = amd_iommu_deactivate_guest_mode(ir_data);
3684
3685		/*
3686		 * This communicates the ga_tag back to the caller
3687		 * so that it can do all the necessary clean up.
3688		 */
3689		if (!ret)
3690			ir_data->cached_ga_tag = 0;
3691	}
3692
3693	return ret;
3694}
3695
3696
3697static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3698			       struct amd_ir_data *ir_data,
3699			       struct irq_2_irte *irte_info,
3700			       struct irq_cfg *cfg)
3701{
3702
3703	/*
3704	 * Atomically updates the IRTE with the new destination, vector
3705	 * and flushes the interrupt entry cache.
3706	 */
3707	iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
3708				      irte_info->index, cfg->vector,
3709				      cfg->dest_apicid);
3710}
3711
3712static int amd_ir_set_affinity(struct irq_data *data,
3713			       const struct cpumask *mask, bool force)
3714{
3715	struct amd_ir_data *ir_data = data->chip_data;
3716	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3717	struct irq_cfg *cfg = irqd_cfg(data);
3718	struct irq_data *parent = data->parent_data;
3719	struct amd_iommu *iommu = ir_data->iommu;
3720	int ret;
3721
3722	if (!iommu)
3723		return -ENODEV;
3724
3725	ret = parent->chip->irq_set_affinity(parent, mask, force);
3726	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
3727		return ret;
3728
3729	amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
3730	/*
3731	 * After this point, all the interrupts will start arriving
3732	 * at the new destination. So, time to clean up the previous
3733	 * vector allocation.
3734	 */
3735	vector_schedule_cleanup(cfg);
3736
3737	return IRQ_SET_MASK_OK_DONE;
3738}
3739
3740static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
3741{
3742	struct amd_ir_data *ir_data = irq_data->chip_data;
3743
3744	*msg = ir_data->msi_entry;
3745}
3746
3747static struct irq_chip amd_ir_chip = {
3748	.name			= "AMD-IR",
3749	.irq_ack		= apic_ack_irq,
3750	.irq_set_affinity	= amd_ir_set_affinity,
3751	.irq_set_vcpu_affinity	= amd_ir_set_vcpu_affinity,
3752	.irq_compose_msi_msg	= ir_compose_msi_msg,
3753};
3754
3755static const struct msi_parent_ops amdvi_msi_parent_ops = {
3756	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED |
3757				  MSI_FLAG_MULTI_PCI_MSI |
3758				  MSI_FLAG_PCI_IMS,
3759	.prefix			= "IR-",
3760	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3761};
3762
3763static const struct msi_parent_ops virt_amdvi_msi_parent_ops = {
3764	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED |
3765				  MSI_FLAG_MULTI_PCI_MSI,
3766	.prefix			= "vIR-",
3767	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3768};
3769
3770int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
3771{
3772	struct fwnode_handle *fn;
3773
3774	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
3775	if (!fn)
3776		return -ENOMEM;
3777	iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0,
3778						       fn, &amd_ir_domain_ops, iommu);
3779	if (!iommu->ir_domain) {
3780		irq_domain_free_fwnode(fn);
3781		return -ENOMEM;
3782	}
3783
3784	irq_domain_update_bus_token(iommu->ir_domain,  DOMAIN_BUS_AMDVI);
3785	iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
3786				   IRQ_DOMAIN_FLAG_ISOLATED_MSI;
3787
3788	if (amd_iommu_np_cache)
3789		iommu->ir_domain->msi_parent_ops = &virt_amdvi_msi_parent_ops;
3790	else
3791		iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops;
3792
3793	return 0;
3794}
3795
3796int amd_iommu_update_ga(int cpu, bool is_run, void *data)
3797{
3798	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3799	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3800
3801	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3802	    !entry || !entry->lo.fields_vapic.guest_mode)
3803		return 0;
3804
3805	if (!ir_data->iommu)
3806		return -ENODEV;
3807
3808	if (cpu >= 0) {
3809		entry->lo.fields_vapic.destination =
3810					APICID_TO_IRTE_DEST_LO(cpu);
3811		entry->hi.fields.destination =
3812					APICID_TO_IRTE_DEST_HI(cpu);
3813	}
3814	entry->lo.fields_vapic.is_run = is_run;
3815
3816	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3817				ir_data->irq_2_irte.index, entry);
3818}
3819EXPORT_SYMBOL(amd_iommu_update_ga);
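/*
 * Note (illustrative): unlike modify_irte_ga(), this path updates the IRTE
 * via __modify_irte_ga() without flushing the interrupt remapping table,
 * which keeps the vCPU run/not-run transitions cheap; the expected caller
 * is the KVM AVIC code updating guest interrupt affinity on sched-in/out.
 */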
3820#endif