   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   4 * Author: Joerg Roedel <jroedel@suse.de>
   5 *         Leo Duran <leo.duran@amd.com>
   6 */
   7
   8#define pr_fmt(fmt)     "AMD-Vi: " fmt
   9#define dev_fmt(fmt)    pr_fmt(fmt)
  10
  11#include <linux/ratelimit.h>
  12#include <linux/pci.h>
  13#include <linux/acpi.h>
  14#include <linux/pci-ats.h>
  15#include <linux/bitmap.h>
  16#include <linux/slab.h>
  17#include <linux/debugfs.h>
  18#include <linux/scatterlist.h>
  19#include <linux/dma-map-ops.h>
  20#include <linux/dma-direct.h>
  21#include <linux/idr.h>
  22#include <linux/iommu-helper.h>
  23#include <linux/delay.h>
  24#include <linux/amd-iommu.h>
  25#include <linux/notifier.h>
  26#include <linux/export.h>
  27#include <linux/irq.h>
  28#include <linux/msi.h>
  29#include <linux/irqdomain.h>
  30#include <linux/percpu.h>
  31#include <linux/io-pgtable.h>
  32#include <linux/cc_platform.h>
  33#include <asm/irq_remapping.h>
  34#include <asm/io_apic.h>
  35#include <asm/apic.h>
  36#include <asm/hw_irq.h>
  37#include <asm/proto.h>
  38#include <asm/iommu.h>
  39#include <asm/gart.h>
  40#include <asm/dma.h>
  41#include <uapi/linux/iommufd.h>
  42
  43#include "amd_iommu.h"
  44#include "../dma-iommu.h"
  45#include "../irq_remapping.h"
  46#include "../iommu-pages.h"
  47
  48#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
  49
  50/* Reserved IOVA ranges */
  51#define MSI_RANGE_START		(0xfee00000)
  52#define MSI_RANGE_END		(0xfeefffff)
  53#define HT_RANGE_START		(0xfd00000000ULL)
  54#define HT_RANGE_END		(0xffffffffffULL)
  55
  56LIST_HEAD(ioapic_map);
  57LIST_HEAD(hpet_map);
  58LIST_HEAD(acpihid_map);
  59
  60const struct iommu_ops amd_iommu_ops;
  61static const struct iommu_dirty_ops amd_dirty_ops;
  62
  63int amd_iommu_max_glx_val = -1;
  64
  65/*
  66 * general struct to manage commands sent to an IOMMU
  67 */
  68struct iommu_cmd {
  69	u32 data[4];
  70};
  71
  72/*
  73 * AMD IOMMU allows up to 2^16 different protection domains. This IDA
  74 * tracks which domain IDs are already in use.
  75 */
  76DEFINE_IDA(pdom_ids);
  77
  78struct kmem_cache *amd_iommu_irq_cache;
  79
  80static int amd_iommu_attach_device(struct iommu_domain *dom,
  81				   struct device *dev);
  82
  83static void set_dte_entry(struct amd_iommu *iommu,
  84			  struct iommu_dev_data *dev_data);
  85
  86/****************************************************************************
  87 *
  88 * Helper functions
  89 *
  90 ****************************************************************************/
  91
  92static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
  93{
  94	return (pdom && (pdom->pd_mode == PD_MODE_V2));
  95}
  96
  97static inline bool pdom_is_in_pt_mode(struct protection_domain *pdom)
  98{
  99	return (pdom->domain.type == IOMMU_DOMAIN_IDENTITY);
 100}
 101
 102/*
 103 * We cannot support PASID with an existing v1 page table in the same domain,
 104 * since it would have to be nested. However, an existing domain with a v2 page
 105 * table or in passthrough mode can be used for PASID.
 106 */
 107static inline bool pdom_is_sva_capable(struct protection_domain *pdom)
 108{
 109	return pdom_is_v2_pgtbl_mode(pdom) || pdom_is_in_pt_mode(pdom);
 110}
 111
 112static inline int get_acpihid_device_id(struct device *dev,
 113					struct acpihid_map_entry **entry)
 114{
 115	struct acpi_device *adev = ACPI_COMPANION(dev);
 116	struct acpihid_map_entry *p;
 117
 118	if (!adev)
 119		return -ENODEV;
 120
 121	list_for_each_entry(p, &acpihid_map, list) {
 122		if (acpi_dev_hid_uid_match(adev, p->hid,
 123					   p->uid[0] ? p->uid : NULL)) {
 124			if (entry)
 125				*entry = p;
 126			return p->devid;
 127		}
 128	}
 129	return -EINVAL;
 130}
 131
 132static inline int get_device_sbdf_id(struct device *dev)
 133{
 134	int sbdf;
 135
 136	if (dev_is_pci(dev))
 137		sbdf = get_pci_sbdf_id(to_pci_dev(dev));
 138	else
 139		sbdf = get_acpihid_device_id(dev, NULL);
 140
 141	return sbdf;
 142}
 143
 144struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
 145{
 146	struct dev_table_entry *dev_table;
 147	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 148
 149	BUG_ON(pci_seg == NULL);
 150	dev_table = pci_seg->dev_table;
 151	BUG_ON(dev_table == NULL);
 152
 153	return dev_table;
 154}
 155
 156static inline u16 get_device_segment(struct device *dev)
 157{
 158	u16 seg;
 159
 160	if (dev_is_pci(dev)) {
 161		struct pci_dev *pdev = to_pci_dev(dev);
 162
 163		seg = pci_domain_nr(pdev->bus);
 164	} else {
 165		u32 devid = get_acpihid_device_id(dev, NULL);
 166
 167		seg = PCI_SBDF_TO_SEGID(devid);
 168	}
 169
 170	return seg;
 171}
 172
 173/* Writes the specific IOMMU for a device into the PCI segment rlookup table */
 174void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
 175{
 176	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 177
 178	pci_seg->rlookup_table[devid] = iommu;
 179}
 180
 181static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
 182{
 183	struct amd_iommu_pci_seg *pci_seg;
 184
 185	for_each_pci_segment(pci_seg) {
 186		if (pci_seg->id == seg)
 187			return pci_seg->rlookup_table[devid];
 188	}
 189	return NULL;
 190}
 191
 192static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
 193{
 194	u16 seg = get_device_segment(dev);
 195	int devid = get_device_sbdf_id(dev);
 196
 197	if (devid < 0)
 198		return NULL;
 199	return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
 200}
 201
 202static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
 203{
 204	struct iommu_dev_data *dev_data;
 205	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 206
 207	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
 208	if (!dev_data)
 209		return NULL;
 210
 211	mutex_init(&dev_data->mutex);
 212	dev_data->devid = devid;
 213	ratelimit_default_init(&dev_data->rs);
 214
 215	llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
 216	return dev_data;
 217}
 218
 219static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
 220{
 221	struct iommu_dev_data *dev_data;
 222	struct llist_node *node;
 223	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 224
 225	if (llist_empty(&pci_seg->dev_data_list))
 226		return NULL;
 227
 228	node = pci_seg->dev_data_list.first;
 229	llist_for_each_entry(dev_data, node, dev_data_list) {
 230		if (dev_data->devid == devid)
 231			return dev_data;
 232	}
 233
 234	return NULL;
 235}
 236
 237static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
 238{
 239	struct amd_iommu *iommu;
 240	struct dev_table_entry *dev_table;
 241	u16 devid = pci_dev_id(pdev);
 242
 243	if (devid == alias)
 244		return 0;
 245
 246	iommu = rlookup_amd_iommu(&pdev->dev);
 247	if (!iommu)
 248		return 0;
 249
 250	amd_iommu_set_rlookup_table(iommu, alias);
 251	dev_table = get_dev_table(iommu);
 252	memcpy(dev_table[alias].data,
 253	       dev_table[devid].data,
 254	       sizeof(dev_table[alias].data));
 255
 256	return 0;
 257}
 258
 259static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
 260{
 261	struct pci_dev *pdev;
 262
 263	if (!dev_is_pci(dev))
 264		return;
 265	pdev = to_pci_dev(dev);
 266
 267	/*
 268	 * The IVRS alias stored in the alias table may not be
 269	 * part of the PCI DMA aliases if its bus differs
 270	 * from that of the original device.
 271	 */
 272	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);
 273
 274	pci_for_each_dma_alias(pdev, clone_alias, NULL);
 275}
 276
 277static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
 278{
 279	struct pci_dev *pdev = to_pci_dev(dev);
 280	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 281	u16 ivrs_alias;
 282
 283	/* For ACPI HID devices, there are no aliases */
 284	if (!dev_is_pci(dev))
 285		return;
 286
 287	/*
 288	 * Add the IVRS alias to the pci aliases if it is on the same
 289	 * bus. The IVRS table may know about a quirk that we don't.
 290	 */
 291	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
 292	if (ivrs_alias != pci_dev_id(pdev) &&
 293	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
 294		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
 295
 296	clone_aliases(iommu, dev);
 297}
 298
 299static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
 300{
 301	struct iommu_dev_data *dev_data;
 302
 303	dev_data = search_dev_data(iommu, devid);
 304
 305	if (dev_data == NULL) {
 306		dev_data = alloc_dev_data(iommu, devid);
 307		if (!dev_data)
 308			return NULL;
 309
 310		if (translation_pre_enabled(iommu))
 311			dev_data->defer_attach = true;
 312	}
 313
 314	return dev_data;
 315}
 316
 317/*
 318 * Find or create an IOMMU group for an acpihid device.
 319 */
 320static struct iommu_group *acpihid_device_group(struct device *dev)
 321{
 322	struct acpihid_map_entry *p, *entry = NULL;
 323	int devid;
 324
 325	devid = get_acpihid_device_id(dev, &entry);
 326	if (devid < 0)
 327		return ERR_PTR(devid);
 328
 329	list_for_each_entry(p, &acpihid_map, list) {
 330		if ((devid == p->devid) && p->group)
 331			entry->group = p->group;
 332	}
 333
 334	if (!entry->group)
 335		entry->group = generic_device_group(dev);
 336	else
 337		iommu_group_ref_get(entry->group);
 338
 339	return entry->group;
 340}
 341
 342static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data)
 343{
 344	return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP);
 345}
 346
 347static u32 pdev_get_caps(struct pci_dev *pdev)
 348{
 349	int features;
 350	u32 flags = 0;
 351
 352	if (pci_ats_supported(pdev))
 353		flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
 354
 355	if (pci_pri_supported(pdev))
 356		flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
 357
 358	features = pci_pasid_features(pdev);
 359	if (features >= 0) {
 360		flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
 361
 362		if (features & PCI_PASID_CAP_EXEC)
 363			flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
 364
 365		if (features & PCI_PASID_CAP_PRIV)
 366			flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
 367	}
 368
 369	return flags;
 370}
 371
 372static inline int pdev_enable_cap_ats(struct pci_dev *pdev)
 373{
 374	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 375	int ret = -EINVAL;
 376
 377	if (dev_data->ats_enabled)
 378		return 0;
 379
 380	if (amd_iommu_iotlb_sup &&
 381	    (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) {
 382		ret = pci_enable_ats(pdev, PAGE_SHIFT);
 383		if (!ret) {
 384			dev_data->ats_enabled = 1;
 385			dev_data->ats_qdep    = pci_ats_queue_depth(pdev);
 386		}
 387	}
 388
 389	return ret;
 390}
 391
 392static inline void pdev_disable_cap_ats(struct pci_dev *pdev)
 393{
 394	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 395
 396	if (dev_data->ats_enabled) {
 397		pci_disable_ats(pdev);
 398		dev_data->ats_enabled = 0;
 399	}
 400}
 401
 402static inline int pdev_enable_cap_pri(struct pci_dev *pdev)
 403{
 404	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 405	int ret = -EINVAL;
 406
 407	if (dev_data->pri_enabled)
 408		return 0;
 409
 410	if (!dev_data->ats_enabled)
 411		return 0;
 412
 413	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) {
 414		/*
 415		 * First reset the PRI state of the device.
 416		 * FIXME: Hardcode number of outstanding requests for now
 417		 */
 418		if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) {
 419			dev_data->pri_enabled = 1;
 420			dev_data->pri_tlp     = pci_prg_resp_pasid_required(pdev);
 421
 422			ret = 0;
 423		}
 424	}
 425
 426	return ret;
 427}
 428
 429static inline void pdev_disable_cap_pri(struct pci_dev *pdev)
 430{
 431	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 432
 433	if (dev_data->pri_enabled) {
 434		pci_disable_pri(pdev);
 435		dev_data->pri_enabled = 0;
 436	}
 437}
 438
 439static inline int pdev_enable_cap_pasid(struct pci_dev *pdev)
 440{
 441	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 442	int ret = -EINVAL;
 443
 444	if (dev_data->pasid_enabled)
 445		return 0;
 446
 447	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) {
 448		/* Only allow access to user-accessible pages */
 449		ret = pci_enable_pasid(pdev, 0);
 450		if (!ret)
 451			dev_data->pasid_enabled = 1;
 452	}
 453
 454	return ret;
 455}
 456
 457static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
 458{
 459	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
 460
 461	if (dev_data->pasid_enabled) {
 462		pci_disable_pasid(pdev);
 463		dev_data->pasid_enabled = 0;
 464	}
 465}
 466
 467static void pdev_enable_caps(struct pci_dev *pdev)
 468{
 469	pdev_enable_cap_ats(pdev);
 470	pdev_enable_cap_pasid(pdev);
 471	pdev_enable_cap_pri(pdev);
 472}
 473
 474static void pdev_disable_caps(struct pci_dev *pdev)
 475{
 476	pdev_disable_cap_ats(pdev);
 477	pdev_disable_cap_pasid(pdev);
 478	pdev_disable_cap_pri(pdev);
 479}
 480
 481/*
 482 * This function checks if the driver got a valid device from the caller to
 483 * avoid dereferencing invalid pointers.
 484 */
 485static bool check_device(struct device *dev)
 486{
 487	struct amd_iommu_pci_seg *pci_seg;
 488	struct amd_iommu *iommu;
 489	int devid, sbdf;
 490
 491	if (!dev)
 492		return false;
 493
 494	sbdf = get_device_sbdf_id(dev);
 495	if (sbdf < 0)
 496		return false;
 497	devid = PCI_SBDF_TO_DEVID(sbdf);
 498
 499	iommu = rlookup_amd_iommu(dev);
 500	if (!iommu)
 501		return false;
 502
 503	/* Out of our scope? */
 504	pci_seg = iommu->pci_seg;
 505	if (devid > pci_seg->last_bdf)
 506		return false;
 507
 508	return true;
 509}
 510
 511static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
 512{
 513	struct iommu_dev_data *dev_data;
 514	int devid, sbdf;
 515
 516	if (dev_iommu_priv_get(dev))
 517		return 0;
 518
 519	sbdf = get_device_sbdf_id(dev);
 520	if (sbdf < 0)
 521		return sbdf;
 522
 523	devid = PCI_SBDF_TO_DEVID(sbdf);
 524	dev_data = find_dev_data(iommu, devid);
 525	if (!dev_data)
 526		return -ENOMEM;
 527
 528	dev_data->dev = dev;
 529	setup_aliases(iommu, dev);
 530
 531	/*
 532	 * By default we use passthrough mode for IOMMUv2 capable devices.
 533	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to an
 534	 * invalid address), we ignore the capability for the device so it
 535	 * will be forced into translation mode.
 536	 */
 537	if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
 538	    dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) {
 539		dev_data->flags = pdev_get_caps(to_pci_dev(dev));
 540	}
 541
 542	dev_iommu_priv_set(dev, dev_data);
 543
 544	return 0;
 545}
 546
 547static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
 548{
 549	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
 550	struct dev_table_entry *dev_table = get_dev_table(iommu);
 551	int devid, sbdf;
 552
 553	sbdf = get_device_sbdf_id(dev);
 554	if (sbdf < 0)
 555		return;
 556
 557	devid = PCI_SBDF_TO_DEVID(sbdf);
 558	pci_seg->rlookup_table[devid] = NULL;
 559	memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
 560
 561	setup_aliases(iommu, dev);
 562}
 563
 564
 565/****************************************************************************
 566 *
 567 * Interrupt handling functions
 568 *
 569 ****************************************************************************/
 570
 571static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
 572{
 573	int i;
 574	struct dev_table_entry *dev_table = get_dev_table(iommu);
 575
 576	for (i = 0; i < 4; ++i)
 577		pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
 578}
 579
 580static void dump_command(unsigned long phys_addr)
 581{
 582	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
 583	int i;
 584
 585	for (i = 0; i < 4; ++i)
 586		pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
 587}
 588
 589static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
 590{
 591	struct iommu_dev_data *dev_data = NULL;
 592	int devid, vmg_tag, flags;
 593	struct pci_dev *pdev;
 594	u64 spa;
 595
 596	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
 597	vmg_tag = (event[1]) & 0xFFFF;
 598	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 599	spa     = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
 600
 601	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
 602					   devid & 0xff);
 603	if (pdev)
 604		dev_data = dev_iommu_priv_get(&pdev->dev);
 605
 606	if (dev_data) {
 607		if (__ratelimit(&dev_data->rs)) {
 608			pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
 609				vmg_tag, spa, flags);
 610		}
 611	} else {
 612		pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
 613			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 614			vmg_tag, spa, flags);
 615	}
 616
 617	if (pdev)
 618		pci_dev_put(pdev);
 619}
 620
 621static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
 622{
 623	struct iommu_dev_data *dev_data = NULL;
 624	int devid, flags_rmp, vmg_tag, flags;
 625	struct pci_dev *pdev;
 626	u64 gpa;
 627
 628	devid     = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
 629	flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
 630	vmg_tag   = (event[1]) & 0xFFFF;
 631	flags     = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 632	gpa       = ((u64)event[3] << 32) | event[2];
 633
 634	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
 635					   devid & 0xff);
 636	if (pdev)
 637		dev_data = dev_iommu_priv_get(&pdev->dev);
 638
 639	if (dev_data) {
 640		if (__ratelimit(&dev_data->rs)) {
 641			pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
 642				vmg_tag, gpa, flags_rmp, flags);
 643		}
 644	} else {
 645		pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
 646			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 647			vmg_tag, gpa, flags_rmp, flags);
 648	}
 649
 650	if (pdev)
 651		pci_dev_put(pdev);
 652}
 653
 654#define IS_IOMMU_MEM_TRANSACTION(flags)		\
 655	(((flags) & EVENT_FLAG_I) == 0)
 656
 657#define IS_WRITE_REQUEST(flags)			\
 658	((flags) & EVENT_FLAG_RW)
 659
 660static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
 661					u16 devid, u16 domain_id,
 662					u64 address, int flags)
 663{
 664	struct iommu_dev_data *dev_data = NULL;
 665	struct pci_dev *pdev;
 666
 667	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
 668					   devid & 0xff);
 669	if (pdev)
 670		dev_data = dev_iommu_priv_get(&pdev->dev);
 671
 672	if (dev_data) {
 673		/*
 674		 * If this is a DMA fault (for which the I(nterrupt)
 675		 * bit will be unset), allow report_iommu_fault() to
 676		 * prevent logging it.
 677		 */
 678		if (IS_IOMMU_MEM_TRANSACTION(flags)) {
 679			/* Device not attached to domain properly */
 680			if (dev_data->domain == NULL) {
 681				pr_err_ratelimited("Event logged [Device not attached to domain properly]\n");
 682				pr_err_ratelimited("  device=%04x:%02x:%02x.%x domain=0x%04x\n",
 683						   iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
 684						   PCI_FUNC(devid), domain_id);
 685				goto out;
 686			}
 687
 688			if (!report_iommu_fault(&dev_data->domain->domain,
 689						&pdev->dev, address,
 690						IS_WRITE_REQUEST(flags) ?
 691							IOMMU_FAULT_WRITE :
 692							IOMMU_FAULT_READ))
 693				goto out;
 694		}
 695
 696		if (__ratelimit(&dev_data->rs)) {
 697			pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
 698				domain_id, address, flags);
 699		}
 700	} else {
 701		pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
 702			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 703			domain_id, address, flags);
 704	}
 705
 706out:
 707	if (pdev)
 708		pci_dev_put(pdev);
 709}
 710
 711static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
 712{
 713	struct device *dev = iommu->iommu.dev;
 714	int type, devid, flags, tag;
 715	volatile u32 *event = __evt;
 716	int count = 0;
 717	u64 address;
 718	u32 pasid;
 719
 720retry:
 721	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
 722	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
 723	pasid   = (event[0] & EVENT_DOMID_MASK_HI) |
 724		  (event[1] & EVENT_DOMID_MASK_LO);
 725	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 726	address = (u64)(((u64)event[3]) << 32) | event[2];
 727
 728	if (type == 0) {
 729		/* Did we hit the erratum? */
 730		if (++count == LOOP_TIMEOUT) {
 731			pr_err("No event written to event log\n");
 732			return;
 733		}
 734		udelay(1);
 735		goto retry;
 736	}
 737
 738	if (type == EVENT_TYPE_IO_FAULT) {
 739		amd_iommu_report_page_fault(iommu, devid, pasid, address, flags);
 740		return;
 741	}
 742
 743	switch (type) {
 744	case EVENT_TYPE_ILL_DEV:
 745		dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
 746			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 747			pasid, address, flags);
 748		dump_dte_entry(iommu, devid);
 749		break;
 750	case EVENT_TYPE_DEV_TAB_ERR:
 751		dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
 752			"address=0x%llx flags=0x%04x]\n",
 753			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 754			address, flags);
 755		break;
 756	case EVENT_TYPE_PAGE_TAB_ERR:
 757		dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
 758			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 759			pasid, address, flags);
 760		break;
 761	case EVENT_TYPE_ILL_CMD:
 762		dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
 763		dump_command(address);
 764		break;
 765	case EVENT_TYPE_CMD_HARD_ERR:
 766		dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
 767			address, flags);
 768		break;
 769	case EVENT_TYPE_IOTLB_INV_TO:
 770		dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
 771			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 772			address);
 773		break;
 774	case EVENT_TYPE_INV_DEV_REQ:
 775		dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
 776			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 777			pasid, address, flags);
 778		break;
 779	case EVENT_TYPE_RMP_FAULT:
 780		amd_iommu_report_rmp_fault(iommu, event);
 781		break;
 782	case EVENT_TYPE_RMP_HW_ERR:
 783		amd_iommu_report_rmp_hw_error(iommu, event);
 784		break;
 785	case EVENT_TYPE_INV_PPR_REQ:
 786		pasid = PPR_PASID(*((u64 *)__evt));
 787		tag = event[1] & 0x03FF;
 788		dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
 789			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 790			pasid, address, flags, tag);
 791		break;
 792	default:
 793		dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
 794			event[0], event[1], event[2], event[3]);
 795	}
 796
 797	/*
 798	 * To detect hardware erratum 732 we need to clear the entry
 799	 * back to zero. This issue does not exist on SNP-enabled
 800	 * systems; the event buffer is also not writeable on such
 801	 * systems.
 802	 */
 803	if (!amd_iommu_snp_en)
 804		memset(__evt, 0, 4 * sizeof(u32));
 805}
 806
 807static void iommu_poll_events(struct amd_iommu *iommu)
 808{
 809	u32 head, tail;
 810
 811	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
 812	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
 813
 814	while (head != tail) {
 815		iommu_print_event(iommu, iommu->evt_buf + head);
 816
 817		/* Update head pointer of hardware ring-buffer */
 818		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
 819		writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
 820	}
 821
 822}
 823
 824#ifdef CONFIG_IRQ_REMAP
 825static int (*iommu_ga_log_notifier)(u32);
 826
 827int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
 828{
 829	iommu_ga_log_notifier = notifier;
 830
 831	return 0;
 832}
 833EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
 834
 835static void iommu_poll_ga_log(struct amd_iommu *iommu)
 836{
 837	u32 head, tail;
 838
 839	if (iommu->ga_log == NULL)
 840		return;
 841
 842	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
 843	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
 844
 845	while (head != tail) {
 846		volatile u64 *raw;
 847		u64 log_entry;
 848
 849		raw = (u64 *)(iommu->ga_log + head);
 850
 851		/* Avoid memcpy function-call overhead */
 852		log_entry = *raw;
 853
 854		/* Update head pointer of hardware ring-buffer */
 855		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
 856		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
 857
 858		/* Handle GA entry */
 859		switch (GA_REQ_TYPE(log_entry)) {
 860		case GA_GUEST_NR:
 861			if (!iommu_ga_log_notifier)
 862				break;
 863
 864			pr_debug("%s: devid=%#x, ga_tag=%#x\n",
 865				 __func__, GA_DEVID(log_entry),
 866				 GA_TAG(log_entry));
 867
 868			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
 869				pr_err("GA log notifier failed.\n");
 870			break;
 871		default:
 872			break;
 873		}
 874	}
 875}
 876
 877static void
 878amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
 879{
 880	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
 881	    !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
 882		return;
 883
 884	dev_set_msi_domain(dev, iommu->ir_domain);
 885}
 886
 887#else /* CONFIG_IRQ_REMAP */
 888static inline void
 889amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
 890#endif /* !CONFIG_IRQ_REMAP */
 891
 892static void amd_iommu_handle_irq(void *data, const char *evt_type,
 893				 u32 int_mask, u32 overflow_mask,
 894				 void (*int_handler)(struct amd_iommu *),
 895				 void (*overflow_handler)(struct amd_iommu *))
 896{
 897	struct amd_iommu *iommu = (struct amd_iommu *) data;
 898	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
 899	u32 mask = int_mask | overflow_mask;
 900
 901	while (status & mask) {
 902		/* Enable interrupt sources again */
 903		writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET);
 904
 905		if (int_handler) {
 906			pr_devel("Processing IOMMU (ivhd%d) %s Log\n",
 907				 iommu->index, evt_type);
 908			int_handler(iommu);
 909		}
 910
 911		if ((status & overflow_mask) && overflow_handler)
 912			overflow_handler(iommu);
 913
 914		/*
 915		 * Hardware bug: ERBT1312
 916		 * When re-enabling the interrupt (by writing 1
 917		 * to clear the bit), the hardware might also set the
 918		 * interrupt bit in the event status register again.
 919		 * In that case the bit stays set and disables
 920		 * subsequent interrupts.
 921		 *
 922		 * Workaround: the IOMMU driver reads back the
 923		 * status register and checks whether the interrupt bits are
 924		 * cleared. If not, it goes through the interrupt handling
 925		 * path again and re-clears the bits.
 926		 */
 927		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
 928	}
 929}
 930
 931irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
 932{
 933	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
 934			     MMIO_STATUS_EVT_OVERFLOW_MASK,
 935			     iommu_poll_events, amd_iommu_restart_event_logging);
 936
 937	return IRQ_HANDLED;
 938}
 939
 940irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
 941{
 942	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
 943			     MMIO_STATUS_PPR_OVERFLOW_MASK,
 944			     amd_iommu_poll_ppr_log, amd_iommu_restart_ppr_log);
 945
 946	return IRQ_HANDLED;
 947}
 948
 949irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
 950{
 951#ifdef CONFIG_IRQ_REMAP
 952	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
 953			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
 954			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
 955#endif
 956
 957	return IRQ_HANDLED;
 958}
 959
 960irqreturn_t amd_iommu_int_thread(int irq, void *data)
 961{
 962	amd_iommu_int_thread_evtlog(irq, data);
 963	amd_iommu_int_thread_pprlog(irq, data);
 964	amd_iommu_int_thread_galog(irq, data);
 965
 966	return IRQ_HANDLED;
 967}
 968
 969irqreturn_t amd_iommu_int_handler(int irq, void *data)
 970{
 971	return IRQ_WAKE_THREAD;
 972}
 973
 974/****************************************************************************
 975 *
 976 * IOMMU command queuing functions
 977 *
 978 ****************************************************************************/
 979
 980static int wait_on_sem(struct amd_iommu *iommu, u64 data)
 981{
 982	int i = 0;
 983
 984	while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
 985		udelay(1);
 986		i += 1;
 987	}
 988
 989	if (i == LOOP_TIMEOUT) {
 990		pr_alert("Completion-Wait loop timed out\n");
 991		return -EIO;
 992	}
 993
 994	return 0;
 995}
 996
 997static void copy_cmd_to_buffer(struct amd_iommu *iommu,
 998			       struct iommu_cmd *cmd)
 999{
1000	u8 *target;
1001	u32 tail;
1002
1003	/* Copy command to buffer */
1004	tail = iommu->cmd_buf_tail;
1005	target = iommu->cmd_buf + tail;
1006	memcpy(target, cmd, sizeof(*cmd));
1007
1008	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1009	iommu->cmd_buf_tail = tail;
1010
1011	/* Tell the IOMMU about it */
1012	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
1013}
1014
1015static void build_completion_wait(struct iommu_cmd *cmd,
1016				  struct amd_iommu *iommu,
1017				  u64 data)
1018{
1019	u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
1020
1021	memset(cmd, 0, sizeof(*cmd));
1022	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
1023	cmd->data[1] = upper_32_bits(paddr);
1024	cmd->data[2] = lower_32_bits(data);
1025	cmd->data[3] = upper_32_bits(data);
1026	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
1027}
1028
1029static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
1030{
1031	memset(cmd, 0, sizeof(*cmd));
1032	cmd->data[0] = devid;
1033	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
1034}
1035
1036/*
1037 * Builds an invalidation address which is suitable for one page or multiple
1038 * pages. Sets the size bit (S) as needed if more than one page is flushed.
1039 */
1040static inline u64 build_inv_address(u64 address, size_t size)
1041{
1042	u64 pages, end, msb_diff;
1043
1044	pages = iommu_num_pages(address, size, PAGE_SIZE);
1045
1046	if (pages == 1)
1047		return address & PAGE_MASK;
1048
1049	end = address + size - 1;
1050
1051	/*
1052	 * msb_diff holds the index of the most significant bit that
1053	 * flipped between the start and end.
1054	 */
1055	msb_diff = fls64(end ^ address) - 1;
1056
1057	/*
1058	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
1059	 * between the start and the end, invalidate everything.
1060	 */
1061	if (unlikely(msb_diff > 51)) {
1062		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
1063	} else {
1064		/*
1065		 * The msb-bit must be clear on the address. Just set all the
1066		 * lower bits.
1067		 */
1068		address |= (1ull << msb_diff) - 1;
1069	}
1070
1071	/* Clear bits 11:0 */
1072	address &= PAGE_MASK;
1073
1074	/* Set the size bit - we flush more than one 4kb page */
1075	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
1076}
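    /*
     * Illustrative walk-through of the code above (example values only):
     * for address = 0x10000 and size = 0x3000, end = 0x12fff and
     * msb_diff = fls64(0x10000 ^ 0x12fff) - 1 = 13. ORing in the low 13
     * bits gives 0x11fff, masking with PAGE_MASK yields 0x11000, and the
     * size bit is set, so one command covers at least the 3-page range.
     */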
1077
1078static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
1079				  size_t size, u16 domid,
1080				  ioasid_t pasid, bool gn)
1081{
1082	u64 inv_address = build_inv_address(address, size);
1083
1084	memset(cmd, 0, sizeof(*cmd));
1085
1086	cmd->data[1] |= domid;
1087	cmd->data[2]  = lower_32_bits(inv_address);
1088	cmd->data[3]  = upper_32_bits(inv_address);
1089	/* PDE bit - we want to flush everything, not only the PTEs */
1090	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1091	if (gn) {
1092		cmd->data[0] |= pasid;
1093		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1094	}
1095	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1096}
1097
1098static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
1099				  u64 address, size_t size,
1100				  ioasid_t pasid, bool gn)
1101{
1102	u64 inv_address = build_inv_address(address, size);
1103
1104	memset(cmd, 0, sizeof(*cmd));
1105
1106	cmd->data[0]  = devid;
1107	cmd->data[0] |= (qdep & 0xff) << 24;
1108	cmd->data[1]  = devid;
1109	cmd->data[2]  = lower_32_bits(inv_address);
1110	cmd->data[3]  = upper_32_bits(inv_address);
1111	if (gn) {
1112		cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
1113		cmd->data[1] |= (pasid & 0xff) << 16;
1114		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1115	}
1116
1117	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1118}
1119
1120static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1121			       int status, int tag, u8 gn)
1122{
1123	memset(cmd, 0, sizeof(*cmd));
1124
1125	cmd->data[0]  = devid;
1126	if (gn) {
1127		cmd->data[1]  = pasid;
1128		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
1129	}
1130	cmd->data[3]  = tag & 0x1ff;
1131	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
1132
1133	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
1134}
1135
1136static void build_inv_all(struct iommu_cmd *cmd)
1137{
1138	memset(cmd, 0, sizeof(*cmd));
1139	CMD_SET_TYPE(cmd, CMD_INV_ALL);
1140}
1141
1142static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
1143{
1144	memset(cmd, 0, sizeof(*cmd));
1145	cmd->data[0] = devid;
1146	CMD_SET_TYPE(cmd, CMD_INV_IRT);
1147}
1148
1149/*
1150 * Writes the command to the IOMMU's command buffer and informs the
1151 * hardware about the new command.
1152 */
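    /*
     * A rough sketch of the free-space check below (values illustrative):
     * with 16-byte commands (struct iommu_cmd is four u32s) and
     * head == tail, i.e. an empty ring, next_tail = tail + 16 and
     * left = (head - next_tail) % CMD_BUFFER_SIZE wraps around to
     * CMD_BUFFER_SIZE - 16 for a power-of-two buffer size, so almost the
     * whole buffer is free. As the tail approaches the hardware head
     * pointer, 'left' shrinks; the 0x20 threshold keeps a small gap so
     * the tail never catches up with the head, which would otherwise be
     * indistinguishable from an empty buffer.
     */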
1153static int __iommu_queue_command_sync(struct amd_iommu *iommu,
1154				      struct iommu_cmd *cmd,
1155				      bool sync)
1156{
1157	unsigned int count = 0;
1158	u32 left, next_tail;
1159
1160	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1161again:
1162	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
1163
1164	if (left <= 0x20) {
1165		/* Skip udelay() the first time around */
1166		if (count++) {
1167			if (count == LOOP_TIMEOUT) {
1168				pr_err("Command buffer timeout\n");
1169				return -EIO;
1170			}
1171
1172			udelay(1);
1173		}
1174
1175		/* Update head and recheck remaining space */
1176		iommu->cmd_buf_head = readl(iommu->mmio_base +
1177					    MMIO_CMD_HEAD_OFFSET);
1178
1179		goto again;
1180	}
1181
1182	copy_cmd_to_buffer(iommu, cmd);
1183
1184	/* Do we need to make sure all commands are processed? */
1185	iommu->need_sync = sync;
1186
1187	return 0;
1188}
1189
1190static int iommu_queue_command_sync(struct amd_iommu *iommu,
1191				    struct iommu_cmd *cmd,
1192				    bool sync)
1193{
1194	unsigned long flags;
1195	int ret;
1196
1197	raw_spin_lock_irqsave(&iommu->lock, flags);
1198	ret = __iommu_queue_command_sync(iommu, cmd, sync);
1199	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1200
1201	return ret;
1202}
1203
1204static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1205{
1206	return iommu_queue_command_sync(iommu, cmd, true);
1207}
1208
1209/*
1210 * This function queues a completion wait command into the command
1211 * buffer of an IOMMU
1212 */
1213static int iommu_completion_wait(struct amd_iommu *iommu)
1214{
1215	struct iommu_cmd cmd;
1216	unsigned long flags;
1217	int ret;
1218	u64 data;
1219
1220	if (!iommu->need_sync)
1221		return 0;
1222
1223	data = atomic64_inc_return(&iommu->cmd_sem_val);
1224	build_completion_wait(&cmd, iommu, data);
1225
1226	raw_spin_lock_irqsave(&iommu->lock, flags);
1227
1228	ret = __iommu_queue_command_sync(iommu, &cmd, false);
1229	if (ret)
1230		goto out_unlock;
1231
1232	ret = wait_on_sem(iommu, data);
1233
1234out_unlock:
1235	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1236
1237	return ret;
1238}
1239
1240static void domain_flush_complete(struct protection_domain *domain)
1241{
1242	struct pdom_iommu_info *pdom_iommu_info;
1243	unsigned long i;
1244
1245	lockdep_assert_held(&domain->lock);
1246
1247	/*
1248	 * Devices of this domain are behind this IOMMU;
1249	 * we need to wait for completion of all commands.
1250	 */
1251	xa_for_each(&domain->iommu_array, i, pdom_iommu_info)
1252		iommu_completion_wait(pdom_iommu_info->iommu);
1253}
1254
1255static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
1256{
1257	struct iommu_cmd cmd;
1258
1259	build_inv_dte(&cmd, devid);
1260
1261	return iommu_queue_command(iommu, &cmd);
1262}
1263
1264static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
1265{
1266	u32 devid;
1267	u16 last_bdf = iommu->pci_seg->last_bdf;
1268
1269	for (devid = 0; devid <= last_bdf; ++devid)
1270		iommu_flush_dte(iommu, devid);
1271
1272	iommu_completion_wait(iommu);
1273}
1274
1275/*
1276 * This function uses heavy locking and may disable irqs for some time. But
1277 * this is no issue because it is only called during resume.
1278 */
1279static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
1280{
1281	u32 dom_id;
1282	u16 last_bdf = iommu->pci_seg->last_bdf;
1283
1284	for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
1285		struct iommu_cmd cmd;
1286		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1287				      dom_id, IOMMU_NO_PASID, false);
1288		iommu_queue_command(iommu, &cmd);
1289	}
1290
1291	iommu_completion_wait(iommu);
1292}
1293
1294static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
1295{
1296	struct iommu_cmd cmd;
1297
1298	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1299			      dom_id, IOMMU_NO_PASID, false);
1300	iommu_queue_command(iommu, &cmd);
1301
1302	iommu_completion_wait(iommu);
1303}
1304
1305static void amd_iommu_flush_all(struct amd_iommu *iommu)
1306{
1307	struct iommu_cmd cmd;
1308
1309	build_inv_all(&cmd);
1310
1311	iommu_queue_command(iommu, &cmd);
1312	iommu_completion_wait(iommu);
1313}
1314
1315static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1316{
1317	struct iommu_cmd cmd;
1318
1319	build_inv_irt(&cmd, devid);
1320
1321	iommu_queue_command(iommu, &cmd);
1322}
1323
1324static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
1325{
1326	u32 devid;
1327	u16 last_bdf = iommu->pci_seg->last_bdf;
1328
1329	if (iommu->irtcachedis_enabled)
1330		return;
1331
1332	for (devid = 0; devid <= last_bdf; devid++)
1333		iommu_flush_irt(iommu, devid);
1334
1335	iommu_completion_wait(iommu);
1336}
1337
1338void amd_iommu_flush_all_caches(struct amd_iommu *iommu)
1339{
1340	if (check_feature(FEATURE_IA)) {
1341		amd_iommu_flush_all(iommu);
1342	} else {
1343		amd_iommu_flush_dte_all(iommu);
1344		amd_iommu_flush_irt_all(iommu);
1345		amd_iommu_flush_tlb_all(iommu);
1346	}
1347}
1348
1349/*
1350 * Command send function for flushing on-device TLB
1351 */
1352static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address,
1353			      size_t size, ioasid_t pasid, bool gn)
1354{
1355	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1356	struct iommu_cmd cmd;
1357	int qdep = dev_data->ats_qdep;
1358
1359	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address,
1360			      size, pasid, gn);
1361
1362	return iommu_queue_command(iommu, &cmd);
1363}
1364
1365static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
1366{
1367	struct amd_iommu *iommu = data;
1368
1369	return iommu_flush_dte(iommu, alias);
1370}
1371
1372/*
1373 * Command send function for invalidating a device table entry
1374 */
1375static int device_flush_dte(struct iommu_dev_data *dev_data)
1376{
1377	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1378	struct pci_dev *pdev = NULL;
1379	struct amd_iommu_pci_seg *pci_seg;
1380	u16 alias;
1381	int ret;
1382
1383	if (dev_is_pci(dev_data->dev))
1384		pdev = to_pci_dev(dev_data->dev);
1385
1386	if (pdev)
1387		ret = pci_for_each_dma_alias(pdev,
1388					     device_flush_dte_alias, iommu);
1389	else
1390		ret = iommu_flush_dte(iommu, dev_data->devid);
1391	if (ret)
1392		return ret;
1393
1394	pci_seg = iommu->pci_seg;
1395	alias = pci_seg->alias_table[dev_data->devid];
1396	if (alias != dev_data->devid) {
1397		ret = iommu_flush_dte(iommu, alias);
1398		if (ret)
1399			return ret;
1400	}
1401
1402	if (dev_data->ats_enabled) {
1403		/* Invalidate the entire contents of an IOTLB */
1404		ret = device_flush_iotlb(dev_data, 0, ~0UL,
1405					 IOMMU_NO_PASID, false);
1406	}
1407
1408	return ret;
1409}
1410
1411static int domain_flush_pages_v2(struct protection_domain *pdom,
1412				 u64 address, size_t size)
1413{
1414	struct iommu_dev_data *dev_data;
1415	struct iommu_cmd cmd;
1416	int ret = 0;
1417
1418	lockdep_assert_held(&pdom->lock);
1419	list_for_each_entry(dev_data, &pdom->dev_list, list) {
1420		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1421		u16 domid = dev_data->gcr3_info.domid;
1422
1423		build_inv_iommu_pages(&cmd, address, size,
1424				      domid, IOMMU_NO_PASID, true);
1425
1426		ret |= iommu_queue_command(iommu, &cmd);
1427	}
1428
1429	return ret;
1430}
1431
1432static int domain_flush_pages_v1(struct protection_domain *pdom,
1433				 u64 address, size_t size)
1434{
1435	struct pdom_iommu_info *pdom_iommu_info;
1436	struct iommu_cmd cmd;
1437	int ret = 0;
1438	unsigned long i;
1439
1440	lockdep_assert_held(&pdom->lock);
1441
1442	build_inv_iommu_pages(&cmd, address, size,
1443			      pdom->id, IOMMU_NO_PASID, false);
1444
1445	xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) {
1446		/*
1447		 * Devices of this domain are behind this IOMMU;
1448		 * we need a TLB flush.
1449		 */
1450		ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd);
1451	}
1452
1453	return ret;
1454}
1455
1456/*
1457 * TLB invalidation function which is called from the mapping functions.
1458 * It flushes range of PTEs of the domain.
1459 */
1460static void __domain_flush_pages(struct protection_domain *domain,
1461				 u64 address, size_t size)
1462{
1463	struct iommu_dev_data *dev_data;
1464	int ret = 0;
1465	ioasid_t pasid = IOMMU_NO_PASID;
1466	bool gn = false;
1467
1468	lockdep_assert_held(&domain->lock);
1469
1470	if (pdom_is_v2_pgtbl_mode(domain)) {
1471		gn = true;
1472		ret = domain_flush_pages_v2(domain, address, size);
1473	} else {
1474		ret = domain_flush_pages_v1(domain, address, size);
1475	}
1476
1477	list_for_each_entry(dev_data, &domain->dev_list, list) {
1478
1479		if (!dev_data->ats_enabled)
1480			continue;
1481
1482		ret |= device_flush_iotlb(dev_data, address, size, pasid, gn);
1483	}
1484
1485	WARN_ON(ret);
1486}
1487
1488void amd_iommu_domain_flush_pages(struct protection_domain *domain,
1489				  u64 address, size_t size)
1490{
1491	lockdep_assert_held(&domain->lock);
1492
1493	if (likely(!amd_iommu_np_cache)) {
1494		__domain_flush_pages(domain, address, size);
1495
1496		/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1497		domain_flush_complete(domain);
1498
1499		return;
1500	}
1501
1502	/*
1503	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
1504	 * In such setups it is best to avoid flushes of ranges which are not
1505	 * naturally aligned, since it would lead to flushes of unmodified
1506	 * PTEs. Such flushes would require the hypervisor to do more work than
1507	 * necessary. Therefore, perform repeated flushes of aligned ranges
1508	 * until the whole range is covered. Each iteration flushes the smaller
1509	 * of the natural alignment of the address that we flush and the
1510	 * greatest naturally aligned region that fits in the range.
1511	 */
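    	/*
    	 * Illustrative walk-through of the loop below (example values
    	 * only): flushing address = 0x5000, size = 0x3000 first takes
    	 * min(__ffs(0x5000), __fls(0x3000)) = min(12, 13) = 12, i.e. a
    	 * single 4K flush at 0x5000; the next iteration sees
    	 * address = 0x6000, size = 0x2000, where both alignments are 13,
    	 * so one 8K flush finishes the range.
    	 */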
1512	while (size != 0) {
1513		int addr_alignment = __ffs(address);
1514		int size_alignment = __fls(size);
1515		int min_alignment;
1516		size_t flush_size;
1517
1518		/*
1519		 * size is always non-zero, but address might be zero, causing
1520		 * addr_alignment to be negative. As the casting of the
1521		 * argument in __ffs(address) to long might trim the high bits
1522		 * of the address on x86-32, cast to long when doing the check.
1523		 */
1524		if (likely((unsigned long)address != 0))
1525			min_alignment = min(addr_alignment, size_alignment);
1526		else
1527			min_alignment = size_alignment;
1528
1529		flush_size = 1ul << min_alignment;
1530
1531		__domain_flush_pages(domain, address, flush_size);
1532		address += flush_size;
1533		size -= flush_size;
1534	}
1535
1536	/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1537	domain_flush_complete(domain);
1538}
1539
1540/* Flush the whole IO/TLB for a given protection domain - including PDE */
1541static void amd_iommu_domain_flush_all(struct protection_domain *domain)
1542{
1543	amd_iommu_domain_flush_pages(domain, 0,
1544				     CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
1545}
1546
1547void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
1548				     ioasid_t pasid, u64 address, size_t size)
1549{
1550	struct iommu_cmd cmd;
1551	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1552
1553	build_inv_iommu_pages(&cmd, address, size,
1554			      dev_data->gcr3_info.domid, pasid, true);
1555	iommu_queue_command(iommu, &cmd);
1556
1557	if (dev_data->ats_enabled)
1558		device_flush_iotlb(dev_data, address, size, pasid, true);
1559
1560	iommu_completion_wait(iommu);
1561}
1562
1563static void dev_flush_pasid_all(struct iommu_dev_data *dev_data,
1564				ioasid_t pasid)
1565{
1566	amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0,
1567					CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
1568}
1569
1570/* Flush the not present cache if it exists */
1571static void domain_flush_np_cache(struct protection_domain *domain,
1572		dma_addr_t iova, size_t size)
1573{
1574	if (unlikely(amd_iommu_np_cache)) {
1575		unsigned long flags;
1576
1577		spin_lock_irqsave(&domain->lock, flags);
1578		amd_iommu_domain_flush_pages(domain, iova, size);
1579		spin_unlock_irqrestore(&domain->lock, flags);
1580	}
1581}
1582
1583
1584/*
1585 * This function flushes the DTEs for all devices in the domain
1586 */
1587void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
1588{
1589	struct iommu_dev_data *dev_data;
1590
1591	lockdep_assert_held(&domain->lock);
1592
1593	list_for_each_entry(dev_data, &domain->dev_list, list) {
1594		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
1595
1596		set_dte_entry(iommu, dev_data);
1597		clone_aliases(iommu, dev_data->dev);
1598	}
1599
1600	list_for_each_entry(dev_data, &domain->dev_list, list)
1601		device_flush_dte(dev_data);
1602
1603	domain_flush_complete(domain);
1604}
1605
1606int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
1607{
1608	struct iommu_dev_data *dev_data;
1609	struct amd_iommu *iommu;
1610	struct iommu_cmd cmd;
1611
1612	dev_data = dev_iommu_priv_get(dev);
1613	iommu    = get_amd_iommu_from_dev(dev);
1614
1615	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
1616			   tag, dev_data->pri_tlp);
1617
1618	return iommu_queue_command(iommu, &cmd);
1619}
1620
1621/****************************************************************************
1622 *
1623 * The next functions belong to the domain allocation. A domain is
1624 * allocated for every IOMMU as the default domain. If device isolation
1625 * is enabled, every device gets its own domain. The most important thing
1626 * about domains is the page table mapping the DMA address space they
1627 * contain.
1628 *
1629 ****************************************************************************/
1630
1631static int pdom_id_alloc(void)
1632{
1633	return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC);
1634}
1635
1636static void pdom_id_free(int id)
1637{
1638	ida_free(&pdom_ids, id);
1639}
1640
1641static void free_gcr3_tbl_level1(u64 *tbl)
1642{
1643	u64 *ptr;
1644	int i;
1645
1646	for (i = 0; i < 512; ++i) {
1647		if (!(tbl[i] & GCR3_VALID))
1648			continue;
1649
1650		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1651
1652		iommu_free_page(ptr);
1653	}
1654}
1655
1656static void free_gcr3_tbl_level2(u64 *tbl)
1657{
1658	u64 *ptr;
1659	int i;
1660
1661	for (i = 0; i < 512; ++i) {
1662		if (!(tbl[i] & GCR3_VALID))
1663			continue;
1664
1665		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1666
1667		free_gcr3_tbl_level1(ptr);
1668	}
1669}
1670
1671static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info)
1672{
1673	if (gcr3_info->glx == 2)
1674		free_gcr3_tbl_level2(gcr3_info->gcr3_tbl);
1675	else if (gcr3_info->glx == 1)
1676		free_gcr3_tbl_level1(gcr3_info->gcr3_tbl);
1677	else
1678		WARN_ON_ONCE(gcr3_info->glx != 0);
1679
1680	gcr3_info->glx = 0;
1681
1682	/* Free per device domain ID */
1683	pdom_id_free(gcr3_info->domid);
1684
1685	iommu_free_page(gcr3_info->gcr3_tbl);
1686	gcr3_info->gcr3_tbl = NULL;
1687}
1688
1689/*
1690 * Number of GCR3 table levels required. Each level is a 4-Kbyte
1691 * page and can contain up to 512 entries.
1692 */
1693static int get_gcr3_levels(int pasids)
1694{
1695	int levels;
1696
1697	if (pasids == -1)
1698		return amd_iommu_max_glx_val;
1699
1700	levels = get_count_order(pasids);
1701
1702	return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels;
1703}
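    /*
     * For instance, working through the math above: 512 PASIDs need
     * get_count_order(512) = 9 bits, so DIV_ROUND_UP(9, 9) - 1 = 0 and a
     * single 512-entry table suffices, while 65536 PASIDs need 16 bits
     * and therefore one extra level (return value 1, a two-level walk).
     */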
1704
1705static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
1706			    struct amd_iommu *iommu, int pasids)
1707{
1708	int levels = get_gcr3_levels(pasids);
1709	int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
1710	int domid;
1711
1712	if (levels > amd_iommu_max_glx_val)
1713		return -EINVAL;
1714
1715	if (gcr3_info->gcr3_tbl)
1716		return -EBUSY;
1717
1718	/* Allocate per device domain ID */
1719	domid = pdom_id_alloc();
1720	if (domid <= 0)
1721		return -ENOSPC;
1722	gcr3_info->domid = domid;
1723
1724	gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC);
1725	if (gcr3_info->gcr3_tbl == NULL) {
1726		pdom_id_free(domid);
1727		return -ENOMEM;
1728	}
1729
1730	gcr3_info->glx = levels;
1731
1732	return 0;
1733}
1734
1735static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info,
1736			   ioasid_t pasid, bool alloc)
1737{
1738	int index;
1739	u64 *pte;
1740	u64 *root = gcr3_info->gcr3_tbl;
1741	int level = gcr3_info->glx;
1742
1743	while (true) {
1744
1745		index = (pasid >> (9 * level)) & 0x1ff;
1746		pte   = &root[index];
1747
1748		if (level == 0)
1749			break;
1750
1751		if (!(*pte & GCR3_VALID)) {
1752			if (!alloc)
1753				return NULL;
1754
1755			root = (void *)get_zeroed_page(GFP_ATOMIC);
1756			if (root == NULL)
1757				return NULL;
1758
1759			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
1760		}
1761
1762		root = iommu_phys_to_virt(*pte & PAGE_MASK);
1763
1764		level -= 1;
1765	}
1766
1767	return pte;
1768}
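    /*
     * Example of the index split above (purely illustrative): with a
     * two-level table (glx == 1), PASID 0x1234 walks level-1 index
     * (0x1234 >> 9) & 0x1ff = 0x9 and then level-0 index
     * 0x1234 & 0x1ff = 0x34, each table being one 4K page of 512 entries.
     */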
1769
1770static int update_gcr3(struct iommu_dev_data *dev_data,
1771		       ioasid_t pasid, unsigned long gcr3, bool set)
1772{
1773	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1774	u64 *pte;
1775
1776	pte = __get_gcr3_pte(gcr3_info, pasid, true);
1777	if (pte == NULL)
1778		return -ENOMEM;
1779
1780	if (set)
1781		*pte = (gcr3 & PAGE_MASK) | GCR3_VALID;
1782	else
1783		*pte = 0;
1784
1785	dev_flush_pasid_all(dev_data, pasid);
1786	return 0;
1787}
1788
1789int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid,
1790		       unsigned long gcr3)
1791{
1792	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1793	int ret;
1794
1795	iommu_group_mutex_assert(dev_data->dev);
1796
1797	ret = update_gcr3(dev_data, pasid, gcr3, true);
1798	if (ret)
1799		return ret;
1800
1801	gcr3_info->pasid_cnt++;
1802	return ret;
1803}
1804
1805int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
1806{
1807	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1808	int ret;
1809
1810	iommu_group_mutex_assert(dev_data->dev);
1811
1812	ret = update_gcr3(dev_data, pasid, 0, false);
1813	if (ret)
1814		return ret;
1815
1816	gcr3_info->pasid_cnt--;
1817	return ret;
1818}
1819
1820static void set_dte_entry(struct amd_iommu *iommu,
1821			  struct iommu_dev_data *dev_data)
1822{
1823	u64 pte_root = 0;
1824	u64 flags = 0;
1825	u32 old_domid;
1826	u16 devid = dev_data->devid;
1827	u16 domid;
1828	struct protection_domain *domain = dev_data->domain;
1829	struct dev_table_entry *dev_table = get_dev_table(iommu);
1830	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1831
1832	if (gcr3_info && gcr3_info->gcr3_tbl)
1833		domid = dev_data->gcr3_info.domid;
1834	else
1835		domid = domain->id;
1836
1837	if (domain->iop.mode != PAGE_MODE_NONE)
1838		pte_root = iommu_virt_to_phys(domain->iop.root);
1839
1840	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
1841		    << DEV_ENTRY_MODE_SHIFT;
1842
1843	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
1844
1845	/*
1846	 * When SNP is enabled, only set the TV bit when IOMMU
1847	 * page translation is in use.
1848	 */
1849	if (!amd_iommu_snp_en || (domid != 0))
1850		pte_root |= DTE_FLAG_TV;
1851
1852	flags = dev_table[devid].data[1];
1853
1854	if (dev_data->ats_enabled)
1855		flags |= DTE_FLAG_IOTLB;
1856
1857	if (dev_data->ppr)
1858		pte_root |= 1ULL << DEV_ENTRY_PPR;
1859
1860	if (domain->dirty_tracking)
1861		pte_root |= DTE_FLAG_HAD;
1862
1863	if (gcr3_info && gcr3_info->gcr3_tbl) {
1864		u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
1865		u64 glx  = gcr3_info->glx;
1866		u64 tmp;
1867
1868		pte_root |= DTE_FLAG_GV;
1869		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
1870
1871		/* First mask out possible old values for GCR3 table */
1872		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
1873		flags    &= ~tmp;
1874
1875		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
1876		flags    &= ~tmp;
1877
1878		/* Encode GCR3 table into DTE */
1879		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
1880		pte_root |= tmp;
1881
1882		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
1883		flags    |= tmp;
1884
1885		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
1886		flags    |= tmp;
1887
1888		if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
1889			dev_table[devid].data[2] |=
1890				((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
1891		}
1892
1893		/* GIOV is supported with V2 page table mode only */
1894		if (pdom_is_v2_pgtbl_mode(domain))
1895			pte_root |= DTE_FLAG_GIOV;
1896	}
1897
1898	flags &= ~DEV_DOMID_MASK;
1899	flags |= domid;
1900
1901	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
1902	dev_table[devid].data[1]  = flags;
1903	dev_table[devid].data[0]  = pte_root;
1904
1905	/*
1906	 * A kdump kernel might be replacing a domain ID that was copied from
1907	 * the previous kernel; if so, it needs to flush the translation cache
1908	 * entries for the old domain ID that is being overwritten.
1909	 */
1910	if (old_domid) {
1911		amd_iommu_flush_tlb_domid(iommu, old_domid);
1912	}
1913}
1914
1915static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
1916{
1917	struct dev_table_entry *dev_table = get_dev_table(iommu);
1918
1919	/* remove entry from the device table seen by the hardware */
1920	dev_table[devid].data[0]  = DTE_FLAG_V;
1921
1922	if (!amd_iommu_snp_en)
1923		dev_table[devid].data[0] |= DTE_FLAG_TV;
1924
1925	dev_table[devid].data[1] &= DTE_FLAG_MASK;
1926
1927	amd_iommu_apply_erratum_63(iommu, devid);
1928}
1929
1930/* Update and flush DTE for the given device */
1931static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
1932{
1933	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1934
1935	if (set)
1936		set_dte_entry(iommu, dev_data);
1937	else
1938		clear_dte_entry(iommu, dev_data->devid);
1939
1940	clone_aliases(iommu, dev_data->dev);
1941	device_flush_dte(dev_data);
1942	iommu_completion_wait(iommu);
1943}
1944
1945/*
1946 * If domain is SVA capable then initialize GCR3 table. Also if domain is
1947 * in v2 page table mode then update GCR3[0].
1948 */
1949static int init_gcr3_table(struct iommu_dev_data *dev_data,
1950			   struct protection_domain *pdom)
1951{
1952	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1953	int max_pasids = dev_data->max_pasids;
1954	int ret = 0;
1955
1956	/*
1957	 * If the domain is in pt mode, set up the GCR3 table only if the
1958	 * device is PASID capable.
1959	 */
1960	if (pdom_is_in_pt_mode(pdom) && !pdev_pasid_supported(dev_data))
1961		return ret;
1962
1963	/*
1964	 * By default, setup GCR3 table to support MAX PASIDs
1965	 * supported by the device/IOMMU.
1966	 */
1967	ret = setup_gcr3_table(&dev_data->gcr3_info, iommu,
1968			       max_pasids > 0 ?  max_pasids : 1);
1969	if (ret)
1970		return ret;
1971
1972	/* Set up GCR3[0] only if the domain uses v2 page table mode */
1973	if (!pdom_is_v2_pgtbl_mode(pdom))
1974		return ret;
1975
1976	ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
1977	if (ret)
1978		free_gcr3_table(&dev_data->gcr3_info);
1979
1980	return ret;
1981}
1982
1983static void destroy_gcr3_table(struct iommu_dev_data *dev_data,
1984			       struct protection_domain *pdom)
1985{
1986	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1987
1988	if (pdom_is_v2_pgtbl_mode(pdom))
1989		update_gcr3(dev_data, 0, 0, false);
1990
1991	if (gcr3_info->gcr3_tbl == NULL)
1992		return;
1993
1994	free_gcr3_table(gcr3_info);
1995}
1996
1997static int pdom_attach_iommu(struct amd_iommu *iommu,
1998			     struct protection_domain *pdom)
1999{
2000	struct pdom_iommu_info *pdom_iommu_info, *curr;
2001	struct io_pgtable_cfg *cfg = &pdom->iop.pgtbl.cfg;
2002	unsigned long flags;
2003	int ret = 0;
2004
2005	spin_lock_irqsave(&pdom->lock, flags);
2006
2007	pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
2008	if (pdom_iommu_info) {
2009		pdom_iommu_info->refcnt++;
2010		goto out_unlock;
2011	}
2012
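	/* pdom->lock is held with IRQs disabled, so this allocation must not sleep */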
2013	pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC);
2014	if (!pdom_iommu_info) {
2015		ret = -ENOMEM;
2016		goto out_unlock;
2017	}
2018
2019	pdom_iommu_info->iommu = iommu;
2020	pdom_iommu_info->refcnt = 1;
2021
2022	curr = xa_cmpxchg(&pdom->iommu_array, iommu->index,
2023			  NULL, pdom_iommu_info, GFP_ATOMIC);
2024	if (curr) {
2025		kfree(pdom_iommu_info);
2026		ret = -ENOSPC;
2027		goto out_unlock;
2028	}
2029
2030	/* Update NUMA Node ID */
2031	if (cfg->amd.nid == NUMA_NO_NODE)
2032		cfg->amd.nid = dev_to_node(&iommu->dev->dev);
2033
2034out_unlock:
2035	spin_unlock_irqrestore(&pdom->lock, flags);
2036	return ret;
2037}
2038
2039static void pdom_detach_iommu(struct amd_iommu *iommu,
2040			      struct protection_domain *pdom)
2041{
2042	struct pdom_iommu_info *pdom_iommu_info;
2043	unsigned long flags;
2044
2045	spin_lock_irqsave(&pdom->lock, flags);
2046
2047	pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index);
2048	if (!pdom_iommu_info) {
2049		spin_unlock_irqrestore(&pdom->lock, flags);
2050		return;
2051	}
2052
2053	pdom_iommu_info->refcnt--;
2054	if (pdom_iommu_info->refcnt == 0) {
2055		xa_erase(&pdom->iommu_array, iommu->index);
2056		kfree(pdom_iommu_info);
2057	}
2058
2059	spin_unlock_irqrestore(&pdom->lock, flags);
2060}
2061
2062/*
2063 * If a device is not yet associated with a domain, this function makes the
2064 * device visible in the domain
2065 */
2066static int attach_device(struct device *dev,
2067			 struct protection_domain *domain)
2068{
2069	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2070	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2071	struct pci_dev *pdev;
2072	unsigned long flags;
2073	int ret = 0;
2074
2075	mutex_lock(&dev_data->mutex);
2076
2077	if (dev_data->domain != NULL) {
2078		ret = -EBUSY;
2079		goto out;
2080	}
2081
2082	/* Do reference counting */
2083	ret = pdom_attach_iommu(iommu, domain);
2084	if (ret)
2085		goto out;
2086
2087	/* Setup GCR3 table */
2088	if (pdom_is_sva_capable(domain)) {
2089		ret = init_gcr3_table(dev_data, domain);
2090		if (ret) {
2091			pdom_detach_iommu(iommu, domain);
2092			goto out;
2093		}
2094	}
2095
2096	pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
2097	if (pdev && pdom_is_sva_capable(domain)) {
2098		pdev_enable_caps(pdev);
2099
2100		/*
2101		 * The device can continue to function even if IOPF
2102		 * enablement fails, so on the error path just disable
2103		 * the device's PRI support.
2104		 */
2105		if (amd_iommu_iopf_add_device(iommu, dev_data))
2106			pdev_disable_cap_pri(pdev);
2107	} else if (pdev) {
2108		pdev_enable_cap_ats(pdev);
2109	}
2110
2111	/* Update data structures */
2112	dev_data->domain = domain;
2113	spin_lock_irqsave(&domain->lock, flags);
2114	list_add(&dev_data->list, &domain->dev_list);
2115	spin_unlock_irqrestore(&domain->lock, flags);
2116
2117	/* Update device table */
2118	dev_update_dte(dev_data, true);
2119
2120out:
2121	mutex_unlock(&dev_data->mutex);
2122
2123	return ret;
2124}
2125
2126/*
2127 * Removes a device from its protection domain (takes dev_data->mutex internally)
2128 */
2129static void detach_device(struct device *dev)
2130{
2131	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2132	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2133	struct protection_domain *domain = dev_data->domain;
2134	unsigned long flags;
2135
2136	mutex_lock(&dev_data->mutex);
2137
2138	/*
2139	 * First check whether the device is still attached. It might
2140	 * already have been detached from its domain because the generic
2141	 * iommu_detach_group code detached it, and we try again here in
2142	 * our alias handling.
2143	 */
2144	if (WARN_ON(!dev_data->domain))
2145		goto out;
2146
2147	/* Remove IOPF handler */
2148	if (dev_data->ppr) {
2149		iopf_queue_flush_dev(dev);
2150		amd_iommu_iopf_remove_device(iommu, dev_data);
2151	}
2152
2153	if (dev_is_pci(dev))
2154		pdev_disable_caps(to_pci_dev(dev));
2155
2156	/* Clear DTE and flush the entry */
2157	dev_update_dte(dev_data, false);
2158
2159	/* Flush IOTLB and wait for the flushes to finish */
2160	spin_lock_irqsave(&domain->lock, flags);
2161	amd_iommu_domain_flush_all(domain);
2162	list_del(&dev_data->list);
2163	spin_unlock_irqrestore(&domain->lock, flags);
2164
2165	/* Clear GCR3 table */
2166	if (pdom_is_sva_capable(domain))
2167		destroy_gcr3_table(dev_data, domain);
2168
2169	/* Update data structures */
2170	dev_data->domain = NULL;
2171
2172	/* decrease reference counters - needs to happen after the flushes */
2173	pdom_detach_iommu(iommu, domain);
2174
2175out:
2176	mutex_unlock(&dev_data->mutex);
2177}
2178
2179static struct iommu_device *amd_iommu_probe_device(struct device *dev)
2180{
2181	struct iommu_device *iommu_dev;
2182	struct amd_iommu *iommu;
2183	struct iommu_dev_data *dev_data;
2184	int ret;
2185
2186	if (!check_device(dev))
2187		return ERR_PTR(-ENODEV);
2188
2189	iommu = rlookup_amd_iommu(dev);
2190	if (!iommu)
2191		return ERR_PTR(-ENODEV);
2192
2193	/* Not registered yet? */
2194	if (!iommu->iommu.ops)
2195		return ERR_PTR(-ENODEV);
2196
2197	if (dev_iommu_priv_get(dev))
2198		return &iommu->iommu;
2199
2200	ret = iommu_init_device(iommu, dev);
2201	if (ret) {
2202		dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
2203		iommu_dev = ERR_PTR(ret);
2204		iommu_ignore_device(iommu, dev);
2205		goto out_err;
2206	}
2207
2208	amd_iommu_set_pci_msi_domain(dev, iommu);
2209	iommu_dev = &iommu->iommu;
2210
2211	/*
2212	 * If both the IOMMU and the device support PASID, max_pasids holds
2213	 * the maximum number of supported PASIDs; otherwise it is zero.
2214	 */
2215	dev_data = dev_iommu_priv_get(dev);
2216	if (amd_iommu_pasid_supported() && dev_is_pci(dev) &&
2217	    pdev_pasid_supported(dev_data)) {
2218		dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids,
2219					     pci_max_pasids(to_pci_dev(dev)));
2220	}
2221
2222out_err:
2223	iommu_completion_wait(iommu);
2224
2225	if (dev_is_pci(dev))
2226		pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT);
2227
2228	return iommu_dev;
2229}
2230
2231static void amd_iommu_release_device(struct device *dev)
2232{
2233	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2234
2235	WARN_ON(dev_data->domain);
2236
2237	/*
2238	 * We keep dev_data around for unplugged devices and reuse it when the
2239	 * device is re-plugged - not doing so would introduce a ton of races.
2240	 */
2241}
2242
2243static struct iommu_group *amd_iommu_device_group(struct device *dev)
2244{
2245	if (dev_is_pci(dev))
2246		return pci_device_group(dev);
2247
2248	return acpihid_device_group(dev);
2249}
2250
2251/*****************************************************************************
2252 *
2253 * The following functions belong to the exported interface of AMD IOMMU
2254 *
2255 * This interface allows access to lower level functions of the IOMMU
2256 * like protection domain handling and assignment of devices to domains
2257 * which is not possible with the dma_ops interface.
2258 *
2259 *****************************************************************************/
2260
2261void protection_domain_free(struct protection_domain *domain)
2262{
2263	WARN_ON(!list_empty(&domain->dev_list));
2264	if (domain->domain.type & __IOMMU_DOMAIN_PAGING)
2265		free_io_pgtable_ops(&domain->iop.pgtbl.ops);
2266	pdom_id_free(domain->id);
2267	kfree(domain);
2268}
2269
2270static void protection_domain_init(struct protection_domain *domain, int nid)
2271{
2272	spin_lock_init(&domain->lock);
2273	INIT_LIST_HEAD(&domain->dev_list);
2274	INIT_LIST_HEAD(&domain->dev_data_list);
2275	xa_init(&domain->iommu_array);
2276	domain->iop.pgtbl.cfg.amd.nid = nid;
2277}
2278
2279struct protection_domain *protection_domain_alloc(int nid)
2280{
2281	struct protection_domain *domain;
2282	int domid;
2283
2284	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2285	if (!domain)
2286		return NULL;
2287
2288	domid = pdom_id_alloc();
2289	if (domid <= 0) {
2290		kfree(domain);
2291		return NULL;
2292	}
2293	domain->id = domid;
2294
2295	protection_domain_init(domain, nid);
2296
2297	return domain;
2298}
2299
2300static int pdom_setup_pgtable(struct protection_domain *domain)
2301{
2302	struct io_pgtable_ops *pgtbl_ops;
2303	enum io_pgtable_fmt fmt;
2304
2305	switch (domain->pd_mode) {
2306	case PD_MODE_V1:
2307		fmt = AMD_IOMMU_V1;
2308		break;
2309	case PD_MODE_V2:
2310		fmt = AMD_IOMMU_V2;
2311		break;
2312	}
2313
2314	pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain);
2315	if (!pgtbl_ops)
2316		return -ENOMEM;
2317
2318	return 0;
2319}
2320
2321static inline u64 dma_max_address(enum protection_domain_mode pgtable)
2322{
2323	if (pgtable == PD_MODE_V1)
2324		return ~0ULL;
2325
2326	/* V2 with 4/5 level page table */
2327	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
2328}
2329
2330static bool amd_iommu_hd_support(struct amd_iommu *iommu)
2331{
2332	return iommu && (iommu->features & FEATURE_HDSUP);
2333}
2334
2335static struct iommu_domain *
2336do_iommu_domain_alloc(struct device *dev, u32 flags,
2337		      enum protection_domain_mode pgtable)
2338{
2339	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
2340	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2341	struct protection_domain *domain;
2342	int ret;
2343
2344	domain = protection_domain_alloc(dev_to_node(dev));
2345	if (!domain)
2346		return ERR_PTR(-ENOMEM);
2347
2348	domain->pd_mode = pgtable;
2349	ret = pdom_setup_pgtable(domain);
2350	if (ret) {
2351		pdom_id_free(domain->id);
2352		kfree(domain);
2353		return ERR_PTR(ret);
2354	}
2355
2356	domain->domain.geometry.aperture_start = 0;
2357	domain->domain.geometry.aperture_end   = dma_max_address(pgtable);
2358	domain->domain.geometry.force_aperture = true;
2359	domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap;
2360
2361	domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
2362	domain->domain.ops = iommu->iommu.ops->default_domain_ops;
2363
2364	if (dirty_tracking)
2365		domain->domain.dirty_ops = &amd_dirty_ops;
2366
2367	return &domain->domain;
2368}
2369
2370static struct iommu_domain *
2371amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
2372				    const struct iommu_user_data *user_data)
2373
2374{
2375	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2376	const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
2377						IOMMU_HWPT_ALLOC_PASID;
2378
2379	if ((flags & ~supported_flags) || user_data)
2380		return ERR_PTR(-EOPNOTSUPP);
2381
2382	switch (flags & supported_flags) {
2383	case IOMMU_HWPT_ALLOC_DIRTY_TRACKING:
2384		/* Allocate domain with v1 page table for dirty tracking */
2385		if (!amd_iommu_hd_support(iommu))
2386			break;
2387		return do_iommu_domain_alloc(dev, flags, PD_MODE_V1);
2388	case IOMMU_HWPT_ALLOC_PASID:
2389		/* Allocate domain with v2 page table if IOMMU supports PASID. */
2390		if (!amd_iommu_pasid_supported())
2391			break;
2392		return do_iommu_domain_alloc(dev, flags, PD_MODE_V2);
2393	case 0:
2394		/* If nothing specific is required, use the kernel command-line default */
2395		return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable);
2396	default:
2397		break;
2398	}
2399	return ERR_PTR(-EOPNOTSUPP);
2400}
2401
2402void amd_iommu_domain_free(struct iommu_domain *dom)
2403{
2404	struct protection_domain *domain = to_pdomain(dom);
2405
2406	protection_domain_free(domain);
2407}
2408
2409static int blocked_domain_attach_device(struct iommu_domain *domain,
2410					struct device *dev)
2411{
2412	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2413
2414	if (dev_data->domain)
2415		detach_device(dev);
2416
2417	/* Clear DTE and flush the entry */
2418	mutex_lock(&dev_data->mutex);
2419	dev_update_dte(dev_data, false);
2420	mutex_unlock(&dev_data->mutex);
2421
2422	return 0;
2423}
2424
2425static struct iommu_domain blocked_domain = {
2426	.type = IOMMU_DOMAIN_BLOCKED,
2427	.ops = &(const struct iommu_domain_ops) {
2428		.attach_dev     = blocked_domain_attach_device,
2429	}
2430};
2431
2432static struct protection_domain identity_domain;
2433
2434static const struct iommu_domain_ops identity_domain_ops = {
2435	.attach_dev = amd_iommu_attach_device,
2436};
2437
2438void amd_iommu_init_identity_domain(void)
2439{
2440	struct iommu_domain *domain = &identity_domain.domain;
2441
2442	domain->type = IOMMU_DOMAIN_IDENTITY;
2443	domain->ops = &identity_domain_ops;
2444	domain->owner = &amd_iommu_ops;
2445
2446	identity_domain.id = pdom_id_alloc();
2447
2448	protection_domain_init(&identity_domain, NUMA_NO_NODE);
2449}
2450
2451/* Same as blocked domain except it supports only ops->attach_dev() */
2452static struct iommu_domain release_domain = {
2453	.type = IOMMU_DOMAIN_BLOCKED,
2454	.ops = &(const struct iommu_domain_ops) {
2455		.attach_dev     = blocked_domain_attach_device,
2456	}
2457};
2458
2459static int amd_iommu_attach_device(struct iommu_domain *dom,
2460				   struct device *dev)
2461{
2462	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2463	struct protection_domain *domain = to_pdomain(dom);
2464	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2465	int ret;
2466
2467	/*
2468	 * Skip attaching the device to the domain if the new domain is the
2469	 * same as the device's current domain.
2470	 */
2471	if (dev_data->domain == domain)
2472		return 0;
2473
2474	dev_data->defer_attach = false;
2475
2476	/*
2477	 * Restrict to devices with compatible IOMMU hardware support
2478	 * when enforcement of dirty tracking is enabled.
2479	 */
2480	if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
2481		return -EINVAL;
2482
2483	if (dev_data->domain)
2484		detach_device(dev);
2485
2486	ret = attach_device(dev, domain);
2487
2488#ifdef CONFIG_IRQ_REMAP
2489	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
2490		if (dom->type == IOMMU_DOMAIN_UNMANAGED)
2491			dev_data->use_vapic = 1;
2492		else
2493			dev_data->use_vapic = 0;
2494	}
2495#endif
2496
2497	return ret;
2498}
2499
2500static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
2501				    unsigned long iova, size_t size)
2502{
2503	struct protection_domain *domain = to_pdomain(dom);
2504	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
2505
2506	if (ops->map_pages)
2507		domain_flush_np_cache(domain, iova, size);
2508	return 0;
2509}
2510
2511static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
2512			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
2513			       int iommu_prot, gfp_t gfp, size_t *mapped)
2514{
2515	struct protection_domain *domain = to_pdomain(dom);
2516	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
2517	int prot = 0;
2518	int ret = -EINVAL;
2519
2520	if ((domain->pd_mode == PD_MODE_V1) &&
2521	    (domain->iop.mode == PAGE_MODE_NONE))
2522		return -EINVAL;
2523
2524	if (iommu_prot & IOMMU_READ)
2525		prot |= IOMMU_PROT_IR;
2526	if (iommu_prot & IOMMU_WRITE)
2527		prot |= IOMMU_PROT_IW;
2528
2529	if (ops->map_pages) {
2530		ret = ops->map_pages(ops, iova, paddr, pgsize,
2531				     pgcount, prot, gfp, mapped);
2532	}
2533
2534	return ret;
2535}
2536
2537static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
2538					    struct iommu_iotlb_gather *gather,
2539					    unsigned long iova, size_t size)
2540{
2541	/*
2542	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
2543	 * Unless we run in a virtual machine, which can be inferred according
2544	 * to whether "non-present cache" is on, it is probably best to prefer
2545	 * (potentially) too extensive TLB flushing (i.e., more misses) over
2546	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
2547	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
2548	 * the guest, and the trade-off is different: unnecessary TLB flushes
2549	 * should be avoided.
2550	 */
2551	if (amd_iommu_np_cache &&
2552	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
2553		iommu_iotlb_sync(domain, gather);
2554
2555	iommu_iotlb_gather_add_range(gather, iova, size);
2556}
2557
2558static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
2559				    size_t pgsize, size_t pgcount,
2560				    struct iommu_iotlb_gather *gather)
2561{
2562	struct protection_domain *domain = to_pdomain(dom);
2563	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
2564	size_t r;
2565
2566	if ((domain->pd_mode == PD_MODE_V1) &&
2567	    (domain->iop.mode == PAGE_MODE_NONE))
2568		return 0;
2569
2570	r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
2571
2572	if (r)
2573		amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
2574
2575	return r;
2576}
2577
2578static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2579					  dma_addr_t iova)
2580{
2581	struct protection_domain *domain = to_pdomain(dom);
2582	struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops;
2583
2584	return ops->iova_to_phys(ops, iova);
2585}
2586
2587static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
2588{
2589	switch (cap) {
2590	case IOMMU_CAP_CACHE_COHERENCY:
2591		return true;
2592	case IOMMU_CAP_NOEXEC:
2593		return false;
2594	case IOMMU_CAP_PRE_BOOT_PROTECTION:
2595		return amdr_ivrs_remap_support;
2596	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
2597		return true;
2598	case IOMMU_CAP_DEFERRED_FLUSH:
2599		return true;
2600	case IOMMU_CAP_DIRTY_TRACKING: {
2601		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2602
2603		return amd_iommu_hd_support(iommu);
2604	}
2605	default:
2606		break;
2607	}
2608
2609	return false;
2610}
2611
2612static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
2613					bool enable)
2614{
2615	struct protection_domain *pdomain = to_pdomain(domain);
2616	struct dev_table_entry *dev_table;
2617	struct iommu_dev_data *dev_data;
2618	bool domain_flush = false;
2619	struct amd_iommu *iommu;
2620	unsigned long flags;
2621	u64 pte_root;
2622
2623	spin_lock_irqsave(&pdomain->lock, flags);
2624	if (!(pdomain->dirty_tracking ^ enable)) {
2625		spin_unlock_irqrestore(&pdomain->lock, flags);
2626		return 0;
2627	}
2628
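	/* Toggle the Host Access/Dirty (HAD) bits in each attached device's DTE */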
2629	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
2630		iommu = get_amd_iommu_from_dev_data(dev_data);
2631
2632		dev_table = get_dev_table(iommu);
2633		pte_root = dev_table[dev_data->devid].data[0];
2634
2635		pte_root = (enable ? pte_root | DTE_FLAG_HAD :
2636				     pte_root & ~DTE_FLAG_HAD);
2637
2638		/* Flush device DTE */
2639		dev_table[dev_data->devid].data[0] = pte_root;
2640		device_flush_dte(dev_data);
2641		domain_flush = true;
2642	}
2643
2644	/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
2645	if (domain_flush)
2646		amd_iommu_domain_flush_all(pdomain);
2647
2648	pdomain->dirty_tracking = enable;
2649	spin_unlock_irqrestore(&pdomain->lock, flags);
2650
2651	return 0;
2652}
2653
2654static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
2655					  unsigned long iova, size_t size,
2656					  unsigned long flags,
2657					  struct iommu_dirty_bitmap *dirty)
2658{
2659	struct protection_domain *pdomain = to_pdomain(domain);
2660	struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops;
2661	unsigned long lflags;
2662
2663	if (!ops || !ops->read_and_clear_dirty)
2664		return -EOPNOTSUPP;
2665
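	/* Reporting into a bitmap is only valid while dirty tracking is enabled */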
2666	spin_lock_irqsave(&pdomain->lock, lflags);
2667	if (!pdomain->dirty_tracking && dirty->bitmap) {
2668		spin_unlock_irqrestore(&pdomain->lock, lflags);
2669		return -EINVAL;
2670	}
2671	spin_unlock_irqrestore(&pdomain->lock, lflags);
2672
2673	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
2674}
2675
2676static void amd_iommu_get_resv_regions(struct device *dev,
2677				       struct list_head *head)
2678{
2679	struct iommu_resv_region *region;
2680	struct unity_map_entry *entry;
2681	struct amd_iommu *iommu;
2682	struct amd_iommu_pci_seg *pci_seg;
2683	int devid, sbdf;
2684
2685	sbdf = get_device_sbdf_id(dev);
2686	if (sbdf < 0)
2687		return;
2688
2689	devid = PCI_SBDF_TO_DEVID(sbdf);
2690	iommu = get_amd_iommu_from_dev(dev);
2691	pci_seg = iommu->pci_seg;
2692
2693	list_for_each_entry(entry, &pci_seg->unity_map, list) {
2694		int type, prot = 0;
2695		size_t length;
2696
2697		if (devid < entry->devid_start || devid > entry->devid_end)
2698			continue;
2699
2700		type   = IOMMU_RESV_DIRECT;
2701		length = entry->address_end - entry->address_start;
2702		if (entry->prot & IOMMU_PROT_IR)
2703			prot |= IOMMU_READ;
2704		if (entry->prot & IOMMU_PROT_IW)
2705			prot |= IOMMU_WRITE;
2706		if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
2707			/* Exclusion range */
2708			type = IOMMU_RESV_RESERVED;
2709
2710		region = iommu_alloc_resv_region(entry->address_start,
2711						 length, prot, type,
2712						 GFP_KERNEL);
2713		if (!region) {
2714			dev_err(dev, "Out of memory allocating dm-regions\n");
2715			return;
2716		}
2717		list_add_tail(&region->list, head);
2718	}
2719
2720	region = iommu_alloc_resv_region(MSI_RANGE_START,
2721					 MSI_RANGE_END - MSI_RANGE_START + 1,
2722					 0, IOMMU_RESV_MSI, GFP_KERNEL);
2723	if (!region)
2724		return;
2725	list_add_tail(&region->list, head);
2726
2727	region = iommu_alloc_resv_region(HT_RANGE_START,
2728					 HT_RANGE_END - HT_RANGE_START + 1,
2729					 0, IOMMU_RESV_RESERVED, GFP_KERNEL);
2730	if (!region)
2731		return;
2732	list_add_tail(&region->list, head);
2733}
2734
2735static bool amd_iommu_is_attach_deferred(struct device *dev)
2736{
2737	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2738
2739	return dev_data->defer_attach;
2740}
2741
2742static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
2743{
2744	struct protection_domain *dom = to_pdomain(domain);
2745	unsigned long flags;
2746
2747	spin_lock_irqsave(&dom->lock, flags);
2748	amd_iommu_domain_flush_all(dom);
2749	spin_unlock_irqrestore(&dom->lock, flags);
2750}
2751
2752static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
2753				 struct iommu_iotlb_gather *gather)
2754{
2755	struct protection_domain *dom = to_pdomain(domain);
2756	unsigned long flags;
2757
2758	spin_lock_irqsave(&dom->lock, flags);
2759	amd_iommu_domain_flush_pages(dom, gather->start,
2760				     gather->end - gather->start + 1);
2761	spin_unlock_irqrestore(&dom->lock, flags);
2762}
2763
2764static int amd_iommu_def_domain_type(struct device *dev)
2765{
2766	struct iommu_dev_data *dev_data;
2767
2768	dev_data = dev_iommu_priv_get(dev);
2769	if (!dev_data)
2770		return 0;
2771
2772	/* Always use DMA domain for untrusted device */
2773	if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted)
2774		return IOMMU_DOMAIN_DMA;
2775
2776	/*
2777	 * Do not identity map IOMMUv2 capable devices when:
2778	 *  - memory encryption is active, because some of those devices
2779	 *    (AMD GPUs) don't have the encryption bit in their DMA-mask
2780	 *    and require remapping.
2781	 *  - SNP is enabled, because it prohibits DTE[Mode]=0.
2782	 */
2783	if (pdev_pasid_supported(dev_data) &&
2784	    !cc_platform_has(CC_ATTR_MEM_ENCRYPT) &&
2785	    !amd_iommu_snp_en) {
2786		return IOMMU_DOMAIN_IDENTITY;
2787	}
2788
2789	return 0;
2790}
2791
2792static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
2793{
2794	/* IOMMU_PTE_FC is always set */
2795	return true;
2796}
2797
2798static const struct iommu_dirty_ops amd_dirty_ops = {
2799	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
2800	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
2801};
2802
2803static int amd_iommu_dev_enable_feature(struct device *dev,
2804					enum iommu_dev_features feat)
2805{
2806	int ret = 0;
2807
2808	switch (feat) {
2809	case IOMMU_DEV_FEAT_IOPF:
2810	case IOMMU_DEV_FEAT_SVA:
2811		break;
2812	default:
2813		ret = -EINVAL;
2814		break;
2815	}
2816	return ret;
2817}
2818
2819static int amd_iommu_dev_disable_feature(struct device *dev,
2820					 enum iommu_dev_features feat)
2821{
2822	int ret = 0;
2823
2824	switch (feat) {
2825	case IOMMU_DEV_FEAT_IOPF:
2826	case IOMMU_DEV_FEAT_SVA:
2827		break;
2828	default:
2829		ret = -EINVAL;
2830		break;
2831	}
2832	return ret;
2833}
2834
2835const struct iommu_ops amd_iommu_ops = {
2836	.capable = amd_iommu_capable,
2837	.blocked_domain = &blocked_domain,
2838	.release_domain = &release_domain,
2839	.identity_domain = &identity_domain.domain,
2840	.domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags,
2841	.domain_alloc_sva = amd_iommu_domain_alloc_sva,
2842	.probe_device = amd_iommu_probe_device,
2843	.release_device = amd_iommu_release_device,
2844	.device_group = amd_iommu_device_group,
2845	.get_resv_regions = amd_iommu_get_resv_regions,
2846	.is_attach_deferred = amd_iommu_is_attach_deferred,
2847	.def_domain_type = amd_iommu_def_domain_type,
2848	.dev_enable_feat = amd_iommu_dev_enable_feature,
2849	.dev_disable_feat = amd_iommu_dev_disable_feature,
2850	.remove_dev_pasid = amd_iommu_remove_dev_pasid,
2851	.page_response = amd_iommu_page_response,
2852	.default_domain_ops = &(const struct iommu_domain_ops) {
2853		.attach_dev	= amd_iommu_attach_device,
2854		.map_pages	= amd_iommu_map_pages,
2855		.unmap_pages	= amd_iommu_unmap_pages,
2856		.iotlb_sync_map	= amd_iommu_iotlb_sync_map,
2857		.iova_to_phys	= amd_iommu_iova_to_phys,
2858		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
2859		.iotlb_sync	= amd_iommu_iotlb_sync,
2860		.free		= amd_iommu_domain_free,
2861		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
2862	}
2863};
2864
2865#ifdef CONFIG_IRQ_REMAP
2866
2867/*****************************************************************************
2868 *
2869 * Interrupt Remapping Implementation
2870 *
2871 *****************************************************************************/
2872
2873static struct irq_chip amd_ir_chip;
2874static DEFINE_SPINLOCK(iommu_table_lock);
2875
2876static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
2877{
2878	int ret;
2879	u64 data;
2880	unsigned long flags;
2881	struct iommu_cmd cmd, cmd2;
2882
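	/* Nothing to invalidate when IRTE caching is disabled in hardware */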
2883	if (iommu->irtcachedis_enabled)
2884		return;
2885
2886	build_inv_irt(&cmd, devid);
2887	data = atomic64_inc_return(&iommu->cmd_sem_val);
2888	build_completion_wait(&cmd2, iommu, data);
2889
2890	raw_spin_lock_irqsave(&iommu->lock, flags);
2891	ret = __iommu_queue_command_sync(iommu, &cmd, true);
2892	if (ret)
2893		goto out;
2894	ret = __iommu_queue_command_sync(iommu, &cmd2, false);
2895	if (ret)
2896		goto out;
2897	wait_on_sem(iommu, data);
2898out:
2899	raw_spin_unlock_irqrestore(&iommu->lock, flags);
2900}
2901
2902static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
2903			      struct irq_remap_table *table)
2904{
2905	u64 dte;
2906	struct dev_table_entry *dev_table = get_dev_table(iommu);
2907
2908	dte	= dev_table[devid].data[2];
2909	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
2910	dte	|= iommu_virt_to_phys(table->table);
2911	dte	|= DTE_IRQ_REMAP_INTCTL;
2912	dte	|= DTE_INTTABLEN;
2913	dte	|= DTE_IRQ_REMAP_ENABLE;
2914
2915	dev_table[devid].data[2] = dte;
2916}
2917
2918static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
2919{
2920	struct irq_remap_table *table;
2921	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2922
2923	if (WARN_ONCE(!pci_seg->rlookup_table[devid],
2924		      "%s: no iommu for devid %x:%x\n",
2925		      __func__, pci_seg->id, devid))
2926		return NULL;
2927
2928	table = pci_seg->irq_lookup_table[devid];
2929	if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
2930		      __func__, pci_seg->id, devid))
2931		return NULL;
2932
2933	return table;
2934}
2935
2936static struct irq_remap_table *__alloc_irq_table(void)
2937{
2938	struct irq_remap_table *table;
2939
2940	table = kzalloc(sizeof(*table), GFP_KERNEL);
2941	if (!table)
2942		return NULL;
2943
2944	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
2945	if (!table->table) {
2946		kfree(table);
2947		return NULL;
2948	}
2949	raw_spin_lock_init(&table->lock);
2950
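	/* Legacy IRTEs are 32 bits each; GA (guest virtual APIC) IRTEs are 128 bits each */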
2951	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
2952		memset(table->table, 0,
2953		       MAX_IRQS_PER_TABLE * sizeof(u32));
2954	else
2955		memset(table->table, 0,
2956		       (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
2957	return table;
2958}
2959
2960static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
2961				  struct irq_remap_table *table)
2962{
2963	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2964
2965	pci_seg->irq_lookup_table[devid] = table;
2966	set_dte_irq_entry(iommu, devid, table);
2967	iommu_flush_dte(iommu, devid);
2968}
2969
2970static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
2971				       void *data)
2972{
2973	struct irq_remap_table *table = data;
2974	struct amd_iommu_pci_seg *pci_seg;
2975	struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev);
2976
2977	if (!iommu)
2978		return -EINVAL;
2979
2980	pci_seg = iommu->pci_seg;
2981	pci_seg->irq_lookup_table[alias] = table;
2982	set_dte_irq_entry(iommu, alias, table);
2983	iommu_flush_dte(pci_seg->rlookup_table[alias], alias);
2984
2985	return 0;
2986}
2987
2988static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
2989					       u16 devid, struct pci_dev *pdev)
2990{
2991	struct irq_remap_table *table = NULL;
2992	struct irq_remap_table *new_table = NULL;
2993	struct amd_iommu_pci_seg *pci_seg;
2994	unsigned long flags;
2995	u16 alias;
2996
2997	spin_lock_irqsave(&iommu_table_lock, flags);
2998
2999	pci_seg = iommu->pci_seg;
3000	table = pci_seg->irq_lookup_table[devid];
3001	if (table)
3002		goto out_unlock;
3003
3004	alias = pci_seg->alias_table[devid];
3005	table = pci_seg->irq_lookup_table[alias];
3006	if (table) {
3007		set_remap_table_entry(iommu, devid, table);
3008		goto out_wait;
3009	}
3010	spin_unlock_irqrestore(&iommu_table_lock, flags);
3011
3012	/* Nothing there yet, allocate new irq remapping table */
3013	new_table = __alloc_irq_table();
3014	if (!new_table)
3015		return NULL;
3016
3017	spin_lock_irqsave(&iommu_table_lock, flags);
3018
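	/* Re-check under the lock; another CPU may have installed a table meanwhile */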
3019	table = pci_seg->irq_lookup_table[devid];
3020	if (table)
3021		goto out_unlock;
3022
3023	table = pci_seg->irq_lookup_table[alias];
3024	if (table) {
3025		set_remap_table_entry(iommu, devid, table);
3026		goto out_wait;
3027	}
3028
3029	table = new_table;
3030	new_table = NULL;
3031
3032	if (pdev)
3033		pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
3034				       table);
3035	else
3036		set_remap_table_entry(iommu, devid, table);
3037
3038	if (devid != alias)
3039		set_remap_table_entry(iommu, alias, table);
3040
3041out_wait:
3042	iommu_completion_wait(iommu);
3043
3044out_unlock:
3045	spin_unlock_irqrestore(&iommu_table_lock, flags);
3046
3047	if (new_table) {
3048		kmem_cache_free(amd_iommu_irq_cache, new_table->table);
3049		kfree(new_table);
3050	}
3051	return table;
3052}
3053
3054static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
3055			   bool align, struct pci_dev *pdev)
3056{
3057	struct irq_remap_table *table;
3058	int index, c, alignment = 1;
3059	unsigned long flags;
3060
3061	table = alloc_irq_table(iommu, devid, pdev);
3062	if (!table)
3063		return -ENODEV;
3064
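	/*
	 * Multi-MSI devices modify the low bits of the MSI data (here the
	 * IRTE index), so the block of IRTEs must be power-of-two aligned.
	 */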
3065	if (align)
3066		alignment = roundup_pow_of_two(count);
3067
3068	raw_spin_lock_irqsave(&table->lock, flags);
3069
3070	/* Scan table for free entries */
3071	for (index = ALIGN(table->min_index, alignment), c = 0;
3072	     index < MAX_IRQS_PER_TABLE;) {
3073		if (!iommu->irte_ops->is_allocated(table, index)) {
3074			c += 1;
3075		} else {
3076			c     = 0;
3077			index = ALIGN(index + 1, alignment);
3078			continue;
3079		}
3080
3081		if (c == count)	{
3082			for (; c != 0; --c)
3083				iommu->irte_ops->set_allocated(table, index - c + 1);
3084
3085			index -= count - 1;
3086			goto out;
3087		}
3088
3089		index++;
3090	}
3091
3092	index = -ENOSPC;
3093
3094out:
3095	raw_spin_unlock_irqrestore(&table->lock, flags);
3096
3097	return index;
3098}
3099
3100static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3101			    struct irte_ga *irte)
3102{
3103	struct irq_remap_table *table;
3104	struct irte_ga *entry;
3105	unsigned long flags;
3106	u128 old;
3107
3108	table = get_irq_table(iommu, devid);
3109	if (!table)
3110		return -ENOMEM;
3111
3112	raw_spin_lock_irqsave(&table->lock, flags);
3113
3114	entry = (struct irte_ga *)table->table;
3115	entry = &entry[index];
3116
3117	/*
3118	 * We use try_cmpxchg128() to atomically update the 128-bit IRTE.
3119	 * The entry cannot be updated by the hardware or by other
3120	 * processors behind our back, so the compare-exchange is expected
3121	 * to succeed, i.e. the old value read above still matches.
3122	 */
3123	old = entry->irte;
3124	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
3125
3126	raw_spin_unlock_irqrestore(&table->lock, flags);
3127
3128	return 0;
3129}
3130
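/* Like __modify_irte_ga(), but also flushes the IRT for the device and waits for completion */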
3131static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3132			  struct irte_ga *irte)
3133{
3134	int ret;
3135
3136	ret = __modify_irte_ga(iommu, devid, index, irte);
3137	if (ret)
3138		return ret;
3139
3140	iommu_flush_irt_and_complete(iommu, devid);
3141
3142	return 0;
3143}
3144
3145static int modify_irte(struct amd_iommu *iommu,
3146		       u16 devid, int index, union irte *irte)
3147{
3148	struct irq_remap_table *table;
3149	unsigned long flags;
3150
3151	table = get_irq_table(iommu, devid);
3152	if (!table)
3153		return -ENOMEM;
3154
3155	raw_spin_lock_irqsave(&table->lock, flags);
3156	table->table[index] = irte->val;
3157	raw_spin_unlock_irqrestore(&table->lock, flags);
3158
3159	iommu_flush_irt_and_complete(iommu, devid);
3160
3161	return 0;
3162}
3163
3164static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
3165{
3166	struct irq_remap_table *table;
3167	unsigned long flags;
3168
3169	table = get_irq_table(iommu, devid);
3170	if (!table)
3171		return;
3172
3173	raw_spin_lock_irqsave(&table->lock, flags);
3174	iommu->irte_ops->clear_allocated(table, index);
3175	raw_spin_unlock_irqrestore(&table->lock, flags);
3176
3177	iommu_flush_irt_and_complete(iommu, devid);
3178}
3179
3180static void irte_prepare(void *entry,
3181			 u32 delivery_mode, bool dest_mode,
3182			 u8 vector, u32 dest_apicid, int devid)
3183{
3184	union irte *irte = (union irte *) entry;
3185
3186	irte->val                = 0;
3187	irte->fields.vector      = vector;
3188	irte->fields.int_type    = delivery_mode;
3189	irte->fields.destination = dest_apicid;
3190	irte->fields.dm          = dest_mode;
3191	irte->fields.valid       = 1;
3192}
3193
3194static void irte_ga_prepare(void *entry,
3195			    u32 delivery_mode, bool dest_mode,
3196			    u8 vector, u32 dest_apicid, int devid)
3197{
3198	struct irte_ga *irte = (struct irte_ga *) entry;
3199
3200	irte->lo.val                      = 0;
3201	irte->hi.val                      = 0;
3202	irte->lo.fields_remap.int_type    = delivery_mode;
3203	irte->lo.fields_remap.dm          = dest_mode;
3204	irte->hi.fields.vector            = vector;
3205	irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
3206	irte->hi.fields.destination       = APICID_TO_IRTE_DEST_HI(dest_apicid);
3207	irte->lo.fields_remap.valid       = 1;
3208}
3209
3210static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3211{
3212	union irte *irte = (union irte *) entry;
3213
3214	irte->fields.valid = 1;
3215	modify_irte(iommu, devid, index, irte);
3216}
3217
3218static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3219{
3220	struct irte_ga *irte = (struct irte_ga *) entry;
3221
3222	irte->lo.fields_remap.valid = 1;
3223	modify_irte_ga(iommu, devid, index, irte);
3224}
3225
3226static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3227{
3228	union irte *irte = (union irte *) entry;
3229
3230	irte->fields.valid = 0;
3231	modify_irte(iommu, devid, index, irte);
3232}
3233
3234static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3235{
3236	struct irte_ga *irte = (struct irte_ga *) entry;
3237
3238	irte->lo.fields_remap.valid = 0;
3239	modify_irte_ga(iommu, devid, index, irte);
3240}
3241
3242static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3243			      u8 vector, u32 dest_apicid)
3244{
3245	union irte *irte = (union irte *) entry;
3246
3247	irte->fields.vector = vector;
3248	irte->fields.destination = dest_apicid;
3249	modify_irte(iommu, devid, index, irte);
3250}
3251
3252static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3253				 u8 vector, u32 dest_apicid)
3254{
3255	struct irte_ga *irte = (struct irte_ga *) entry;
3256
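	/*
	 * In guest (vAPIC) mode the IRTE fields are repurposed and the
	 * destination is managed via amd_iommu_update_ga(), so only update
	 * the entry while it is in remapped mode.
	 */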
3257	if (!irte->lo.fields_remap.guest_mode) {
3258		irte->hi.fields.vector = vector;
3259		irte->lo.fields_remap.destination =
3260					APICID_TO_IRTE_DEST_LO(dest_apicid);
3261		irte->hi.fields.destination =
3262					APICID_TO_IRTE_DEST_HI(dest_apicid);
3263		modify_irte_ga(iommu, devid, index, irte);
3264	}
3265}
3266
3267#define IRTE_ALLOCATED (~1U)
3268static void irte_set_allocated(struct irq_remap_table *table, int index)
3269{
3270	table->table[index] = IRTE_ALLOCATED;
3271}
3272
3273static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
3274{
3275	struct irte_ga *ptr = (struct irte_ga *)table->table;
3276	struct irte_ga *irte = &ptr[index];
3277
3278	memset(&irte->lo.val, 0, sizeof(u64));
3279	memset(&irte->hi.val, 0, sizeof(u64));
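	/* A non-zero vector marks the entry as allocated (see irte_ga_is_allocated()) */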
3280	irte->hi.fields.vector = 0xff;
3281}
3282
3283static bool irte_is_allocated(struct irq_remap_table *table, int index)
3284{
3285	union irte *ptr = (union irte *)table->table;
3286	union irte *irte = &ptr[index];
3287
3288	return irte->val != 0;
3289}
3290
3291static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
3292{
3293	struct irte_ga *ptr = (struct irte_ga *)table->table;
3294	struct irte_ga *irte = &ptr[index];
3295
3296	return irte->hi.fields.vector != 0;
3297}
3298
3299static void irte_clear_allocated(struct irq_remap_table *table, int index)
3300{
3301	table->table[index] = 0;
3302}
3303
3304static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
3305{
3306	struct irte_ga *ptr = (struct irte_ga *)table->table;
3307	struct irte_ga *irte = &ptr[index];
3308
3309	memset(&irte->lo.val, 0, sizeof(u64));
3310	memset(&irte->hi.val, 0, sizeof(u64));
3311}
3312
3313static int get_devid(struct irq_alloc_info *info)
3314{
3315	switch (info->type) {
3316	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3317		return get_ioapic_devid(info->devid);
3318	case X86_IRQ_ALLOC_TYPE_HPET:
3319		return get_hpet_devid(info->devid);
3320	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3321	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3322		return get_device_sbdf_id(msi_desc_to_dev(info->desc));
3323	default:
3324		WARN_ON_ONCE(1);
3325		return -1;
3326	}
3327}
3328
3329struct irq_remap_ops amd_iommu_irq_ops = {
3330	.prepare		= amd_iommu_prepare,
3331	.enable			= amd_iommu_enable,
3332	.disable		= amd_iommu_disable,
3333	.reenable		= amd_iommu_reenable,
3334	.enable_faulting	= amd_iommu_enable_faulting,
3335};
3336
3337static void fill_msi_msg(struct msi_msg *msg, u32 index)
3338{
3339	msg->data = index;
3340	msg->address_lo = 0;
3341	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
3342	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
3343}
3344
3345static void irq_remapping_prepare_irte(struct amd_ir_data *data,
3346				       struct irq_cfg *irq_cfg,
3347				       struct irq_alloc_info *info,
3348				       int devid, int index, int sub_handle)
3349{
3350	struct irq_2_irte *irte_info = &data->irq_2_irte;
3351	struct amd_iommu *iommu = data->iommu;
3352
3353	if (!iommu)
3354		return;
3355
3356	data->irq_2_irte.devid = devid;
3357	data->irq_2_irte.index = index + sub_handle;
3358	iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED,
3359				 apic->dest_mode_logical, irq_cfg->vector,
3360				 irq_cfg->dest_apicid, devid);
3361
3362	switch (info->type) {
3363	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3364	case X86_IRQ_ALLOC_TYPE_HPET:
3365	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3366	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3367		fill_msi_msg(&data->msi_entry, irte_info->index);
3368		break;
3369
3370	default:
3371		BUG_ON(1);
3372		break;
3373	}
3374}
3375
3376struct amd_irte_ops irte_32_ops = {
3377	.prepare = irte_prepare,
3378	.activate = irte_activate,
3379	.deactivate = irte_deactivate,
3380	.set_affinity = irte_set_affinity,
3381	.set_allocated = irte_set_allocated,
3382	.is_allocated = irte_is_allocated,
3383	.clear_allocated = irte_clear_allocated,
3384};
3385
3386struct amd_irte_ops irte_128_ops = {
3387	.prepare = irte_ga_prepare,
3388	.activate = irte_ga_activate,
3389	.deactivate = irte_ga_deactivate,
3390	.set_affinity = irte_ga_set_affinity,
3391	.set_allocated = irte_ga_set_allocated,
3392	.is_allocated = irte_ga_is_allocated,
3393	.clear_allocated = irte_ga_clear_allocated,
3394};
3395
3396static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
3397			       unsigned int nr_irqs, void *arg)
3398{
3399	struct irq_alloc_info *info = arg;
3400	struct irq_data *irq_data;
3401	struct amd_ir_data *data = NULL;
3402	struct amd_iommu *iommu;
3403	struct irq_cfg *cfg;
3404	int i, ret, devid, seg, sbdf;
3405	int index;
3406
3407	if (!info)
3408		return -EINVAL;
3409	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI)
3410		return -EINVAL;
3411
3412	sbdf = get_devid(info);
3413	if (sbdf < 0)
3414		return -EINVAL;
3415
3416	seg = PCI_SBDF_TO_SEGID(sbdf);
3417	devid = PCI_SBDF_TO_DEVID(sbdf);
3418	iommu = __rlookup_amd_iommu(seg, devid);
3419	if (!iommu)
3420		return -EINVAL;
3421
3422	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
3423	if (ret < 0)
3424		return ret;
3425
3426	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
3427		struct irq_remap_table *table;
3428
3429		table = alloc_irq_table(iommu, devid, NULL);
3430		if (table) {
3431			if (!table->min_index) {
3432				/*
3433				 * Keep the first 32 indexes free for IOAPIC
3434				 * interrupts.
3435				 */
3436				table->min_index = 32;
3437				for (i = 0; i < 32; ++i)
3438					iommu->irte_ops->set_allocated(table, i);
3439			}
3440			WARN_ON(table->min_index != 32);
3441			index = info->ioapic.pin;
3442		} else {
3443			index = -ENOMEM;
3444		}
3445	} else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
3446		   info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
3447		bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
3448
3449		index = alloc_irq_index(iommu, devid, nr_irqs, align,
3450					msi_desc_to_pci_dev(info->desc));
3451	} else {
3452		index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
3453	}
3454
3455	if (index < 0) {
3456		pr_warn("Failed to allocate IRTE\n");
3457		ret = index;
3458		goto out_free_parent;
3459	}
3460
3461	for (i = 0; i < nr_irqs; i++) {
3462		irq_data = irq_domain_get_irq_data(domain, virq + i);
3463		cfg = irq_data ? irqd_cfg(irq_data) : NULL;
3464		if (!cfg) {
3465			ret = -EINVAL;
3466			goto out_free_data;
3467		}
3468
3469		ret = -ENOMEM;
3470		data = kzalloc(sizeof(*data), GFP_KERNEL);
3471		if (!data)
3472			goto out_free_data;
3473
3474		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
3475			data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
3476		else
3477			data->entry = kzalloc(sizeof(struct irte_ga),
3478						     GFP_KERNEL);
3479		if (!data->entry) {
3480			kfree(data);
3481			goto out_free_data;
3482		}
3483
3484		data->iommu = iommu;
3485		irq_data->hwirq = (devid << 16) + i;
3486		irq_data->chip_data = data;
3487		irq_data->chip = &amd_ir_chip;
3488		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
3489		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
3490	}
3491
3492	return 0;
3493
3494out_free_data:
3495	for (i--; i >= 0; i--) {
3496		irq_data = irq_domain_get_irq_data(domain, virq + i);
3497		if (irq_data)
3498			kfree(irq_data->chip_data);
3499	}
3500	for (i = 0; i < nr_irqs; i++)
3501		free_irte(iommu, devid, index + i);
3502out_free_parent:
3503	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3504	return ret;
3505}
3506
3507static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
3508			       unsigned int nr_irqs)
3509{
3510	struct irq_2_irte *irte_info;
3511	struct irq_data *irq_data;
3512	struct amd_ir_data *data;
3513	int i;
3514
3515	for (i = 0; i < nr_irqs; i++) {
3516		irq_data = irq_domain_get_irq_data(domain, virq  + i);
3517		if (irq_data && irq_data->chip_data) {
3518			data = irq_data->chip_data;
3519			irte_info = &data->irq_2_irte;
3520			free_irte(data->iommu, irte_info->devid, irte_info->index);
3521			kfree(data->entry);
3522			kfree(data);
3523		}
3524	}
3525	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3526}
3527
3528static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3529			       struct amd_ir_data *ir_data,
3530			       struct irq_2_irte *irte_info,
3531			       struct irq_cfg *cfg);
3532
3533static int irq_remapping_activate(struct irq_domain *domain,
3534				  struct irq_data *irq_data, bool reserve)
3535{
3536	struct amd_ir_data *data = irq_data->chip_data;
3537	struct irq_2_irte *irte_info = &data->irq_2_irte;
3538	struct amd_iommu *iommu = data->iommu;
3539	struct irq_cfg *cfg = irqd_cfg(irq_data);
3540
3541	if (!iommu)
3542		return 0;
3543
3544	iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
3545				  irte_info->index);
3546	amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
3547	return 0;
3548}
3549
3550static void irq_remapping_deactivate(struct irq_domain *domain,
3551				     struct irq_data *irq_data)
3552{
3553	struct amd_ir_data *data = irq_data->chip_data;
3554	struct irq_2_irte *irte_info = &data->irq_2_irte;
3555	struct amd_iommu *iommu = data->iommu;
3556
3557	if (iommu)
3558		iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
3559					    irte_info->index);
3560}
3561
3562static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
3563				enum irq_domain_bus_token bus_token)
3564{
3565	struct amd_iommu *iommu;
3566	int devid = -1;
3567
3568	if (!amd_iommu_irq_remap)
3569		return 0;
3570
3571	if (x86_fwspec_is_ioapic(fwspec))
3572		devid = get_ioapic_devid(fwspec->param[0]);
3573	else if (x86_fwspec_is_hpet(fwspec))
3574		devid = get_hpet_devid(fwspec->param[0]);
3575
3576	if (devid < 0)
3577		return 0;
3578	iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff));
3579
3580	return iommu && iommu->ir_domain == d;
3581}
3582
3583static const struct irq_domain_ops amd_ir_domain_ops = {
3584	.select = irq_remapping_select,
3585	.alloc = irq_remapping_alloc,
3586	.free = irq_remapping_free,
3587	.activate = irq_remapping_activate,
3588	.deactivate = irq_remapping_deactivate,
3589};
3590
3591int amd_iommu_activate_guest_mode(void *data)
3592{
3593	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3594	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3595	u64 valid;
3596
3597	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
3598		return 0;
3599
3600	valid = entry->lo.fields_vapic.valid;
3601
3602	entry->lo.val = 0;
3603	entry->hi.val = 0;
3604
3605	entry->lo.fields_vapic.valid       = valid;
3606	entry->lo.fields_vapic.guest_mode  = 1;
3607	entry->lo.fields_vapic.ga_log_intr = 1;
3608	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
3609	entry->hi.fields.vector            = ir_data->ga_vector;
3610	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
3611
3612	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3613			      ir_data->irq_2_irte.index, entry);
3614}
3615EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
3616
3617int amd_iommu_deactivate_guest_mode(void *data)
3618{
3619	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3620	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3621	struct irq_cfg *cfg = ir_data->cfg;
3622	u64 valid;
3623
3624	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3625	    !entry || !entry->lo.fields_vapic.guest_mode)
3626		return 0;
3627
3628	valid = entry->lo.fields_remap.valid;
3629
3630	entry->lo.val = 0;
3631	entry->hi.val = 0;
3632
3633	entry->lo.fields_remap.valid       = valid;
3634	entry->lo.fields_remap.dm          = apic->dest_mode_logical;
3635	entry->lo.fields_remap.int_type    = APIC_DELIVERY_MODE_FIXED;
3636	entry->hi.fields.vector            = cfg->vector;
3637	entry->lo.fields_remap.destination =
3638				APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
3639	entry->hi.fields.destination =
3640				APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
3641
3642	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3643			      ir_data->irq_2_irte.index, entry);
3644}
3645EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
3646
3647static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
3648{
3649	int ret;
3650	struct amd_iommu_pi_data *pi_data = vcpu_info;
3651	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
3652	struct amd_ir_data *ir_data = data->chip_data;
3653	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3654	struct iommu_dev_data *dev_data;
3655
3656	if (ir_data->iommu == NULL)
3657		return -EINVAL;
3658
3659	dev_data = search_dev_data(ir_data->iommu, irte_info->devid);
3660
3661	/* Note:
3662	 * This device has never been set up for guest mode.
3663	 * We should not modify the IRTE.
3664	 */
3665	if (!dev_data || !dev_data->use_vapic)
3666		return 0;
3667
3668	ir_data->cfg = irqd_cfg(data);
3669	pi_data->ir_data = ir_data;
3670
3671	/* Note:
3672	 * SVM tried to set up VAPIC mode, but the IOMMU is running in
3673	 * legacy interrupt mode, so force legacy mode instead.
3674	 */
3675	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
3676		pr_debug("%s: Fall back to using intr legacy remap\n",
3677			 __func__);
3678		pi_data->is_guest_mode = false;
3679	}
3680
3681	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
3682	if (pi_data->is_guest_mode) {
3683		ir_data->ga_root_ptr = (pi_data->base >> 12);
3684		ir_data->ga_vector = vcpu_pi_info->vector;
3685		ir_data->ga_tag = pi_data->ga_tag;
3686		ret = amd_iommu_activate_guest_mode(ir_data);
3687		if (!ret)
3688			ir_data->cached_ga_tag = pi_data->ga_tag;
3689	} else {
3690		ret = amd_iommu_deactivate_guest_mode(ir_data);
3691
3692		/*
3693		 * This communicates the ga_tag back to the caller
3694		 * so that it can do all the necessary clean up.
3695		 */
3696		if (!ret)
3697			ir_data->cached_ga_tag = 0;
3698	}
3699
3700	return ret;
3701}
3702
3703
3704static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3705			       struct amd_ir_data *ir_data,
3706			       struct irq_2_irte *irte_info,
3707			       struct irq_cfg *cfg)
3708{
3709
3710	/*
3711	 * Atomically update the IRTE with the new destination and vector,
3712	 * then flush the interrupt entry cache.
3713	 */
3714	iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
3715				      irte_info->index, cfg->vector,
3716				      cfg->dest_apicid);
3717}
3718
3719static int amd_ir_set_affinity(struct irq_data *data,
3720			       const struct cpumask *mask, bool force)
3721{
3722	struct amd_ir_data *ir_data = data->chip_data;
3723	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3724	struct irq_cfg *cfg = irqd_cfg(data);
3725	struct irq_data *parent = data->parent_data;
3726	struct amd_iommu *iommu = ir_data->iommu;
3727	int ret;
3728
3729	if (!iommu)
3730		return -ENODEV;
3731
3732	ret = parent->chip->irq_set_affinity(parent, mask, force);
3733	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
3734		return ret;
3735
3736	amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
3737	/*
3738	 * After this point, all interrupts will start arriving at the new
3739	 * destination, so it is time to clean up the previous vector
3740	 * allocation.
3741	 */
3742	vector_schedule_cleanup(cfg);
3743
3744	return IRQ_SET_MASK_OK_DONE;
3745}
3746
3747static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
3748{
3749	struct amd_ir_data *ir_data = irq_data->chip_data;
3750
3751	*msg = ir_data->msi_entry;
3752}
3753
3754static struct irq_chip amd_ir_chip = {
3755	.name			= "AMD-IR",
3756	.irq_ack		= apic_ack_irq,
3757	.irq_set_affinity	= amd_ir_set_affinity,
3758	.irq_set_vcpu_affinity	= amd_ir_set_vcpu_affinity,
3759	.irq_compose_msi_msg	= ir_compose_msi_msg,
3760};
3761
3762static const struct msi_parent_ops amdvi_msi_parent_ops = {
3763	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI,
3764	.prefix			= "IR-",
3765	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3766};
3767
3768int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
3769{
3770	struct fwnode_handle *fn;
3771
3772	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
3773	if (!fn)
3774		return -ENOMEM;
3775	iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0,
3776						       fn, &amd_ir_domain_ops, iommu);
3777	if (!iommu->ir_domain) {
3778		irq_domain_free_fwnode(fn);
3779		return -ENOMEM;
3780	}
3781
3782	irq_domain_update_bus_token(iommu->ir_domain,  DOMAIN_BUS_AMDVI);
3783	iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
3784				   IRQ_DOMAIN_FLAG_ISOLATED_MSI;
3785	iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops;
3786
3787	return 0;
3788}
3789
3790int amd_iommu_update_ga(int cpu, bool is_run, void *data)
3791{
3792	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3793	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3794
3795	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3796	    !entry || !entry->lo.fields_vapic.guest_mode)
3797		return 0;
3798
3799	if (!ir_data->iommu)
3800		return -ENODEV;
3801
3802	if (cpu >= 0) {
3803		entry->lo.fields_vapic.destination =
3804					APICID_TO_IRTE_DEST_LO(cpu);
3805		entry->hi.fields.destination =
3806					APICID_TO_IRTE_DEST_HI(cpu);
3807	}
3808	entry->lo.fields_vapic.is_run = is_run;
3809
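	/* Only isRun and destination changed; __modify_irte_ga() updates the IRTE without flushing the IRT */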
3810	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3811				ir_data->irq_2_irte.index, entry);
3812}
3813EXPORT_SYMBOL(amd_iommu_update_ga);
3814#endif