   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/crash_dump.h>
  17#include <linux/dma-direct.h>
  18#include <linux/dmi.h>
  19#include <linux/memory.h>
  20#include <linux/pci.h>
  21#include <linux/pci-ats.h>
  22#include <linux/spinlock.h>
  23#include <linux/syscore_ops.h>
  24#include <linux/tboot.h>
  25#include <uapi/linux/iommufd.h>
  26
  27#include "iommu.h"
  28#include "../dma-iommu.h"
  29#include "../irq_remapping.h"
  30#include "../iommu-pages.h"
  31#include "pasid.h"
  32#include "cap_audit.h"
  33#include "perfmon.h"
  34
  35#define ROOT_SIZE		VTD_PAGE_SIZE
  36#define CONTEXT_SIZE		VTD_PAGE_SIZE
  37
  38#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  39#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  40#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  41#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  42
  43#define IOAPIC_RANGE_START	(0xfee00000)
  44#define IOAPIC_RANGE_END	(0xfeefffff)
  45#define IOVA_START_ADDR		(0x1000)
  46
  47#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  48
  49#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
  50#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
  51
  52/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  53   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  54#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
  55				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  56#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
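/*
 * Worked example (illustrative; assumes 4KiB VT-d pages, i.e.
 * VTD_PAGE_SHIFT == 12): with the default 57-bit address width,
 * __DOMAIN_MAX_PFN(57) = (1ULL << 45) - 1 and DOMAIN_MAX_ADDR(57) is
 * just under 2^57 bytes (128 PiB).  On 32-bit builds DOMAIN_MAX_PFN()
 * clamps the result to ULONG_MAX so PFNs still fit in an unsigned long.
 */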
  57
  58static void __init check_tylersburg_isoch(void);
  59static int rwbf_quirk;
  60
  61/*
  62 * set to 1 to panic the kernel if VT-d can't be enabled successfully
  63 * (used when the kernel is launched with TXT)
  64 */
  65static int force_on = 0;
  66static int intel_iommu_tboot_noforce;
  67static int no_platform_optin;
  68
  69#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
  70
  71/*
  72 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
  73 * if marked present.
  74 */
  75static phys_addr_t root_entry_lctp(struct root_entry *re)
  76{
  77	if (!(re->lo & 1))
  78		return 0;
  79
  80	return re->lo & VTD_PAGE_MASK;
  81}
  82
  83/*
  84 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
  85 * if marked present.
  86 */
  87static phys_addr_t root_entry_uctp(struct root_entry *re)
  88{
  89	if (!(re->hi & 1))
  90		return 0;
  91
  92	return re->hi & VTD_PAGE_MASK;
  93}
  94
  95static int device_rid_cmp_key(const void *key, const struct rb_node *node)
  96{
  97	struct device_domain_info *info =
  98		rb_entry(node, struct device_domain_info, node);
  99	const u16 *rid_lhs = key;
 100
 101	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
 102		return -1;
 103
 104	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
 105		return 1;
 106
 107	return 0;
 108}
 109
 110static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
 111{
 112	struct device_domain_info *info =
 113		rb_entry(lhs, struct device_domain_info, node);
 114	u16 key = PCI_DEVID(info->bus, info->devfn);
 115
 116	return device_rid_cmp_key(&key, rhs);
 117}
 118
 119/*
 120 * Looks up an IOMMU-probed device using its source ID.
 121 *
 122 * Returns the pointer to the device if there is a match. Otherwise,
 123 * returns NULL.
 124 *
 125 * Note that this helper doesn't guarantee that the device won't be
 126 * released by the iommu subsystem after being returned. The caller
 127 * should use its own synchronization mechanism to avoid the device
 128 * being released while it is in use, if that is a possibility.
 129 */
 130struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
 131{
 132	struct device_domain_info *info = NULL;
 133	struct rb_node *node;
 134	unsigned long flags;
 135
 136	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
 137	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
 138	if (node)
 139		info = rb_entry(node, struct device_domain_info, node);
 140	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
 141
 142	return info ? info->dev : NULL;
 143}
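/*
 * Usage sketch (hypothetical caller, not from this file): a fault handler
 * that extracted a source ID from a fault record could resolve it to a
 * device for reporting:
 *
 *	struct device *dev = device_rbtree_find(iommu, rid);
 *
 *	if (dev)
 *		dev_warn(dev, "address translation fault\n");
 *
 * As noted above, such a caller must provide its own guarantee that the
 * device is not released while it is being used.
 */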
 144
 145static int device_rbtree_insert(struct intel_iommu *iommu,
 146				struct device_domain_info *info)
 147{
 148	struct rb_node *curr;
 149	unsigned long flags;
 150
 151	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
 152	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
 153	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
 154	if (WARN_ON(curr))
 155		return -EEXIST;
 156
 157	return 0;
 158}
 159
 160static void device_rbtree_remove(struct device_domain_info *info)
 161{
 162	struct intel_iommu *iommu = info->iommu;
 163	unsigned long flags;
 164
 165	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
 166	rb_erase(&info->node, &iommu->device_rbtree);
 167	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
 168}
 169
 170struct dmar_rmrr_unit {
 171	struct list_head list;		/* list of rmrr units	*/
 172	struct acpi_dmar_header *hdr;	/* ACPI header		*/
 173	u64	base_address;		/* reserved base address*/
 174	u64	end_address;		/* reserved end address */
 175	struct dmar_dev_scope *devices;	/* target devices */
 176	int	devices_cnt;		/* target device count */
 177};
 178
 179struct dmar_atsr_unit {
 180	struct list_head list;		/* list of ATSR units */
 181	struct acpi_dmar_header *hdr;	/* ACPI header */
 182	struct dmar_dev_scope *devices;	/* target devices */
 183	int devices_cnt;		/* target device count */
 184	u8 include_all:1;		/* include all ports */
 185};
 186
 187struct dmar_satc_unit {
 188	struct list_head list;		/* list of SATC units */
 189	struct acpi_dmar_header *hdr;	/* ACPI header */
 190	struct dmar_dev_scope *devices;	/* target devices */
 191	struct intel_iommu *iommu;	/* the corresponding iommu */
 192	int devices_cnt;		/* target device count */
 193	u8 atc_required:1;		/* ATS is required */
 194};
 195
 196static LIST_HEAD(dmar_atsr_units);
 197static LIST_HEAD(dmar_rmrr_units);
 198static LIST_HEAD(dmar_satc_units);
 199
 200#define for_each_rmrr_units(rmrr) \
 201	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 202
 203static void intel_iommu_domain_free(struct iommu_domain *domain);
 204
 205int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
 206int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
 207
 208int intel_iommu_enabled = 0;
 209EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 210
 211static int intel_iommu_superpage = 1;
 212static int iommu_identity_mapping;
 213static int iommu_skip_te_disable;
 214static int disable_igfx_iommu;
 215
 216#define IDENTMAP_AZALIA		4
 217
 218const struct iommu_ops intel_iommu_ops;
 219static const struct iommu_dirty_ops intel_dirty_ops;
 220
 221static bool translation_pre_enabled(struct intel_iommu *iommu)
 222{
 223	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 224}
 225
 226static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 227{
 228	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 229}
 230
 231static void init_translation_status(struct intel_iommu *iommu)
 232{
 233	u32 gsts;
 234
 235	gsts = readl(iommu->reg + DMAR_GSTS_REG);
 236	if (gsts & DMA_GSTS_TES)
 237		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 238}
 239
 240static int __init intel_iommu_setup(char *str)
 241{
 242	if (!str)
 243		return -EINVAL;
 244
 245	while (*str) {
 246		if (!strncmp(str, "on", 2)) {
 247			dmar_disabled = 0;
 248			pr_info("IOMMU enabled\n");
 249		} else if (!strncmp(str, "off", 3)) {
 250			dmar_disabled = 1;
 251			no_platform_optin = 1;
 252			pr_info("IOMMU disabled\n");
 253		} else if (!strncmp(str, "igfx_off", 8)) {
 254			disable_igfx_iommu = 1;
 255			pr_info("Disable GFX device mapping\n");
 256		} else if (!strncmp(str, "forcedac", 8)) {
 257			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
 258			iommu_dma_forcedac = true;
 259		} else if (!strncmp(str, "strict", 6)) {
 260			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
 261			iommu_set_dma_strict();
 262		} else if (!strncmp(str, "sp_off", 6)) {
 263			pr_info("Disable supported super page\n");
 264			intel_iommu_superpage = 0;
 265		} else if (!strncmp(str, "sm_on", 5)) {
 266			pr_info("Enable scalable mode if hardware supports\n");
 267			intel_iommu_sm = 1;
 268		} else if (!strncmp(str, "sm_off", 6)) {
 269			pr_info("Scalable mode is disallowed\n");
 270			intel_iommu_sm = 0;
 271		} else if (!strncmp(str, "tboot_noforce", 13)) {
 272			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 273			intel_iommu_tboot_noforce = 1;
 274		} else {
 275			pr_notice("Unknown option - '%s'\n", str);
 276		}
 277
 278		str += strcspn(str, ",");
 279		while (*str == ',')
 280			str++;
 281	}
 282
 283	return 1;
 284}
 285__setup("intel_iommu=", intel_iommu_setup);
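/*
 * Usage example (illustrative): the options parsed above are comma
 * separated on the kernel command line, e.g.
 *
 *	intel_iommu=on,sm_on,igfx_off
 *
 * enables the IOMMU, requests scalable mode where the hardware supports
 * it, and disables IOMMU mapping for the integrated graphics device.
 */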
 286
 287static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
 288{
 289	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 290
 291	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 292}
 293
 294/*
 295 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 296 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 297 * the returned SAGAW.
 298 */
 299static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
 300{
 301	unsigned long fl_sagaw, sl_sagaw;
 302
 303	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
 304	sl_sagaw = cap_sagaw(iommu->cap);
 305
 306	/* Second level only. */
 307	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
 308		return sl_sagaw;
 309
 310	/* First level only. */
 311	if (!ecap_slts(iommu->ecap))
 312		return fl_sagaw;
 313
 314	return fl_sagaw & sl_sagaw;
 315}
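/*
 * Note on the encoding (hedged reading of the SAGAW bits referenced
 * above): BIT(2) corresponds to 4-level (48-bit) paging and BIT(3) to
 * 5-level (57-bit) paging.  Hardware that supports both translation
 * types, with 5-level first-level paging and a second-level SAGAW of
 * BIT(2) | BIT(3), would therefore report BIT(2) | BIT(3) here.
 */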
 316
 317static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 318{
 319	unsigned long sagaw;
 320	int agaw;
 321
 322	sagaw = __iommu_calculate_sagaw(iommu);
 323	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
 324		if (test_bit(agaw, &sagaw))
 325			break;
 326	}
 327
 328	return agaw;
 329}
 330
 331/*
 332 * Calculate max SAGAW for each iommu.
 333 */
 334int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 335{
 336	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 337}
 338
 339/*
 340 * Calculate the agaw for each iommu.
 341 * "SAGAW" may differ across iommus, so start from a default agaw and fall
 342 * back to a smaller supported agaw for iommus that don't support the default.
 343 */
 344int iommu_calculate_agaw(struct intel_iommu *iommu)
 345{
 346	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 347}
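/*
 * Illustration (assumes the usual width = 30 + 9 * agaw encoding): for the
 * default 57-bit width the search starts at agaw 3 (5-level) and falls
 * back to agaw 2 (4-level, 48-bit) when the computed SAGAW lacks BIT(3).
 */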
 348
 349static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 350{
 351	return sm_supported(iommu) ?
 352			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 353}
 354
 355/* Return the super pagesize bitmap if supported. */
 356static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
 357{
 358	unsigned long bitmap = 0;
 359
 360	/*
 361	 * 1-level super page supports page size of 2MiB, 2-level super page
 362	 * supports page size of both 2MiB and 1GiB.
 363	 */
 364	if (domain->iommu_superpage == 1)
 365		bitmap |= SZ_2M;
 366	else if (domain->iommu_superpage == 2)
 367		bitmap |= SZ_2M | SZ_1G;
 368
 369	return bitmap;
 370}
 371
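/*
 * Descriptive summary of the lookup below: return (and optionally
 * allocate) the context entry for (@bus, @devfn).  In scalable mode each
 * root entry is split in two: root->lo points to the context table for
 * devfn 0x00-0x7f and root->hi to the table for devfn 0x80-0xff, with
 * every device occupying two consecutive slots (hence "devfn *= 2").
 */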
 372struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 373					 u8 devfn, int alloc)
 374{
 375	struct root_entry *root = &iommu->root_entry[bus];
 376	struct context_entry *context;
 377	u64 *entry;
 378
 379	/*
  380	 * Unless the caller requested allocation of a new entry, returning
  381	 * a copied context entry makes no sense.
 382	 */
 383	if (!alloc && context_copied(iommu, bus, devfn))
 384		return NULL;
 385
 386	entry = &root->lo;
 387	if (sm_supported(iommu)) {
 388		if (devfn >= 0x80) {
 389			devfn -= 0x80;
 390			entry = &root->hi;
 391		}
 392		devfn *= 2;
 393	}
 394	if (*entry & 1)
 395		context = phys_to_virt(*entry & VTD_PAGE_MASK);
 396	else {
 397		unsigned long phy_addr;
 398		if (!alloc)
 399			return NULL;
 400
 401		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
 402		if (!context)
 403			return NULL;
 404
 405		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 406		phy_addr = virt_to_phys((void *)context);
 407		*entry = phy_addr | 1;
 408		__iommu_flush_cache(iommu, entry, sizeof(*entry));
 409	}
 410	return &context[devfn];
 411}
 412
 413/**
 414 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 415 *				 sub-hierarchy of a candidate PCI-PCI bridge
 416 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 417 * @bridge: the candidate PCI-PCI bridge
 418 *
 419 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 420 */
 421static bool
 422is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 423{
 424	struct pci_dev *pdev, *pbridge;
 425
 426	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 427		return false;
 428
 429	pdev = to_pci_dev(dev);
 430	pbridge = to_pci_dev(bridge);
 431
 432	if (pbridge->subordinate &&
 433	    pbridge->subordinate->number <= pdev->bus->number &&
 434	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
 435		return true;
 436
 437	return false;
 438}
 439
 440static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 441{
 442	struct dmar_drhd_unit *drhd;
 443	u32 vtbar;
 444	int rc;
 445
 446	/* We know that this device on this chipset has its own IOMMU.
 447	 * If we find it under a different IOMMU, then the BIOS is lying
 448	 * to us. Hope that the IOMMU for this device is actually
 449	 * disabled, and it needs no translation...
 450	 */
 451	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 452	if (rc) {
 453		/* "can't" happen */
 454		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 455		return false;
 456	}
 457	vtbar &= 0xffff0000;
 458
 459	/* we know that this iommu should be at offset 0xa000 from vtbar */
 460	drhd = dmar_find_matched_drhd_unit(pdev);
 461	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 462		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 463		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 464		return true;
 465	}
 466
 467	return false;
 468}
 469
 470static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 471{
 472	if (!iommu || iommu->drhd->ignored)
 473		return true;
 474
 475	if (dev_is_pci(dev)) {
 476		struct pci_dev *pdev = to_pci_dev(dev);
 477
 478		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 479		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 480		    quirk_ioat_snb_local_iommu(pdev))
 481			return true;
 482	}
 483
 484	return false;
 485}
 486
 487static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
 488{
 489	struct dmar_drhd_unit *drhd = NULL;
 490	struct pci_dev *pdev = NULL;
 491	struct intel_iommu *iommu;
 492	struct device *tmp;
 493	u16 segment = 0;
 494	int i;
 495
 496	if (!dev)
 497		return NULL;
 498
 499	if (dev_is_pci(dev)) {
 500		struct pci_dev *pf_pdev;
 501
 502		pdev = pci_real_dma_dev(to_pci_dev(dev));
 503
 504		/* VFs aren't listed in scope tables; we need to look up
 505		 * the PF instead to find the IOMMU. */
 506		pf_pdev = pci_physfn(pdev);
 507		dev = &pf_pdev->dev;
 508		segment = pci_domain_nr(pdev->bus);
 509	} else if (has_acpi_companion(dev))
 510		dev = &ACPI_COMPANION(dev)->dev;
 511
 512	rcu_read_lock();
 513	for_each_iommu(iommu, drhd) {
 514		if (pdev && segment != drhd->segment)
 515			continue;
 516
 517		for_each_active_dev_scope(drhd->devices,
 518					  drhd->devices_cnt, i, tmp) {
 519			if (tmp == dev) {
 520				/* For a VF use its original BDF# not that of the PF
 521				 * which we used for the IOMMU lookup. Strictly speaking
 522				 * we could do this for all PCI devices; we only need to
 523				 * get the BDF# from the scope table for ACPI matches. */
 524				if (pdev && pdev->is_virtfn)
 525					goto got_pdev;
 526
 527				if (bus && devfn) {
 528					*bus = drhd->devices[i].bus;
 529					*devfn = drhd->devices[i].devfn;
 530				}
 531				goto out;
 532			}
 533
 534			if (is_downstream_to_pci_bridge(dev, tmp))
 535				goto got_pdev;
 536		}
 537
 538		if (pdev && drhd->include_all) {
 539got_pdev:
 540			if (bus && devfn) {
 541				*bus = pdev->bus->number;
 542				*devfn = pdev->devfn;
 543			}
 544			goto out;
 545		}
 546	}
 547	iommu = NULL;
 548out:
 549	if (iommu_is_dummy(iommu, dev))
 550		iommu = NULL;
 551
 552	rcu_read_unlock();
 553
 554	return iommu;
 555}
 556
 557static void domain_flush_cache(struct dmar_domain *domain,
 558			       void *addr, int size)
 559{
 560	if (!domain->iommu_coherency)
 561		clflush_cache_range(addr, size);
 562}
 563
 564static void free_context_table(struct intel_iommu *iommu)
 565{
 566	struct context_entry *context;
 567	int i;
 568
 569	if (!iommu->root_entry)
 570		return;
 571
 572	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 573		context = iommu_context_addr(iommu, i, 0, 0);
 574		if (context)
 575			iommu_free_page(context);
 576
 577		if (!sm_supported(iommu))
 578			continue;
 579
 580		context = iommu_context_addr(iommu, i, 0x80, 0);
 581		if (context)
 582			iommu_free_page(context);
 583	}
 584
 585	iommu_free_page(iommu->root_entry);
 586	iommu->root_entry = NULL;
 587}
 588
 589#ifdef CONFIG_DMAR_DEBUG
 590static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
 591			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
 592{
 593	struct dma_pte *pte;
 594	int offset;
 595
 596	while (1) {
 597		offset = pfn_level_offset(pfn, level);
 598		pte = &parent[offset];
 599
 600		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
 601
 602		if (!dma_pte_present(pte)) {
 603			pr_info("page table not present at level %d\n", level - 1);
 604			break;
 605		}
 606
 607		if (level == 1 || dma_pte_superpage(pte))
 608			break;
 609
 610		parent = phys_to_virt(dma_pte_addr(pte));
 611		level--;
 612	}
 613}
 614
 615void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
 616			  unsigned long long addr, u32 pasid)
 617{
 618	struct pasid_dir_entry *dir, *pde;
 619	struct pasid_entry *entries, *pte;
 620	struct context_entry *ctx_entry;
 621	struct root_entry *rt_entry;
 622	int i, dir_index, index, level;
 623	u8 devfn = source_id & 0xff;
 624	u8 bus = source_id >> 8;
 625	struct dma_pte *pgtable;
 626
 627	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
 628
 629	/* root entry dump */
 630	if (!iommu->root_entry) {
 631		pr_info("root table is not present\n");
 632		return;
 633	}
 634	rt_entry = &iommu->root_entry[bus];
 635
 636	if (sm_supported(iommu))
 637		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
 638			rt_entry->hi, rt_entry->lo);
 639	else
 640		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
 641
 642	/* context entry dump */
 643	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
 644	if (!ctx_entry) {
 645		pr_info("context table is not present\n");
 646		return;
 647	}
 648
 649	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
 650		ctx_entry->hi, ctx_entry->lo);
 651
 652	/* legacy mode does not require PASID entries */
 653	if (!sm_supported(iommu)) {
 654		if (!context_present(ctx_entry)) {
 655			pr_info("legacy mode page table is not present\n");
 656			return;
 657		}
 658		level = agaw_to_level(ctx_entry->hi & 7);
 659		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 660		goto pgtable_walk;
 661	}
 662
 663	if (!context_present(ctx_entry)) {
 664		pr_info("pasid directory table is not present\n");
 665		return;
 666	}
 667
 668	/* get the pointer to pasid directory entry */
 669	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 670
 671	/* For request-without-pasid, get the pasid from context entry */
 672	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
 673		pasid = IOMMU_NO_PASID;
 674
 675	dir_index = pasid >> PASID_PDE_SHIFT;
 676	pde = &dir[dir_index];
 677	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
 678
 679	/* get the pointer to the pasid table entry */
 680	entries = get_pasid_table_from_pde(pde);
 681	if (!entries) {
 682		pr_info("pasid table is not present\n");
 683		return;
 684	}
 685	index = pasid & PASID_PTE_MASK;
 686	pte = &entries[index];
 687	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
 688		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
 689
 690	if (!pasid_pte_is_present(pte)) {
 691		pr_info("scalable mode page table is not present\n");
 692		return;
 693	}
 694
 695	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
 696		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
 697		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
 698	} else {
 699		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
 700		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
 701	}
 702
 703pgtable_walk:
 704	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
 705}
 706#endif
 707
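/*
 * Descriptive summary of the walk below: descend the domain's page table
 * towards @pfn.  With a non-zero *target_level, missing intermediate
 * tables are allocated with @gfp until that level is reached.  With
 * *target_level == 0 nothing is allocated; the walk follows present
 * entries and stops at the first superpage, non-present or level-1 entry,
 * updating *target_level to the level actually reached.
 */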
 708static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 709				      unsigned long pfn, int *target_level,
 710				      gfp_t gfp)
 711{
 712	struct dma_pte *parent, *pte;
 713	int level = agaw_to_level(domain->agaw);
 714	int offset;
 715
 716	if (!domain_pfn_supported(domain, pfn))
 717		/* Address beyond IOMMU's addressing capabilities. */
 718		return NULL;
 719
 720	parent = domain->pgd;
 721
 722	while (1) {
 723		void *tmp_page;
 724
 725		offset = pfn_level_offset(pfn, level);
 726		pte = &parent[offset];
 727		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 728			break;
 729		if (level == *target_level)
 730			break;
 731
 732		if (!dma_pte_present(pte)) {
 733			uint64_t pteval, tmp;
 734
 735			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
 736
 737			if (!tmp_page)
 738				return NULL;
 739
 740			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 741			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 742			if (domain->use_first_level)
 743				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
 744
 745			tmp = 0ULL;
 746			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
 747				/* Someone else set it while we were thinking; use theirs. */
 748				iommu_free_page(tmp_page);
 749			else
 750				domain_flush_cache(domain, pte, sizeof(*pte));
 751		}
 752		if (level == 1)
 753			break;
 754
 755		parent = phys_to_virt(dma_pte_addr(pte));
 756		level--;
 757	}
 758
 759	if (!*target_level)
 760		*target_level = level;
 761
 762	return pte;
 763}
 764
 765/* Return the pte of an address at a specific level. */
 766static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 767					 unsigned long pfn,
 768					 int level, int *large_page)
 769{
 770	struct dma_pte *parent, *pte;
 771	int total = agaw_to_level(domain->agaw);
 772	int offset;
 773
 774	parent = domain->pgd;
 775	while (level <= total) {
 776		offset = pfn_level_offset(pfn, total);
 777		pte = &parent[offset];
 778		if (level == total)
 779			return pte;
 780
 781		if (!dma_pte_present(pte)) {
 782			*large_page = total;
 783			break;
 784		}
 785
 786		if (dma_pte_superpage(pte)) {
 787			*large_page = total;
 788			return pte;
 789		}
 790
 791		parent = phys_to_virt(dma_pte_addr(pte));
 792		total--;
 793	}
 794	return NULL;
 795}
 796
 797/* Clear the last-level ptes; a TLB flush must follow. */
 798static void dma_pte_clear_range(struct dmar_domain *domain,
 799				unsigned long start_pfn,
 800				unsigned long last_pfn)
 801{
 802	unsigned int large_page;
 803	struct dma_pte *first_pte, *pte;
 804
 805	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
 806	    WARN_ON(start_pfn > last_pfn))
 807		return;
 808
 809	/* we don't need lock here; nobody else touches the iova range */
 810	do {
 811		large_page = 1;
 812		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 813		if (!pte) {
 814			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 815			continue;
 816		}
 817		do {
 818			dma_clear_pte(pte);
 819			start_pfn += lvl_to_nr_pages(large_page);
 820			pte++;
 821		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 822
 823		domain_flush_cache(domain, first_pte,
 824				   (void *)pte - (void *)first_pte);
 825
 826	} while (start_pfn && start_pfn <= last_pfn);
 827}
 828
 829static void dma_pte_free_level(struct dmar_domain *domain, int level,
 830			       int retain_level, struct dma_pte *pte,
 831			       unsigned long pfn, unsigned long start_pfn,
 832			       unsigned long last_pfn)
 833{
 834	pfn = max(start_pfn, pfn);
 835	pte = &pte[pfn_level_offset(pfn, level)];
 836
 837	do {
 838		unsigned long level_pfn;
 839		struct dma_pte *level_pte;
 840
 841		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
 842			goto next;
 843
 844		level_pfn = pfn & level_mask(level);
 845		level_pte = phys_to_virt(dma_pte_addr(pte));
 846
 847		if (level > 2) {
 848			dma_pte_free_level(domain, level - 1, retain_level,
 849					   level_pte, level_pfn, start_pfn,
 850					   last_pfn);
 851		}
 852
 853		/*
 854		 * Free the page table if we're below the level we want to
 855		 * retain and the range covers the entire table.
 856		 */
 857		if (level < retain_level && !(start_pfn > level_pfn ||
 858		      last_pfn < level_pfn + level_size(level) - 1)) {
 859			dma_clear_pte(pte);
 860			domain_flush_cache(domain, pte, sizeof(*pte));
 861			iommu_free_page(level_pte);
 862		}
 863next:
 864		pfn += level_size(level);
 865	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
 866}
 867
 868/*
 869 * clear last level (leaf) ptes and free page table pages below the
 870 * level we wish to keep intact.
 871 */
 872static void dma_pte_free_pagetable(struct dmar_domain *domain,
 873				   unsigned long start_pfn,
 874				   unsigned long last_pfn,
 875				   int retain_level)
 876{
 877	dma_pte_clear_range(domain, start_pfn, last_pfn);
 878
 879	/* We don't need lock here; nobody else touches the iova range */
 880	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
 881			   domain->pgd, 0, start_pfn, last_pfn);
 882
 883	/* free pgd */
 884	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 885		iommu_free_page(domain->pgd);
 886		domain->pgd = NULL;
 887	}
 888}
 889
 890/* When a page at a given level is being unlinked from its parent, we don't
 891   need to *modify* it at all. All we need to do is make a list of all the
 892   pages which can be freed just as soon as we've flushed the IOTLB and we
 893   know the hardware page-walk will no longer touch them.
 894   The 'pte' argument is the *parent* PTE, pointing to the page that is to
 895   be freed. */
 896static void dma_pte_list_pagetables(struct dmar_domain *domain,
 897				    int level, struct dma_pte *pte,
 898				    struct list_head *freelist)
 899{
 900	struct page *pg;
 901
 902	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
 903	list_add_tail(&pg->lru, freelist);
 904
 905	if (level == 1)
 906		return;
 907
 908	pte = page_address(pg);
 909	do {
 910		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
 911			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
 912		pte++;
 913	} while (!first_pte_in_page(pte));
 914}
 915
 916static void dma_pte_clear_level(struct dmar_domain *domain, int level,
 917				struct dma_pte *pte, unsigned long pfn,
 918				unsigned long start_pfn, unsigned long last_pfn,
 919				struct list_head *freelist)
 920{
 921	struct dma_pte *first_pte = NULL, *last_pte = NULL;
 922
 923	pfn = max(start_pfn, pfn);
 924	pte = &pte[pfn_level_offset(pfn, level)];
 925
 926	do {
 927		unsigned long level_pfn = pfn & level_mask(level);
 928
 929		if (!dma_pte_present(pte))
 930			goto next;
 931
 932		/* If range covers entire pagetable, free it */
 933		if (start_pfn <= level_pfn &&
 934		    last_pfn >= level_pfn + level_size(level) - 1) {
 935			/* These subordinate page tables are going away entirely. Don't
 936			   bother to clear them; we're just going to *free* them. */
 937			if (level > 1 && !dma_pte_superpage(pte))
 938				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
 939
 940			dma_clear_pte(pte);
 941			if (!first_pte)
 942				first_pte = pte;
 943			last_pte = pte;
 944		} else if (level > 1) {
 945			/* Recurse down into a level that isn't *entirely* obsolete */
 946			dma_pte_clear_level(domain, level - 1,
 947					    phys_to_virt(dma_pte_addr(pte)),
 948					    level_pfn, start_pfn, last_pfn,
 949					    freelist);
 950		}
 951next:
 952		pfn = level_pfn + level_size(level);
 953	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
 954
 955	if (first_pte)
 956		domain_flush_cache(domain, first_pte,
 957				   (void *)++last_pte - (void *)first_pte);
 958}
 959
 960/* We can't just free the pages because the IOMMU may still be walking
 961   the page tables, and may have cached the intermediate levels. The
 962   pages can only be freed after the IOTLB flush has been done. */
 963static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
 964			 unsigned long last_pfn, struct list_head *freelist)
 965{
 966	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
 967	    WARN_ON(start_pfn > last_pfn))
 968		return;
 969
 970	/* we don't need lock here; nobody else touches the iova range */
 971	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
 972			    domain->pgd, 0, start_pfn, last_pfn, freelist);
 973
 974	/* free pgd */
 975	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 976		struct page *pgd_page = virt_to_page(domain->pgd);
 977		list_add_tail(&pgd_page->lru, freelist);
 978		domain->pgd = NULL;
 979	}
 980}
 981
 982/* iommu handling */
 983static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 984{
 985	struct root_entry *root;
 986
 987	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
 988	if (!root) {
 989		pr_err("Allocating root entry for %s failed\n",
 990			iommu->name);
 991		return -ENOMEM;
 992	}
 993
 994	__iommu_flush_cache(iommu, root, ROOT_SIZE);
 995	iommu->root_entry = root;
 996
 997	return 0;
 998}
 999
1000static void iommu_set_root_entry(struct intel_iommu *iommu)
1001{
1002	u64 addr;
1003	u32 sts;
1004	unsigned long flag;
1005
1006	addr = virt_to_phys(iommu->root_entry);
1007	if (sm_supported(iommu))
1008		addr |= DMA_RTADDR_SMT;
1009
1010	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1011	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1012
1013	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1014
1015	/* Make sure hardware complete it */
1016	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1017		      readl, (sts & DMA_GSTS_RTPS), sts);
1018
1019	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1020
1021	/*
1022	 * Hardware invalidates all DMA remapping hardware translation
1023	 * caches as part of SRTP flow.
1024	 */
1025	if (cap_esrtps(iommu->cap))
1026		return;
1027
1028	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1029	if (sm_supported(iommu))
1030		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1031	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1032}
1033
1034void iommu_flush_write_buffer(struct intel_iommu *iommu)
1035{
1036	u32 val;
1037	unsigned long flag;
1038
1039	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1040		return;
1041
1042	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1044
1045	/* Make sure hardware complete it */
1046	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1047		      readl, (!(val & DMA_GSTS_WBFS)), val);
1048
1049	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1050}
1051
1052/* Invalidate the context cache via the register-based interface. */
1053static void __iommu_flush_context(struct intel_iommu *iommu,
1054				  u16 did, u16 source_id, u8 function_mask,
1055				  u64 type)
1056{
1057	u64 val = 0;
1058	unsigned long flag;
1059
1060	switch (type) {
1061	case DMA_CCMD_GLOBAL_INVL:
1062		val = DMA_CCMD_GLOBAL_INVL;
1063		break;
1064	case DMA_CCMD_DOMAIN_INVL:
1065		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1066		break;
1067	case DMA_CCMD_DEVICE_INVL:
1068		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1069			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1070		break;
1071	default:
1072		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1073			iommu->name, type);
1074		return;
1075	}
1076	val |= DMA_CCMD_ICC;
1077
1078	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1080
1081	/* Make sure hardware complete it */
1082	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1083		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1084
1085	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086}
1087
1088void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1089			 unsigned int size_order, u64 type)
1090{
1091	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1092	u64 val = 0, val_iva = 0;
1093	unsigned long flag;
1094
1095	switch (type) {
1096	case DMA_TLB_GLOBAL_FLUSH:
1097		/* a global flush doesn't need to set IVA_REG */
1098		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1099		break;
1100	case DMA_TLB_DSI_FLUSH:
1101		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1102		break;
1103	case DMA_TLB_PSI_FLUSH:
1104		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1105		/* IH bit is passed in as part of address */
1106		val_iva = size_order | addr;
1107		break;
1108	default:
1109		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1110			iommu->name, type);
1111		return;
1112	}
1113
1114	if (cap_write_drain(iommu->cap))
1115		val |= DMA_TLB_WRITE_DRAIN;
1116
1117	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1118	/* Note: Only uses first TLB reg currently */
1119	if (val_iva)
1120		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1121	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1122
1123	/* Make sure hardware complete it */
1124	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1125		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1126
1127	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1128
1129	/* check IOTLB invalidation granularity */
1130	if (DMA_TLB_IAIG(val) == 0)
1131		pr_err("Flush IOTLB failed\n");
1132	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1133		pr_debug("TLB flush request %Lx, actual %Lx\n",
1134			(unsigned long long)DMA_TLB_IIRG(type),
1135			(unsigned long long)DMA_TLB_IAIG(val));
1136}
1137
1138static struct device_domain_info *
1139domain_lookup_dev_info(struct dmar_domain *domain,
1140		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1141{
1142	struct device_domain_info *info;
1143	unsigned long flags;
1144
1145	spin_lock_irqsave(&domain->lock, flags);
1146	list_for_each_entry(info, &domain->devices, link) {
1147		if (info->iommu == iommu && info->bus == bus &&
1148		    info->devfn == devfn) {
1149			spin_unlock_irqrestore(&domain->lock, flags);
1150			return info;
1151		}
1152	}
1153	spin_unlock_irqrestore(&domain->lock, flags);
1154
1155	return NULL;
1156}
1157
1158/*
1159 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1160 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1161 * check because it applies only to the built-in QAT devices and it doesn't
1162 * grant additional privileges.
1163 */
1164#define BUGGY_QAT_DEVID_MASK 0x4940
1165static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1166{
1167	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1168		return false;
1169
1170	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1171		return false;
1172
1173	return true;
1174}
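/*
 * Arithmetic note: masking with 0xfffc clears the two low bits, so device
 * IDs 0x4940, 0x4941, 0x4942 and 0x4943 all compare equal to
 * BUGGY_QAT_DEVID_MASK, matching exactly the range called out above.
 */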
1175
1176static void iommu_enable_pci_caps(struct device_domain_info *info)
1177{
1178	struct pci_dev *pdev;
1179
1180	if (!dev_is_pci(info->dev))
1181		return;
1182
1183	pdev = to_pci_dev(info->dev);
1184	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1185	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1186		info->ats_enabled = 1;
1187}
1188
1189static void iommu_disable_pci_caps(struct device_domain_info *info)
1190{
1191	struct pci_dev *pdev;
1192
1193	if (!dev_is_pci(info->dev))
1194		return;
1195
1196	pdev = to_pci_dev(info->dev);
1197
1198	if (info->ats_enabled) {
1199		pci_disable_ats(pdev);
1200		info->ats_enabled = 0;
1201	}
1202}
1203
1204static void intel_flush_iotlb_all(struct iommu_domain *domain)
1205{
1206	cache_tag_flush_all(to_dmar_domain(domain));
1207}
1208
1209static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1210{
1211	u32 pmen;
1212	unsigned long flags;
1213
1214	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1215		return;
1216
1217	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1219	pmen &= ~DMA_PMEN_EPM;
1220	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1221
1222	/* wait for the protected region status bit to clear */
1223	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1224		readl, !(pmen & DMA_PMEN_PRS), pmen);
1225
1226	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1227}
1228
1229static void iommu_enable_translation(struct intel_iommu *iommu)
1230{
1231	u32 sts;
1232	unsigned long flags;
1233
1234	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1235	iommu->gcmd |= DMA_GCMD_TE;
1236	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237
1238	/* Make sure hardware complete it */
1239	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240		      readl, (sts & DMA_GSTS_TES), sts);
1241
1242	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1243}
1244
1245static void iommu_disable_translation(struct intel_iommu *iommu)
1246{
1247	u32 sts;
1248	unsigned long flag;
1249
1250	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1251	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1252		return;
1253
1254	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1255	iommu->gcmd &= ~DMA_GCMD_TE;
1256	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1257
1258	/* Make sure hardware complete it */
1259	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1260		      readl, (!(sts & DMA_GSTS_TES)), sts);
1261
1262	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1263}
1264
1265static int iommu_init_domains(struct intel_iommu *iommu)
1266{
1267	u32 ndomains;
1268
1269	ndomains = cap_ndoms(iommu->cap);
1270	pr_debug("%s: Number of Domains supported <%d>\n",
1271		 iommu->name, ndomains);
1272
1273	spin_lock_init(&iommu->lock);
1274
1275	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1276	if (!iommu->domain_ids)
1277		return -ENOMEM;
1278
1279	/*
1280	 * If Caching mode is set, then invalid translations are tagged
1281	 * with domain-id 0, hence we need to pre-allocate it. We also
1282	 * use domain-id 0 as a marker for non-allocated domain-id, so
1283	 * make sure it is not used for a real domain.
1284	 */
1285	set_bit(0, iommu->domain_ids);
1286
1287	/*
1288	 * VT-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1289	 * entry for first-level or pass-through translation modes should
1290	 * be programmed with a domain id different from those used for
1291	 * second-level or nested translation. We reserve a domain id for
1292	 * this purpose. This domain id is also used for identity domain
1293	 * in legacy mode.
1294	 */
1295	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1296
1297	return 0;
1298}
1299
1300static void disable_dmar_iommu(struct intel_iommu *iommu)
1301{
1302	if (!iommu->domain_ids)
1303		return;
1304
1305	/*
1306	 * All iommu domains must have been detached from the devices,
1307	 * hence there should be no domain IDs in use.
1308	 */
1309	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1310		    > NUM_RESERVED_DID))
1311		return;
1312
1313	if (iommu->gcmd & DMA_GCMD_TE)
1314		iommu_disable_translation(iommu);
1315}
1316
1317static void free_dmar_iommu(struct intel_iommu *iommu)
1318{
1319	if (iommu->domain_ids) {
1320		bitmap_free(iommu->domain_ids);
1321		iommu->domain_ids = NULL;
1322	}
1323
1324	if (iommu->copied_tables) {
1325		bitmap_free(iommu->copied_tables);
1326		iommu->copied_tables = NULL;
1327	}
1328
1329	/* free context mapping */
1330	free_context_table(iommu);
1331
1332	if (ecap_prs(iommu->ecap))
1333		intel_iommu_finish_prq(iommu);
1334}
1335
1336/*
1337 * Check and return whether first level is used by default for
1338 * DMA translation.
1339 */
1340static bool first_level_by_default(struct intel_iommu *iommu)
1341{
1342	/* Only SL is available in legacy mode */
1343	if (!sm_supported(iommu))
1344		return false;
1345
1346	/* Only one level (either FL or SL) is available, just use it */
1347	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1348		return ecap_flts(iommu->ecap);
1349
1350	return true;
1351}
1352
1353int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1354{
1355	struct iommu_domain_info *info, *curr;
1356	unsigned long ndomains;
1357	int num, ret = -ENOSPC;
1358
1359	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1360		return 0;
1361
1362	info = kzalloc(sizeof(*info), GFP_KERNEL);
1363	if (!info)
1364		return -ENOMEM;
1365
1366	spin_lock(&iommu->lock);
1367	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1368	if (curr) {
1369		curr->refcnt++;
1370		spin_unlock(&iommu->lock);
1371		kfree(info);
1372		return 0;
1373	}
1374
1375	ndomains = cap_ndoms(iommu->cap);
1376	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1377	if (num >= ndomains) {
1378		pr_err("%s: No free domain ids\n", iommu->name);
1379		goto err_unlock;
1380	}
1381
1382	set_bit(num, iommu->domain_ids);
1383	info->refcnt	= 1;
1384	info->did	= num;
1385	info->iommu	= iommu;
1386	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1387			  NULL, info, GFP_ATOMIC);
1388	if (curr) {
1389		ret = xa_err(curr) ? : -EBUSY;
1390		goto err_clear;
1391	}
1392
1393	spin_unlock(&iommu->lock);
1394	return 0;
1395
1396err_clear:
1397	clear_bit(info->did, iommu->domain_ids);
1398err_unlock:
1399	spin_unlock(&iommu->lock);
1400	kfree(info);
1401	return ret;
1402}
1403
1404void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1405{
1406	struct iommu_domain_info *info;
1407
1408	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1409		return;
1410
1411	spin_lock(&iommu->lock);
1412	info = xa_load(&domain->iommu_array, iommu->seq_id);
1413	if (--info->refcnt == 0) {
1414		clear_bit(info->did, iommu->domain_ids);
1415		xa_erase(&domain->iommu_array, iommu->seq_id);
1416		domain->nid = NUMA_NO_NODE;
1417		kfree(info);
1418	}
1419	spin_unlock(&iommu->lock);
1420}
1421
1422static void domain_exit(struct dmar_domain *domain)
1423{
1424	if (domain->pgd) {
1425		LIST_HEAD(freelist);
1426
1427		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1428		iommu_put_pages_list(&freelist);
1429	}
1430
1431	if (WARN_ON(!list_empty(&domain->devices)))
1432		return;
1433
1434	kfree(domain->qi_batch);
1435	kfree(domain);
1436}
1437
1438/*
1439 * For kdump cases, old valid entries may be cached due to the
1440 * in-flight DMA and copied pgtable, but there is no unmapping
1441 * behaviour for them, thus we need an explicit cache flush for
1442 * the newly-mapped device. For kdump, at this point, the device
1443 * is expected to have finished its reset by its driver probe stage, so
1444 * no in-flight DMA will exist and there is nothing further to worry
1445 * about here.
1446 */
1447static void copied_context_tear_down(struct intel_iommu *iommu,
1448				     struct context_entry *context,
1449				     u8 bus, u8 devfn)
1450{
1451	u16 did_old;
1452
1453	if (!context_copied(iommu, bus, devfn))
1454		return;
1455
1456	assert_spin_locked(&iommu->lock);
1457
1458	did_old = context_domain_id(context);
1459	context_clear_entry(context);
1460
1461	if (did_old < cap_ndoms(iommu->cap)) {
1462		iommu->flush.flush_context(iommu, did_old,
1463					   PCI_DEVID(bus, devfn),
1464					   DMA_CCMD_MASK_NOBIT,
1465					   DMA_CCMD_DEVICE_INVL);
1466		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1467					 DMA_TLB_DSI_FLUSH);
1468	}
1469
1470	clear_context_copied(iommu, bus, devfn);
1471}
1472
1473/*
1474 * It's a non-present to present mapping. If hardware doesn't cache
1475 * non-present entries we only need to flush the write-buffer. If it
1476 * _does_ cache non-present entries, then it does so in the special
1477 * domain #0, which we have to flush:
1478 */
1479static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1480					u8 bus, u8 devfn)
1481{
1482	if (cap_caching_mode(iommu->cap)) {
1483		iommu->flush.flush_context(iommu, 0,
1484					   PCI_DEVID(bus, devfn),
1485					   DMA_CCMD_MASK_NOBIT,
1486					   DMA_CCMD_DEVICE_INVL);
1487		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1488	} else {
1489		iommu_flush_write_buffer(iommu);
1490	}
1491}
1492
1493static int domain_context_mapping_one(struct dmar_domain *domain,
1494				      struct intel_iommu *iommu,
1495				      u8 bus, u8 devfn)
1496{
1497	struct device_domain_info *info =
1498			domain_lookup_dev_info(domain, iommu, bus, devfn);
1499	u16 did = domain_id_iommu(domain, iommu);
1500	int translation = CONTEXT_TT_MULTI_LEVEL;
1501	struct dma_pte *pgd = domain->pgd;
1502	struct context_entry *context;
1503	int ret;
1504
1505	pr_debug("Set context mapping for %02x:%02x.%d\n",
1506		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1507
1508	spin_lock(&iommu->lock);
1509	ret = -ENOMEM;
1510	context = iommu_context_addr(iommu, bus, devfn, 1);
1511	if (!context)
1512		goto out_unlock;
1513
1514	ret = 0;
1515	if (context_present(context) && !context_copied(iommu, bus, devfn))
1516		goto out_unlock;
1517
1518	copied_context_tear_down(iommu, context, bus, devfn);
1519	context_clear_entry(context);
1520	context_set_domain_id(context, did);
1521
1522	if (info && info->ats_supported)
1523		translation = CONTEXT_TT_DEV_IOTLB;
1524	else
1525		translation = CONTEXT_TT_MULTI_LEVEL;
1526
1527	context_set_address_root(context, virt_to_phys(pgd));
1528	context_set_address_width(context, domain->agaw);
1529	context_set_translation_type(context, translation);
1530	context_set_fault_enable(context);
1531	context_set_present(context);
1532	if (!ecap_coherent(iommu->ecap))
1533		clflush_cache_range(context, sizeof(*context));
1534	context_present_cache_flush(iommu, did, bus, devfn);
1535	ret = 0;
1536
1537out_unlock:
1538	spin_unlock(&iommu->lock);
1539
1540	return ret;
1541}
1542
1543static int domain_context_mapping_cb(struct pci_dev *pdev,
1544				     u16 alias, void *opaque)
1545{
1546	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1547	struct intel_iommu *iommu = info->iommu;
1548	struct dmar_domain *domain = opaque;
1549
1550	return domain_context_mapping_one(domain, iommu,
1551					  PCI_BUS_NUM(alias), alias & 0xff);
1552}
1553
1554static int
1555domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1556{
1557	struct device_domain_info *info = dev_iommu_priv_get(dev);
1558	struct intel_iommu *iommu = info->iommu;
1559	u8 bus = info->bus, devfn = info->devfn;
1560
1561	if (!dev_is_pci(dev))
1562		return domain_context_mapping_one(domain, iommu, bus, devfn);
1563
1564	return pci_for_each_dma_alias(to_pci_dev(dev),
1565				      domain_context_mapping_cb, domain);
1566}
1567
1568/* Return largest possible superpage level for a given mapping */
1569static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1570				   unsigned long phy_pfn, unsigned long pages)
1571{
1572	int support, level = 1;
1573	unsigned long pfnmerge;
1574
1575	support = domain->iommu_superpage;
1576
1577	/* To use a large page, the virtual *and* physical addresses
1578	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1579	   of them will mean we have to use smaller pages. So just
1580	   merge them and check both at once. */
1581	pfnmerge = iov_pfn | phy_pfn;
1582
1583	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1584		pages >>= VTD_STRIDE_SHIFT;
1585		if (!pages)
1586			break;
1587		pfnmerge >>= VTD_STRIDE_SHIFT;
1588		level++;
1589		support--;
1590	}
1591	return level;
1592}
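/*
 * Worked example (illustrative; assumes VTD_STRIDE_SHIFT == 9, i.e. 512
 * entries per table level): with iov_pfn and phy_pfn both 2MiB-aligned
 * but not 1GiB-aligned (low 9 bits of the merged pfn clear) and at least
 * 512 pages to map, the loop advances one step and returns level 2, i.e.
 * a 2MiB superpage, provided domain->iommu_superpage is at least 1.
 */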
1593
1594/*
1595 * Ensure that old small page tables are removed to make room for superpage(s).
1596 * We're going to add new large pages, so make sure we don't remove their parent
1597 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1598 */
1599static void switch_to_super_page(struct dmar_domain *domain,
1600				 unsigned long start_pfn,
1601				 unsigned long end_pfn, int level)
1602{
1603	unsigned long lvl_pages = lvl_to_nr_pages(level);
1604	struct dma_pte *pte = NULL;
1605
1606	while (start_pfn <= end_pfn) {
1607		if (!pte)
1608			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1609					     GFP_ATOMIC);
1610
1611		if (dma_pte_present(pte)) {
1612			dma_pte_free_pagetable(domain, start_pfn,
1613					       start_pfn + lvl_pages - 1,
1614					       level + 1);
1615
1616			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1617					      end_pfn << VTD_PAGE_SHIFT, 0);
1618		}
1619
1620		pte++;
1621		start_pfn += lvl_pages;
1622		if (first_pte_in_page(pte))
1623			pte = NULL;
1624	}
1625}
1626
1627static int
1628__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1629		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1630		 gfp_t gfp)
1631{
1632	struct dma_pte *first_pte = NULL, *pte = NULL;
1633	unsigned int largepage_lvl = 0;
1634	unsigned long lvl_pages = 0;
1635	phys_addr_t pteval;
1636	u64 attr;
1637
1638	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1639		return -EINVAL;
1640
1641	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1642		return -EINVAL;
1643
1644	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1645		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1646		return -EINVAL;
1647	}
1648
1649	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1650	attr |= DMA_FL_PTE_PRESENT;
1651	if (domain->use_first_level) {
1652		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1653		if (prot & DMA_PTE_WRITE)
1654			attr |= DMA_FL_PTE_DIRTY;
1655	}
1656
1657	domain->has_mappings = true;
1658
1659	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1660
1661	while (nr_pages > 0) {
1662		uint64_t tmp;
1663
1664		if (!pte) {
1665			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1666					phys_pfn, nr_pages);
1667
1668			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1669					     gfp);
1670			if (!pte)
1671				return -ENOMEM;
1672			first_pte = pte;
1673
1674			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1675
1676			/* It is a large page */
1677			if (largepage_lvl > 1) {
1678				unsigned long end_pfn;
1679				unsigned long pages_to_remove;
1680
1681				pteval |= DMA_PTE_LARGE_PAGE;
1682				pages_to_remove = min_t(unsigned long, nr_pages,
1683							nr_pte_to_next_page(pte) * lvl_pages);
1684				end_pfn = iov_pfn + pages_to_remove - 1;
1685				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1686			} else {
1687				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1688			}
1689
1690		}
1691		/* We don't need lock here, nobody else
1692		 * touches the iova range
1693		 */
1694		tmp = 0ULL;
1695		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1696			static int dumps = 5;
1697			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1698				iov_pfn, tmp, (unsigned long long)pteval);
1699			if (dumps) {
1700				dumps--;
1701				debug_dma_dump_mappings(NULL);
1702			}
1703			WARN_ON(1);
1704		}
1705
1706		nr_pages -= lvl_pages;
1707		iov_pfn += lvl_pages;
1708		phys_pfn += lvl_pages;
1709		pteval += lvl_pages * VTD_PAGE_SIZE;
1710
1711		/* If the next PTE would be the first in a new page, then we
1712		 * need to flush the cache on the entries we've just written.
1713		 * And then we'll need to recalculate 'pte', so clear it and
1714		 * let it get set again in the if (!pte) block above.
1715		 *
1716		 * If we're done (!nr_pages) we need to flush the cache too.
1717		 *
1718		 * Also if we've been setting superpages, we may need to
1719		 * recalculate 'pte' and switch back to smaller pages for the
1720		 * end of the mapping, if the trailing size is not enough to
1721		 * use another superpage (i.e. nr_pages < lvl_pages).
1722		 */
1723		pte++;
1724		if (!nr_pages || first_pte_in_page(pte) ||
1725		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1726			domain_flush_cache(domain, first_pte,
1727					   (void *)pte - (void *)first_pte);
1728			pte = NULL;
1729		}
1730	}
1731
1732	return 0;
1733}
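/*
 * Usage sketch (hypothetical call, not from this file): mapping 512 pages
 * with iov_pfn and phys_pfn both 2MiB-aligned could look like
 *
 *	ret = __domain_mapping(domain, iov_pfn, phys_pfn, 512,
 *			       DMA_PTE_READ | DMA_PTE_WRITE, GFP_KERNEL);
 *
 * and, when domain->iommu_superpage allows it, would be backed by a single
 * 2MiB superpage PTE rather than 512 individual 4KiB PTEs.
 */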
1734
1735static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1736{
1737	struct intel_iommu *iommu = info->iommu;
1738	struct context_entry *context;
1739	u16 did;
1740
1741	spin_lock(&iommu->lock);
1742	context = iommu_context_addr(iommu, bus, devfn, 0);
1743	if (!context) {
1744		spin_unlock(&iommu->lock);
1745		return;
1746	}
1747
1748	did = context_domain_id(context);
1749	context_clear_entry(context);
1750	__iommu_flush_cache(iommu, context, sizeof(*context));
1751	spin_unlock(&iommu->lock);
1752	intel_context_flush_present(info, context, did, true);
1753}
1754
1755int __domain_setup_first_level(struct intel_iommu *iommu,
1756			       struct device *dev, ioasid_t pasid,
1757			       u16 did, pgd_t *pgd, int flags,
1758			       struct iommu_domain *old)
1759{
1760	if (!old)
1761		return intel_pasid_setup_first_level(iommu, dev, pgd,
1762						     pasid, did, flags);
1763	return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1764					       iommu_domain_did(old, iommu),
1765					       flags);
1766}
1767
1768static int domain_setup_second_level(struct intel_iommu *iommu,
1769				     struct dmar_domain *domain,
1770				     struct device *dev, ioasid_t pasid,
1771				     struct iommu_domain *old)
1772{
1773	if (!old)
1774		return intel_pasid_setup_second_level(iommu, domain,
1775						      dev, pasid);
1776	return intel_pasid_replace_second_level(iommu, domain, dev,
1777						iommu_domain_did(old, iommu),
1778						pasid);
1779}
1780
1781static int domain_setup_passthrough(struct intel_iommu *iommu,
1782				    struct device *dev, ioasid_t pasid,
1783				    struct iommu_domain *old)
1784{
1785	if (!old)
1786		return intel_pasid_setup_pass_through(iommu, dev, pasid);
1787	return intel_pasid_replace_pass_through(iommu, dev,
1788						iommu_domain_did(old, iommu),
1789						pasid);
1790}
1791
1792static int domain_setup_first_level(struct intel_iommu *iommu,
1793				    struct dmar_domain *domain,
1794				    struct device *dev,
1795				    u32 pasid, struct iommu_domain *old)
1796{
1797	struct dma_pte *pgd = domain->pgd;
1798	int level, flags = 0;
1799
1800	level = agaw_to_level(domain->agaw);
1801	if (level != 4 && level != 5)
1802		return -EINVAL;
1803
1804	if (level == 5)
1805		flags |= PASID_FLAG_FL5LP;
1806
1807	if (domain->force_snooping)
1808		flags |= PASID_FLAG_PAGE_SNOOP;
1809
1810	return __domain_setup_first_level(iommu, dev, pasid,
1811					  domain_id_iommu(domain, iommu),
1812					  (pgd_t *)pgd, flags, old);
1813}
1814
1815static int dmar_domain_attach_device(struct dmar_domain *domain,
1816				     struct device *dev)
1817{
1818	struct device_domain_info *info = dev_iommu_priv_get(dev);
1819	struct intel_iommu *iommu = info->iommu;
1820	unsigned long flags;
1821	int ret;
1822
1823	ret = domain_attach_iommu(domain, iommu);
1824	if (ret)
1825		return ret;
1826
1827	info->domain = domain;
1828	spin_lock_irqsave(&domain->lock, flags);
1829	list_add(&info->link, &domain->devices);
1830	spin_unlock_irqrestore(&domain->lock, flags);
1831
1832	if (dev_is_real_dma_subdevice(dev))
1833		return 0;
1834
1835	if (!sm_supported(iommu))
1836		ret = domain_context_mapping(domain, dev);
1837	else if (domain->use_first_level)
1838		ret = domain_setup_first_level(iommu, domain, dev,
1839					       IOMMU_NO_PASID, NULL);
1840	else
1841		ret = domain_setup_second_level(iommu, domain, dev,
1842						IOMMU_NO_PASID, NULL);
1843
1844	if (ret)
1845		goto out_block_translation;
1846
1847	iommu_enable_pci_caps(info);
1848
1849	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1850	if (ret)
1851		goto out_block_translation;
1852
1853	return 0;
1854
1855out_block_translation:
1856	device_block_translation(dev);
1857	return ret;
1858}
1859
1860/**
1861 * device_rmrr_is_relaxable - Test whether the RMRR of this device
1862 * is relaxable (i.e., is allowed to be not enforced under some conditions)
1863 * @dev: device handle
1864 *
1865 * We assume that PCI USB devices with RMRRs have them largely
1866 * for historical reasons and that the RMRR space is not actively used post
1867 * boot.  This exclusion may change if vendors begin to abuse it.
1868 *
1869 * The same exception is made for graphics devices, with the requirement that
1870 * any use of the RMRR regions will be torn down before assigning the device
1871 * to a guest.
1872 *
1873 * Return: true if the RMRR is relaxable, false otherwise
1874 */
1875static bool device_rmrr_is_relaxable(struct device *dev)
1876{
1877	struct pci_dev *pdev;
1878
1879	if (!dev_is_pci(dev))
1880		return false;
1881
1882	pdev = to_pci_dev(dev);
1883	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1884		return true;
1885	else
1886		return false;
1887}
1888
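/*
 * Return the default domain type required for @dev, or 0 if the device
 * has no specific requirement.
 */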
1889static int device_def_domain_type(struct device *dev)
1890{
1891	struct device_domain_info *info = dev_iommu_priv_get(dev);
1892	struct intel_iommu *iommu = info->iommu;
1893
1894	/*
1895	 * Hardware does not support the passthrough translation mode.
1896	 * Always use a dynamic mapping domain.
1897	 */
1898	if (!ecap_pass_through(iommu->ecap))
1899		return IOMMU_DOMAIN_DMA;
1900
1901	if (dev_is_pci(dev)) {
1902		struct pci_dev *pdev = to_pci_dev(dev);
1903
1904		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1905			return IOMMU_DOMAIN_IDENTITY;
1906	}
1907
1908	return 0;
1909}
1910
1911static void intel_iommu_init_qi(struct intel_iommu *iommu)
1912{
1913	/*
1914	 * Start from a sane IOMMU hardware state.
1915	 * If queued invalidation was already initialized by us (for
1916	 * example, while enabling interrupt remapping), then things
1917	 * are already rolling from a sane state.
1918	 */
1919	if (!iommu->qi) {
1920		/*
1921		 * Clear any previous faults.
1922		 */
1923		dmar_fault(-1, iommu);
1924		/*
1925		 * Disable queued invalidation if supported and already enabled
1926		 * before OS handover.
1927		 */
1928		dmar_disable_qi(iommu);
1929	}
1930
1931	if (dmar_enable_qi(iommu)) {
1932		/*
1933		 * Queued Invalidate not enabled, use Register Based Invalidate
1934		 */
1935		iommu->flush.flush_context = __iommu_flush_context;
1936		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1937		pr_info("%s: Using Register based invalidation\n",
1938			iommu->name);
1939	} else {
1940		iommu->flush.flush_context = qi_flush_context;
1941		iommu->flush.flush_iotlb = qi_flush_iotlb;
1942		pr_info("%s: Using Queued invalidation\n", iommu->name);
1943	}
1944}
1945
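/*
 * Copy the context table(s) for one bus from the tables left by the
 * previous kernel, reserving the domain IDs found there and marking
 * the copied entries so they can be recognized later.
 */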
1946static int copy_context_table(struct intel_iommu *iommu,
1947			      struct root_entry *old_re,
1948			      struct context_entry **tbl,
1949			      int bus, bool ext)
1950{
1951	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1952	struct context_entry *new_ce = NULL, ce;
1953	struct context_entry *old_ce = NULL;
1954	struct root_entry re;
1955	phys_addr_t old_ce_phys;
1956
1957	tbl_idx = ext ? bus * 2 : bus;
1958	memcpy(&re, old_re, sizeof(re));
1959
1960	for (devfn = 0; devfn < 256; devfn++) {
1961		/* First calculate the correct index */
1962		idx = (ext ? devfn * 2 : devfn) % 256;
1963
1964		if (idx == 0) {
1965			/* First save what we may have and clean up */
1966			if (new_ce) {
1967				tbl[tbl_idx] = new_ce;
1968				__iommu_flush_cache(iommu, new_ce,
1969						    VTD_PAGE_SIZE);
1970				pos = 1;
1971			}
1972
1973			if (old_ce)
1974				memunmap(old_ce);
1975
1976			ret = 0;
1977			if (devfn < 0x80)
1978				old_ce_phys = root_entry_lctp(&re);
1979			else
1980				old_ce_phys = root_entry_uctp(&re);
1981
1982			if (!old_ce_phys) {
1983				if (ext && devfn == 0) {
1984					/* No LCTP, try UCTP */
1985					devfn = 0x7f;
1986					continue;
1987				} else {
1988					goto out;
1989				}
1990			}
1991
1992			ret = -ENOMEM;
1993			old_ce = memremap(old_ce_phys, PAGE_SIZE,
1994					MEMREMAP_WB);
1995			if (!old_ce)
1996				goto out;
1997
1998			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
1999			if (!new_ce)
2000				goto out_unmap;
2001
2002			ret = 0;
2003		}
2004
2005		/* Now copy the context entry */
2006		memcpy(&ce, old_ce + idx, sizeof(ce));
2007
2008		if (!context_present(&ce))
2009			continue;
2010
2011		did = context_domain_id(&ce);
2012		if (did >= 0 && did < cap_ndoms(iommu->cap))
2013			set_bit(did, iommu->domain_ids);
2014
2015		set_context_copied(iommu, bus, devfn);
2016		new_ce[idx] = ce;
2017	}
2018
2019	tbl[tbl_idx + pos] = new_ce;
2020
2021	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2022
2023out_unmap:
2024	memunmap(old_ce);
2025
2026out:
2027	return ret;
2028}
2029
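/*
 * Copy the root and context tables inherited from the previous kernel
 * (kdump handover) so that in-flight DMA keeps working while the new
 * tables are being set up.
 */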
2030static int copy_translation_tables(struct intel_iommu *iommu)
2031{
2032	struct context_entry **ctxt_tbls;
2033	struct root_entry *old_rt;
2034	phys_addr_t old_rt_phys;
2035	int ctxt_table_entries;
2036	u64 rtaddr_reg;
2037	int bus, ret;
2038	bool new_ext, ext;
2039
2040	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2041	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2042	new_ext    = !!sm_supported(iommu);
2043
2044	/*
2045	 * The RTT bit can only be changed when translation is disabled,
2046	 * but disabling translation would open a window for data
2047	 * corruption. So bail out and don't copy anything if we would
2048	 * have to change the bit.
2049	 */
2050	if (new_ext != ext)
2051		return -EINVAL;
2052
2053	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2054	if (!iommu->copied_tables)
2055		return -ENOMEM;
2056
2057	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2058	if (!old_rt_phys)
2059		return -EINVAL;
2060
2061	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2062	if (!old_rt)
2063		return -ENOMEM;
2064
2065	/* This is too big for the stack - allocate it from slab */
2066	ctxt_table_entries = ext ? 512 : 256;
2067	ret = -ENOMEM;
2068	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2069	if (!ctxt_tbls)
2070		goto out_unmap;
2071
2072	for (bus = 0; bus < 256; bus++) {
2073		ret = copy_context_table(iommu, &old_rt[bus],
2074					 ctxt_tbls, bus, ext);
2075		if (ret) {
2076			pr_err("%s: Failed to copy context table for bus %d\n",
2077				iommu->name, bus);
2078			continue;
2079		}
2080	}
2081
2082	spin_lock(&iommu->lock);
2083
2084	/* Context tables are copied, now write them to the root_entry table */
2085	for (bus = 0; bus < 256; bus++) {
2086		int idx = ext ? bus * 2 : bus;
2087		u64 val;
2088
2089		if (ctxt_tbls[idx]) {
2090			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2091			iommu->root_entry[bus].lo = val;
2092		}
2093
2094		if (!ext || !ctxt_tbls[idx + 1])
2095			continue;
2096
2097		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2098		iommu->root_entry[bus].hi = val;
2099	}
2100
2101	spin_unlock(&iommu->lock);
2102
2103	kfree(ctxt_tbls);
2104
2105	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2106
2107	ret = 0;
2108
2109out_unmap:
2110	memunmap(old_rt);
2111
2112	return ret;
2113}
2114
2115static int __init init_dmars(void)
2116{
2117	struct dmar_drhd_unit *drhd;
2118	struct intel_iommu *iommu;
2119	int ret;
2120
2121	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2122	if (ret)
2123		goto free_iommu;
2124
2125	for_each_iommu(iommu, drhd) {
2126		if (drhd->ignored) {
2127			iommu_disable_translation(iommu);
2128			continue;
2129		}
2130
2131		/*
2132		 * Find the max PASID size of all IOMMUs in the system.
2133		 * We need to ensure the system PASID table is no bigger
2134		 * than the smallest supported.
2135		 */
2136		if (pasid_supported(iommu)) {
2137			u32 temp = 2 << ecap_pss(iommu->ecap);
2138
2139			intel_pasid_max_id = min_t(u32, temp,
2140						   intel_pasid_max_id);
2141		}
2142
2143		intel_iommu_init_qi(iommu);
2144
2145		ret = iommu_init_domains(iommu);
2146		if (ret)
2147			goto free_iommu;
2148
2149		init_translation_status(iommu);
2150
2151		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2152			iommu_disable_translation(iommu);
2153			clear_translation_pre_enabled(iommu);
2154			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2155				iommu->name);
2156		}
2157
2158		/*
2159		 * TBD:
2160		 * we could share the same root & context tables
2161		 * among all IOMMUs. This needs to be split out later.
2162		 */
2163		ret = iommu_alloc_root_entry(iommu);
2164		if (ret)
2165			goto free_iommu;
2166
2167		if (translation_pre_enabled(iommu)) {
2168			pr_info("Translation already enabled - trying to copy translation structures\n");
2169
2170			ret = copy_translation_tables(iommu);
2171			if (ret) {
2172				/*
2173				 * We found the IOMMU with translation
2174				 * enabled - but failed to copy over the
2175				 * old root-entry table. Try to proceed
2176				 * by disabling translation now and
2177				 * allocating a clean root-entry table.
2178				 * This might cause DMAR faults, but
2179				 * probably the dump will still succeed.
2180				 */
2181				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2182				       iommu->name);
2183				iommu_disable_translation(iommu);
2184				clear_translation_pre_enabled(iommu);
2185			} else {
2186				pr_info("Copied translation tables from previous kernel for %s\n",
2187					iommu->name);
2188			}
2189		}
2190
2191		intel_svm_check(iommu);
2192	}
2193
2194	/*
2195	 * Now that qi is enabled on all iommus, set the root entry and flush
2196	 * caches. This is required on some Intel X58 chipsets, otherwise the
2197	 * flush_context function will loop forever and the boot hangs.
2198	 */
2199	for_each_active_iommu(iommu, drhd) {
2200		iommu_flush_write_buffer(iommu);
2201		iommu_set_root_entry(iommu);
2202	}
2203
2204	check_tylersburg_isoch();
2205
2206	/*
2207	 * for each drhd
2208	 *   enable fault log
2209	 *   global invalidate context cache
2210	 *   global invalidate iotlb
2211	 *   enable translation
2212	 */
2213	for_each_iommu(iommu, drhd) {
2214		if (drhd->ignored) {
2215			/*
2216			 * we always have to disable PMRs or DMA may fail on
2217			 * this device
2218			 */
2219			if (force_on)
2220				iommu_disable_protect_mem_regions(iommu);
2221			continue;
2222		}
2223
2224		iommu_flush_write_buffer(iommu);
2225
2226		if (ecap_prs(iommu->ecap)) {
2227			/*
2228			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2229			 * could cause a possible lock race condition, so drop the lock.
2230			 */
2231			up_write(&dmar_global_lock);
2232			ret = intel_iommu_enable_prq(iommu);
2233			down_write(&dmar_global_lock);
2234			if (ret)
2235				goto free_iommu;
2236		}
2237
2238		ret = dmar_set_interrupt(iommu);
2239		if (ret)
2240			goto free_iommu;
2241	}
2242
2243	return 0;
2244
2245free_iommu:
2246	for_each_active_iommu(iommu, drhd) {
2247		disable_dmar_iommu(iommu);
2248		free_dmar_iommu(iommu);
2249	}
2250
2251	return ret;
2252}
2253
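/*
 * Mark DRHD units that cover no devices as ignored, and flag units that
 * cover only graphics devices so they can be bypassed when the integrated
 * graphics IOMMU is disabled.
 */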
2254static void __init init_no_remapping_devices(void)
2255{
2256	struct dmar_drhd_unit *drhd;
2257	struct device *dev;
2258	int i;
2259
2260	for_each_drhd_unit(drhd) {
2261		if (!drhd->include_all) {
2262			for_each_active_dev_scope(drhd->devices,
2263						  drhd->devices_cnt, i, dev)
2264				break;
2265			/* ignore DMAR unit if no devices exist */
2266			if (i == drhd->devices_cnt)
2267				drhd->ignored = 1;
2268		}
2269	}
2270
2271	for_each_active_drhd_unit(drhd) {
2272		if (drhd->include_all)
2273			continue;
2274
2275		for_each_active_dev_scope(drhd->devices,
2276					  drhd->devices_cnt, i, dev)
2277			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2278				break;
2279		if (i < drhd->devices_cnt)
2280			continue;
2281
2282		/* This IOMMU has *only* gfx devices. Either bypass it or
2283		   mark it as dedicated to graphics, as appropriate. */
2284		drhd->gfx_dedicated = 1;
2285		if (disable_igfx_iommu)
2286			drhd->ignored = 1;
2287	}
2288}
2289
2290#ifdef CONFIG_SUSPEND
2291static int init_iommu_hw(void)
2292{
2293	struct dmar_drhd_unit *drhd;
2294	struct intel_iommu *iommu = NULL;
2295	int ret;
2296
2297	for_each_active_iommu(iommu, drhd) {
2298		if (iommu->qi) {
2299			ret = dmar_reenable_qi(iommu);
2300			if (ret)
2301				return ret;
2302		}
2303	}
2304
2305	for_each_iommu(iommu, drhd) {
2306		if (drhd->ignored) {
2307			/*
2308			 * we always have to disable PMRs or DMA may fail on
2309			 * this device
2310			 */
2311			if (force_on)
2312				iommu_disable_protect_mem_regions(iommu);
2313			continue;
2314		}
2315
2316		iommu_flush_write_buffer(iommu);
2317		iommu_set_root_entry(iommu);
2318		iommu_enable_translation(iommu);
2319		iommu_disable_protect_mem_regions(iommu);
2320	}
2321
2322	return 0;
2323}
2324
2325static void iommu_flush_all(void)
2326{
2327	struct dmar_drhd_unit *drhd;
2328	struct intel_iommu *iommu;
2329
2330	for_each_active_iommu(iommu, drhd) {
2331		iommu->flush.flush_context(iommu, 0, 0, 0,
2332					   DMA_CCMD_GLOBAL_INVL);
2333		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2334					 DMA_TLB_GLOBAL_FLUSH);
2335	}
2336}
2337
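/*
 * Flush all caches, disable translation and save the fault-event
 * registers of each active IOMMU before entering suspend.
 */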
2338static int iommu_suspend(void)
2339{
2340	struct dmar_drhd_unit *drhd;
2341	struct intel_iommu *iommu = NULL;
2342	unsigned long flag;
2343
2344	iommu_flush_all();
2345
2346	for_each_active_iommu(iommu, drhd) {
2347		iommu_disable_translation(iommu);
2348
2349		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2350
2351		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2352			readl(iommu->reg + DMAR_FECTL_REG);
2353		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2354			readl(iommu->reg + DMAR_FEDATA_REG);
2355		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2356			readl(iommu->reg + DMAR_FEADDR_REG);
2357		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2358			readl(iommu->reg + DMAR_FEUADDR_REG);
2359
2360		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2361	}
2362	return 0;
2363}
2364
2365static void iommu_resume(void)
2366{
2367	struct dmar_drhd_unit *drhd;
2368	struct intel_iommu *iommu = NULL;
2369	unsigned long flag;
2370
2371	if (init_iommu_hw()) {
2372		if (force_on)
2373			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2374		else
2375			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2376		return;
2377	}
2378
2379	for_each_active_iommu(iommu, drhd) {
2380
2381		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2382
2383		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2384			iommu->reg + DMAR_FECTL_REG);
2385		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2386			iommu->reg + DMAR_FEDATA_REG);
2387		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2388			iommu->reg + DMAR_FEADDR_REG);
2389		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2390			iommu->reg + DMAR_FEUADDR_REG);
2391
2392		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2393	}
2394}
2395
2396static struct syscore_ops iommu_syscore_ops = {
2397	.resume		= iommu_resume,
2398	.suspend	= iommu_suspend,
2399};
2400
2401static void __init init_iommu_pm_ops(void)
2402{
2403	register_syscore_ops(&iommu_syscore_ops);
2404}
2405
2406#else
2407static inline void init_iommu_pm_ops(void) {}
2408#endif	/* CONFIG_SUSPEND */
2409
2410static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2411{
2412	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2413	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2414	    rmrr->end_address <= rmrr->base_address ||
2415	    arch_rmrr_sanity_check(rmrr))
2416		return -EINVAL;
2417
2418	return 0;
2419}
2420
2421int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2422{
2423	struct acpi_dmar_reserved_memory *rmrr;
2424	struct dmar_rmrr_unit *rmrru;
2425
2426	rmrr = (struct acpi_dmar_reserved_memory *)header;
2427	if (rmrr_sanity_check(rmrr)) {
2428		pr_warn(FW_BUG
2429			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2430			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2431			   rmrr->base_address, rmrr->end_address,
2432			   dmi_get_system_info(DMI_BIOS_VENDOR),
2433			   dmi_get_system_info(DMI_BIOS_VERSION),
2434			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2435		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2436	}
2437
2438	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2439	if (!rmrru)
2440		goto out;
2441
2442	rmrru->hdr = header;
2443
2444	rmrru->base_address = rmrr->base_address;
2445	rmrru->end_address = rmrr->end_address;
2446
2447	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2448				((void *)rmrr) + rmrr->header.length,
2449				&rmrru->devices_cnt);
2450	if (rmrru->devices_cnt && rmrru->devices == NULL)
2451		goto free_rmrru;
2452
2453	list_add(&rmrru->list, &dmar_rmrr_units);
2454
2455	return 0;
2456free_rmrru:
2457	kfree(rmrru);
2458out:
2459	return -ENOMEM;
2460}
2461
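/* Look up a previously registered ATSR unit matching @atsr, if any. */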
2462static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2463{
2464	struct dmar_atsr_unit *atsru;
2465	struct acpi_dmar_atsr *tmp;
2466
2467	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2468				dmar_rcu_check()) {
2469		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2470		if (atsr->segment != tmp->segment)
2471			continue;
2472		if (atsr->header.length != tmp->header.length)
2473			continue;
2474		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2475			return atsru;
2476	}
2477
2478	return NULL;
2479}
2480
2481int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2482{
2483	struct acpi_dmar_atsr *atsr;
2484	struct dmar_atsr_unit *atsru;
2485
2486	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2487		return 0;
2488
2489	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2490	atsru = dmar_find_atsr(atsr);
2491	if (atsru)
2492		return 0;
2493
2494	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2495	if (!atsru)
2496		return -ENOMEM;
2497
2498	/*
2499	 * If memory is allocated from slab by ACPI _DSM method, we need to
2500	 * copy the memory content because the memory buffer will be freed
2501	 * on return.
2502	 */
2503	atsru->hdr = (void *)(atsru + 1);
2504	memcpy(atsru->hdr, hdr, hdr->length);
2505	atsru->include_all = atsr->flags & 0x1;
2506	if (!atsru->include_all) {
2507		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2508				(void *)atsr + atsr->header.length,
2509				&atsru->devices_cnt);
2510		if (atsru->devices_cnt && atsru->devices == NULL) {
2511			kfree(atsru);
2512			return -ENOMEM;
2513		}
2514	}
2515
2516	list_add_rcu(&atsru->list, &dmar_atsr_units);
2517
2518	return 0;
2519}
2520
2521static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2522{
2523	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2524	kfree(atsru);
2525}
2526
2527int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2528{
2529	struct acpi_dmar_atsr *atsr;
2530	struct dmar_atsr_unit *atsru;
2531
2532	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2533	atsru = dmar_find_atsr(atsr);
2534	if (atsru) {
2535		list_del_rcu(&atsru->list);
2536		synchronize_rcu();
2537		intel_iommu_free_atsr(atsru);
2538	}
2539
2540	return 0;
2541}
2542
2543int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2544{
2545	int i;
2546	struct device *dev;
2547	struct acpi_dmar_atsr *atsr;
2548	struct dmar_atsr_unit *atsru;
2549
2550	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2551	atsru = dmar_find_atsr(atsr);
2552	if (!atsru)
2553		return 0;
2554
2555	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2556		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2557					  i, dev)
2558			return -EBUSY;
2559	}
2560
2561	return 0;
2562}
2563
2564static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2565{
2566	struct dmar_satc_unit *satcu;
2567	struct acpi_dmar_satc *tmp;
2568
2569	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2570				dmar_rcu_check()) {
2571		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2572		if (satc->segment != tmp->segment)
2573			continue;
2574		if (satc->header.length != tmp->header.length)
2575			continue;
2576		if (memcmp(satc, tmp, satc->header.length) == 0)
2577			return satcu;
2578	}
2579
2580	return NULL;
2581}
2582
2583int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2584{
2585	struct acpi_dmar_satc *satc;
2586	struct dmar_satc_unit *satcu;
2587
2588	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2589		return 0;
2590
2591	satc = container_of(hdr, struct acpi_dmar_satc, header);
2592	satcu = dmar_find_satc(satc);
2593	if (satcu)
2594		return 0;
2595
2596	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2597	if (!satcu)
2598		return -ENOMEM;
2599
2600	satcu->hdr = (void *)(satcu + 1);
2601	memcpy(satcu->hdr, hdr, hdr->length);
2602	satcu->atc_required = satc->flags & 0x1;
2603	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2604					      (void *)satc + satc->header.length,
2605					      &satcu->devices_cnt);
2606	if (satcu->devices_cnt && !satcu->devices) {
2607		kfree(satcu);
2608		return -ENOMEM;
2609	}
2610	list_add_rcu(&satcu->list, &dmar_satc_units);
2611
2612	return 0;
2613}
2614
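/*
 * Bring up a hot-added DMAR unit: allocate its domain bookkeeping and
 * root entry, set up invalidation and fault reporting, then enable
 * translation unless the unit is ignored.
 */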
2615static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2616{
2617	struct intel_iommu *iommu = dmaru->iommu;
2618	int ret;
2619
2620	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2621	if (ret)
2622		goto out;
2623
2624	/*
2625	 * Disable translation if already enabled prior to OS handover.
2626	 */
2627	if (iommu->gcmd & DMA_GCMD_TE)
2628		iommu_disable_translation(iommu);
2629
2630	ret = iommu_init_domains(iommu);
2631	if (ret == 0)
2632		ret = iommu_alloc_root_entry(iommu);
2633	if (ret)
2634		goto out;
2635
2636	intel_svm_check(iommu);
2637
2638	if (dmaru->ignored) {
2639		/*
2640		 * we always have to disable PMRs or DMA may fail on this device
2641		 */
2642		if (force_on)
2643			iommu_disable_protect_mem_regions(iommu);
2644		return 0;
2645	}
2646
2647	intel_iommu_init_qi(iommu);
2648	iommu_flush_write_buffer(iommu);
2649
2650	if (ecap_prs(iommu->ecap)) {
2651		ret = intel_iommu_enable_prq(iommu);
2652		if (ret)
2653			goto disable_iommu;
2654	}
2655
2656	ret = dmar_set_interrupt(iommu);
2657	if (ret)
2658		goto disable_iommu;
2659
2660	iommu_set_root_entry(iommu);
2661	iommu_enable_translation(iommu);
2662
2663	iommu_disable_protect_mem_regions(iommu);
2664	return 0;
2665
2666disable_iommu:
2667	disable_dmar_iommu(iommu);
2668out:
2669	free_dmar_iommu(iommu);
2670	return ret;
2671}
2672
2673int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2674{
2675	int ret = 0;
2676	struct intel_iommu *iommu = dmaru->iommu;
2677
2678	if (!intel_iommu_enabled)
2679		return 0;
2680	if (iommu == NULL)
2681		return -EINVAL;
2682
2683	if (insert) {
2684		ret = intel_iommu_add(dmaru);
2685	} else {
2686		disable_dmar_iommu(iommu);
2687		free_dmar_iommu(iommu);
2688	}
2689
2690	return ret;
2691}
2692
2693static void intel_iommu_free_dmars(void)
2694{
2695	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2696	struct dmar_atsr_unit *atsru, *atsr_n;
2697	struct dmar_satc_unit *satcu, *satc_n;
2698
2699	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2700		list_del(&rmrru->list);
2701		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2702		kfree(rmrru);
2703	}
2704
2705	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2706		list_del(&atsru->list);
2707		intel_iommu_free_atsr(atsru);
2708	}
2709	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2710		list_del(&satcu->list);
2711		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2712		kfree(satcu);
2713	}
2714}
2715
2716static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2717{
2718	struct dmar_satc_unit *satcu;
2719	struct acpi_dmar_satc *satc;
2720	struct device *tmp;
2721	int i;
2722
2723	dev = pci_physfn(dev);
2724	rcu_read_lock();
2725
2726	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2727		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2728		if (satc->segment != pci_domain_nr(dev->bus))
2729			continue;
2730		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2731			if (to_pci_dev(tmp) == dev)
2732				goto out;
2733	}
2734	satcu = NULL;
2735out:
2736	rcu_read_unlock();
2737	return satcu;
2738}
2739
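/*
 * Decide whether ATS may be enabled for @dev under @iommu: use the SATC
 * table if the device is listed there, otherwise walk up to the root
 * port and check it against the ATSR units.
 */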
2740static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2741{
2742	int i, ret = 1;
2743	struct pci_bus *bus;
2744	struct pci_dev *bridge = NULL;
2745	struct device *tmp;
2746	struct acpi_dmar_atsr *atsr;
2747	struct dmar_atsr_unit *atsru;
2748	struct dmar_satc_unit *satcu;
2749
2750	dev = pci_physfn(dev);
2751	satcu = dmar_find_matched_satc_unit(dev);
2752	if (satcu)
2753		/*
2754		 * This device supports ATS as it is in the SATC table.
2755		 * When the IOMMU is in legacy mode, enabling ATS is done
2756		 * automatically by HW for the devices that require it,
2757		 * hence the OS should not enable ATS for this device
2758		 * to avoid duplicated TLB invalidations.
2759		 */
2760		return !(satcu->atc_required && !sm_supported(iommu));
2761
2762	for (bus = dev->bus; bus; bus = bus->parent) {
2763		bridge = bus->self;
2764		/* If it's an integrated device, allow ATS */
2765		if (!bridge)
2766			return 1;
2767		/* Connected via non-PCIe: no ATS */
2768		if (!pci_is_pcie(bridge) ||
2769		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2770			return 0;
2771		/* If we found the root port, look it up in the ATSR */
2772		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2773			break;
2774	}
2775
2776	rcu_read_lock();
2777	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2778		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2779		if (atsr->segment != pci_domain_nr(dev->bus))
2780			continue;
2781
2782		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2783			if (tmp == &bridge->dev)
2784				goto out;
2785
2786		if (atsru->include_all)
2787			goto out;
2788	}
2789	ret = 0;
2790out:
2791	rcu_read_unlock();
2792
2793	return ret;
2794}
2795
2796int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2797{
2798	int ret;
2799	struct dmar_rmrr_unit *rmrru;
2800	struct dmar_atsr_unit *atsru;
2801	struct dmar_satc_unit *satcu;
2802	struct acpi_dmar_atsr *atsr;
2803	struct acpi_dmar_reserved_memory *rmrr;
2804	struct acpi_dmar_satc *satc;
2805
2806	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2807		return 0;
2808
2809	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2810		rmrr = container_of(rmrru->hdr,
2811				    struct acpi_dmar_reserved_memory, header);
2812		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2813			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2814				((void *)rmrr) + rmrr->header.length,
2815				rmrr->segment, rmrru->devices,
2816				rmrru->devices_cnt);
2817			if (ret < 0)
2818				return ret;
2819		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2820			dmar_remove_dev_scope(info, rmrr->segment,
2821				rmrru->devices, rmrru->devices_cnt);
2822		}
2823	}
2824
2825	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2826		if (atsru->include_all)
2827			continue;
2828
2829		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2830		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2831			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2832					(void *)atsr + atsr->header.length,
2833					atsr->segment, atsru->devices,
2834					atsru->devices_cnt);
2835			if (ret > 0)
2836				break;
2837			else if (ret < 0)
2838				return ret;
2839		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2840			if (dmar_remove_dev_scope(info, atsr->segment,
2841					atsru->devices, atsru->devices_cnt))
2842				break;
2843		}
2844	}
2845	list_for_each_entry(satcu, &dmar_satc_units, list) {
2846		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2847		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2848			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2849					(void *)satc + satc->header.length,
2850					satc->segment, satcu->devices,
2851					satcu->devices_cnt);
2852			if (ret > 0)
2853				break;
2854			else if (ret < 0)
2855				return ret;
2856		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2857			if (dmar_remove_dev_scope(info, satc->segment,
2858					satcu->devices, satcu->devices_cnt))
2859				break;
2860		}
2861	}
2862
2863	return 0;
2864}
2865
2866static void intel_disable_iommus(void)
2867{
2868	struct intel_iommu *iommu = NULL;
2869	struct dmar_drhd_unit *drhd;
2870
2871	for_each_iommu(iommu, drhd)
2872		iommu_disable_translation(iommu);
2873}
2874
2875void intel_iommu_shutdown(void)
2876{
2877	struct dmar_drhd_unit *drhd;
2878	struct intel_iommu *iommu = NULL;
2879
2880	if (no_iommu || dmar_disabled)
2881		return;
2882
2883	down_write(&dmar_global_lock);
2884
2885	/* Disable PMRs explicitly here. */
2886	for_each_iommu(iommu, drhd)
2887		iommu_disable_protect_mem_regions(iommu);
2888
2889	/* Make sure the IOMMUs are switched off */
2890	intel_disable_iommus();
2891
2892	up_write(&dmar_global_lock);
2893}
2894
2895static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2896{
2897	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2898
2899	return container_of(iommu_dev, struct intel_iommu, iommu);
2900}
2901
2902static ssize_t version_show(struct device *dev,
2903			    struct device_attribute *attr, char *buf)
2904{
2905	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2906	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2907	return sysfs_emit(buf, "%d:%d\n",
2908			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2909}
2910static DEVICE_ATTR_RO(version);
2911
2912static ssize_t address_show(struct device *dev,
2913			    struct device_attribute *attr, char *buf)
2914{
2915	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2916	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2917}
2918static DEVICE_ATTR_RO(address);
2919
2920static ssize_t cap_show(struct device *dev,
2921			struct device_attribute *attr, char *buf)
2922{
2923	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2924	return sysfs_emit(buf, "%llx\n", iommu->cap);
2925}
2926static DEVICE_ATTR_RO(cap);
2927
2928static ssize_t ecap_show(struct device *dev,
2929			 struct device_attribute *attr, char *buf)
2930{
2931	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2932	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2933}
2934static DEVICE_ATTR_RO(ecap);
2935
2936static ssize_t domains_supported_show(struct device *dev,
2937				      struct device_attribute *attr, char *buf)
2938{
2939	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2940	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2941}
2942static DEVICE_ATTR_RO(domains_supported);
2943
2944static ssize_t domains_used_show(struct device *dev,
2945				 struct device_attribute *attr, char *buf)
2946{
2947	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2948	return sysfs_emit(buf, "%d\n",
2949			  bitmap_weight(iommu->domain_ids,
2950					cap_ndoms(iommu->cap)));
2951}
2952static DEVICE_ATTR_RO(domains_used);
2953
2954static struct attribute *intel_iommu_attrs[] = {
2955	&dev_attr_version.attr,
2956	&dev_attr_address.attr,
2957	&dev_attr_cap.attr,
2958	&dev_attr_ecap.attr,
2959	&dev_attr_domains_supported.attr,
2960	&dev_attr_domains_used.attr,
2961	NULL,
2962};
2963
2964static struct attribute_group intel_iommu_group = {
2965	.name = "intel-iommu",
2966	.attrs = intel_iommu_attrs,
2967};
2968
2969const struct attribute_group *intel_iommu_groups[] = {
2970	&intel_iommu_group,
2971	NULL,
2972};
2973
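/* Return true if any PCI device in the system is marked external-facing. */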
2974static bool has_external_pci(void)
2975{
2976	struct pci_dev *pdev = NULL;
2977
2978	for_each_pci_dev(pdev)
2979		if (pdev->external_facing) {
2980			pci_dev_put(pdev);
2981			return true;
2982		}
2983
2984	return false;
2985}
2986
2987static int __init platform_optin_force_iommu(void)
2988{
2989	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2990		return 0;
2991
2992	if (no_iommu || dmar_disabled)
2993		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2994
2995	/*
2996	 * If Intel-IOMMU is disabled by default, we will apply identity
2997	 * map for all devices except those marked as being untrusted.
2998	 */
2999	if (dmar_disabled)
3000		iommu_set_default_passthrough(false);
3001
3002	dmar_disabled = 0;
3003	no_iommu = 0;
3004
3005	return 1;
3006}
3007
3008static int __init probe_acpi_namespace_devices(void)
3009{
3010	struct dmar_drhd_unit *drhd;
3011	/* To avoid a -Wunused-but-set-variable warning. */
3012	struct intel_iommu *iommu __maybe_unused;
3013	struct device *dev;
3014	int i, ret = 0;
3015
3016	for_each_active_iommu(iommu, drhd) {
3017		for_each_active_dev_scope(drhd->devices,
3018					  drhd->devices_cnt, i, dev) {
3019			struct acpi_device_physical_node *pn;
3020			struct acpi_device *adev;
3021
3022			if (dev->bus != &acpi_bus_type)
3023				continue;
3024
3025			adev = to_acpi_device(dev);
3026			mutex_lock(&adev->physical_node_lock);
3027			list_for_each_entry(pn,
3028					    &adev->physical_node_list, node) {
3029				ret = iommu_probe_device(pn->dev);
3030				if (ret)
3031					break;
3032			}
3033			mutex_unlock(&adev->physical_node_lock);
3034
3035			if (ret)
3036				return ret;
3037		}
3038	}
3039
3040	return 0;
3041}
3042
3043static __init int tboot_force_iommu(void)
3044{
3045	if (!tboot_enabled())
3046		return 0;
3047
3048	if (no_iommu || dmar_disabled)
3049		pr_warn("Forcing Intel-IOMMU to enabled\n");
3050
3051	dmar_disabled = 0;
3052	no_iommu = 0;
3053
3054	return 1;
3055}
3056
3057int __init intel_iommu_init(void)
3058{
3059	int ret = -ENODEV;
3060	struct dmar_drhd_unit *drhd;
3061	struct intel_iommu *iommu;
3062
3063	/*
3064	 * Intel IOMMU is required for a TXT/tboot launch or platform
3065	 * opt in, so enforce that.
3066	 */
3067	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3068		    platform_optin_force_iommu();
3069
3070	down_write(&dmar_global_lock);
3071	if (dmar_table_init()) {
3072		if (force_on)
3073			panic("tboot: Failed to initialize DMAR table\n");
3074		goto out_free_dmar;
3075	}
3076
3077	if (dmar_dev_scope_init() < 0) {
3078		if (force_on)
3079			panic("tboot: Failed to initialize DMAR device scope\n");
3080		goto out_free_dmar;
3081	}
3082
3083	up_write(&dmar_global_lock);
3084
3085	/*
3086	 * The bus notifier takes the dmar_global_lock, so lockdep will
3087	 * complain later when we register it under the lock.
3088	 */
3089	dmar_register_bus_notifier();
3090
3091	down_write(&dmar_global_lock);
3092
3093	if (!no_iommu)
3094		intel_iommu_debugfs_init();
3095
3096	if (no_iommu || dmar_disabled) {
3097		/*
3098		 * We exit the function here to ensure the IOMMU's remapping and
3099		 * mempool aren't set up, which means that the IOMMU's PMRs
3100		 * won't be disabled via the call to init_dmars(). So disable
3101		 * them explicitly here. The PMRs were set up by tboot prior to
3102		 * calling SENTER, but the kernel is expected to reset/tear
3103		 * down the PMRs.
3104		 */
3105		if (intel_iommu_tboot_noforce) {
3106			for_each_iommu(iommu, drhd)
3107				iommu_disable_protect_mem_regions(iommu);
3108		}
3109
3110		/*
3111		 * Make sure the IOMMUs are switched off, even when we
3112		 * boot into a kexec kernel and the previous kernel left
3113		 * them enabled
3114		 */
3115		intel_disable_iommus();
3116		goto out_free_dmar;
3117	}
3118
3119	if (list_empty(&dmar_rmrr_units))
3120		pr_info("No RMRR found\n");
3121
3122	if (list_empty(&dmar_atsr_units))
3123		pr_info("No ATSR found\n");
3124
3125	if (list_empty(&dmar_satc_units))
3126		pr_info("No SATC found\n");
3127
3128	init_no_remapping_devices();
3129
3130	ret = init_dmars();
3131	if (ret) {
3132		if (force_on)
3133			panic("tboot: Failed to initialize DMARs\n");
3134		pr_err("Initialization failed\n");
3135		goto out_free_dmar;
3136	}
3137	up_write(&dmar_global_lock);
3138
3139	init_iommu_pm_ops();
3140
3141	down_read(&dmar_global_lock);
3142	for_each_active_iommu(iommu, drhd) {
3143		/*
3144		 * The flush queue implementation does not perform
3145		 * page-selective invalidations that are required for efficient
3146		 * TLB flushes in virtual environments.  The benefit of batching
3147		 * is likely to be much lower than the overhead of synchronizing
3148		 * the virtual and physical IOMMU page-tables.
3149		 */
3150		if (cap_caching_mode(iommu->cap) &&
3151		    !first_level_by_default(iommu)) {
3152			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3153			iommu_set_dma_strict();
3154		}
3155		iommu_device_sysfs_add(&iommu->iommu, NULL,
3156				       intel_iommu_groups,
3157				       "%s", iommu->name);
3158		/*
3159		 * The iommu device probe is protected by the iommu_probe_device_lock.
3160		 * Release the dmar_global_lock before entering the device probe path
3161		 * to avoid unnecessary lock order splat.
3162		 */
3163		up_read(&dmar_global_lock);
3164		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3165		down_read(&dmar_global_lock);
3166
3167		iommu_pmu_register(iommu);
3168	}
3169
3170	if (probe_acpi_namespace_devices())
3171		pr_warn("ACPI name space devices didn't probe correctly\n");
3172
3173	/* Finally, we enable the DMA remapping hardware. */
3174	for_each_iommu(iommu, drhd) {
3175		if (!drhd->ignored && !translation_pre_enabled(iommu))
3176			iommu_enable_translation(iommu);
3177
3178		iommu_disable_protect_mem_regions(iommu);
3179	}
3180	up_read(&dmar_global_lock);
3181
3182	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3183
3184	intel_iommu_enabled = 1;
3185
3186	return 0;
3187
3188out_free_dmar:
3189	intel_iommu_free_dmars();
3190	up_write(&dmar_global_lock);
3191	return ret;
3192}
3193
3194static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3195{
3196	struct device_domain_info *info = opaque;
3197
3198	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3199	return 0;
3200}
3201
3202/*
3203 * NB - intel-iommu lacks any sort of reference counting for the users of
3204 * dependent devices.  If multiple endpoints have intersecting dependent
3205 * devices, unbinding the driver from any one of them will possibly leave
3206 * the others unable to operate.
3207 */
3208static void domain_context_clear(struct device_domain_info *info)
3209{
3210	if (!dev_is_pci(info->dev)) {
3211		domain_context_clear_one(info, info->bus, info->devfn);
3212		return;
3213	}
3214
3215	pci_for_each_dma_alias(to_pci_dev(info->dev),
3216			       &domain_context_clear_one_cb, info);
3217}
3218
3219/*
3220 * Clear the page table pointer in context or pasid table entries so that
3221 * all DMA requests without PASID from the device are blocked. If the page
3222 * table has been set, clean up the data structures.
3223 */
3224void device_block_translation(struct device *dev)
3225{
3226	struct device_domain_info *info = dev_iommu_priv_get(dev);
3227	struct intel_iommu *iommu = info->iommu;
3228	unsigned long flags;
3229
3230	if (info->domain)
3231		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3232
3233	iommu_disable_pci_caps(info);
3234	if (!dev_is_real_dma_subdevice(dev)) {
3235		if (sm_supported(iommu))
3236			intel_pasid_tear_down_entry(iommu, dev,
3237						    IOMMU_NO_PASID, false);
3238		else
3239			domain_context_clear(info);
3240	}
3241
3242	if (!info->domain)
3243		return;
3244
3245	spin_lock_irqsave(&info->domain->lock, flags);
3246	list_del(&info->link);
3247	spin_unlock_irqrestore(&info->domain->lock, flags);
3248
3249	domain_detach_iommu(info->domain, iommu);
3250	info->domain = NULL;
3251}
3252
3253static int blocking_domain_attach_dev(struct iommu_domain *domain,
3254				      struct device *dev)
3255{
3256	device_block_translation(dev);
3257	return 0;
3258}
3259
3260static struct iommu_domain blocking_domain = {
3261	.type = IOMMU_DOMAIN_BLOCKED,
3262	.ops = &(const struct iommu_domain_ops) {
3263		.attach_dev	= blocking_domain_attach_dev,
3264	}
3265};
3266
3267static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3268{
3269	if (!intel_iommu_superpage)
3270		return 0;
3271
3272	if (first_stage)
3273		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3274
3275	return fls(cap_super_page_val(iommu->cap));
3276}
3277
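/*
 * Allocate and initialize a paging domain for @dev, sizing the address
 * width, superpage support and IOVA aperture from the device's IOMMU
 * capabilities. @first_stage selects first- vs. second-stage translation.
 */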
3278static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3279{
3280	struct device_domain_info *info = dev_iommu_priv_get(dev);
3281	struct intel_iommu *iommu = info->iommu;
3282	struct dmar_domain *domain;
3283	int addr_width;
3284
3285	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3286	if (!domain)
3287		return ERR_PTR(-ENOMEM);
3288
3289	INIT_LIST_HEAD(&domain->devices);
3290	INIT_LIST_HEAD(&domain->dev_pasids);
3291	INIT_LIST_HEAD(&domain->cache_tags);
3292	spin_lock_init(&domain->lock);
3293	spin_lock_init(&domain->cache_lock);
3294	xa_init(&domain->iommu_array);
3295
3296	domain->nid = dev_to_node(dev);
3297	domain->use_first_level = first_stage;
3298
3299	/* calculate the address width */
3300	addr_width = agaw_to_width(iommu->agaw);
3301	if (addr_width > cap_mgaw(iommu->cap))
3302		addr_width = cap_mgaw(iommu->cap);
3303	domain->gaw = addr_width;
3304	domain->agaw = iommu->agaw;
3305	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3306
3307	/* iommu memory access coherency */
3308	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3309
3310	/* pagesize bitmap */
3311	domain->domain.pgsize_bitmap = SZ_4K;
3312	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3313	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3314
3315	/*
3316	 * IOVA aperture: First-level translation restricts the input-address
3317	 * to a canonical address (i.e., address bits 63:N have the same value
3318	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3319	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3320	 */
3321	domain->domain.geometry.force_aperture = true;
3322	domain->domain.geometry.aperture_start = 0;
3323	if (first_stage)
3324		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3325	else
3326		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3327
3328	/* always allocate the top pgd */
3329	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3330	if (!domain->pgd) {
3331		kfree(domain);
3332		return ERR_PTR(-ENOMEM);
3333	}
3334	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3335
3336	return domain;
3337}
3338
3339static struct iommu_domain *
3340intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3341				      const struct iommu_user_data *user_data)
3342{
3343	struct device_domain_info *info = dev_iommu_priv_get(dev);
3344	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3345	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3346	struct intel_iommu *iommu = info->iommu;
3347	struct dmar_domain *dmar_domain;
3348	struct iommu_domain *domain;
3349	bool first_stage;
3350
3351	if (flags &
3352	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3353	       | IOMMU_HWPT_FAULT_ID_VALID)))
3354		return ERR_PTR(-EOPNOTSUPP);
3355	if (nested_parent && !nested_supported(iommu))
3356		return ERR_PTR(-EOPNOTSUPP);
3357	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3358		return ERR_PTR(-EOPNOTSUPP);
3359
3360	/*
3361	 * Always allocate the guest compatible page table unless
3362	 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3363	 * is specified.
3364	 */
3365	if (nested_parent || dirty_tracking) {
3366		if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3367			return ERR_PTR(-EOPNOTSUPP);
3368		first_stage = false;
3369	} else {
3370		first_stage = first_level_by_default(iommu);
3371	}
3372
3373	dmar_domain = paging_domain_alloc(dev, first_stage);
3374	if (IS_ERR(dmar_domain))
3375		return ERR_CAST(dmar_domain);
3376	domain = &dmar_domain->domain;
3377	domain->type = IOMMU_DOMAIN_UNMANAGED;
3378	domain->owner = &intel_iommu_ops;
3379	domain->ops = intel_iommu_ops.default_domain_ops;
3380
3381	if (nested_parent) {
3382		dmar_domain->nested_parent = true;
3383		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3384		spin_lock_init(&dmar_domain->s1_lock);
3385	}
3386
3387	if (dirty_tracking) {
3388		if (dmar_domain->use_first_level) {
3389			iommu_domain_free(domain);
3390			return ERR_PTR(-EOPNOTSUPP);
3391		}
3392		domain->dirty_ops = &intel_dirty_ops;
3393	}
3394
3395	return domain;
3396}
3397
3398static void intel_iommu_domain_free(struct iommu_domain *domain)
3399{
3400	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3401
3402	WARN_ON(dmar_domain->nested_parent &&
3403		!list_empty(&dmar_domain->s1_domains));
3404	domain_exit(dmar_domain);
3405}
3406
3407int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3408{
3409	struct device_domain_info *info = dev_iommu_priv_get(dev);
3410	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3411	struct intel_iommu *iommu = info->iommu;
3412	int addr_width;
3413
3414	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3415		return -EPERM;
3416
3417	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3418		return -EINVAL;
3419
3420	if (domain->dirty_ops && !ssads_supported(iommu))
3421		return -EINVAL;
3422
3423	if (dmar_domain->iommu_coherency !=
3424			iommu_paging_structure_coherency(iommu))
3425		return -EINVAL;
3426
3427	if (dmar_domain->iommu_superpage !=
3428			iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3429		return -EINVAL;
3430
3431	if (dmar_domain->use_first_level &&
3432	    (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3433		return -EINVAL;
3434
3435	/* check if this iommu agaw is sufficient for max mapped address */
3436	addr_width = agaw_to_width(iommu->agaw);
3437	if (addr_width > cap_mgaw(iommu->cap))
3438		addr_width = cap_mgaw(iommu->cap);
3439
3440	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3441		return -EINVAL;
3442
3443	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3444	    context_copied(iommu, info->bus, info->devfn))
3445		return intel_pasid_setup_sm_context(dev);
3446
3447	return 0;
3448}
3449
3450static int intel_iommu_attach_device(struct iommu_domain *domain,
3451				     struct device *dev)
3452{
3453	int ret;
3454
3455	device_block_translation(dev);
3456
3457	ret = paging_domain_compatible(domain, dev);
3458	if (ret)
3459		return ret;
3460
3461	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3462}
3463
3464static int intel_iommu_map(struct iommu_domain *domain,
3465			   unsigned long iova, phys_addr_t hpa,
3466			   size_t size, int iommu_prot, gfp_t gfp)
3467{
3468	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3469	u64 max_addr;
3470	int prot = 0;
3471
3472	if (iommu_prot & IOMMU_READ)
3473		prot |= DMA_PTE_READ;
3474	if (iommu_prot & IOMMU_WRITE)
3475		prot |= DMA_PTE_WRITE;
3476	if (dmar_domain->set_pte_snp)
3477		prot |= DMA_PTE_SNP;
3478
3479	max_addr = iova + size;
3480	if (dmar_domain->max_addr < max_addr) {
3481		u64 end;
3482
3483		/* check if minimum agaw is sufficient for mapped address */
3484		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3485		if (end < max_addr) {
3486			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
3487			       __func__, dmar_domain->gaw,
3488			       max_addr);
3489			return -EFAULT;
3490		}
3491		dmar_domain->max_addr = max_addr;
3492	}
3493	/* Round up size to next multiple of PAGE_SIZE, if it and
3494	   the low bits of hpa would take us onto the next page */
3495	size = aligned_nrpages(hpa, size);
3496	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3497				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3498}
3499
3500static int intel_iommu_map_pages(struct iommu_domain *domain,
3501				 unsigned long iova, phys_addr_t paddr,
3502				 size_t pgsize, size_t pgcount,
3503				 int prot, gfp_t gfp, size_t *mapped)
3504{
3505	unsigned long pgshift = __ffs(pgsize);
3506	size_t size = pgcount << pgshift;
3507	int ret;
3508
3509	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3510		return -EINVAL;
3511
3512	if (!IS_ALIGNED(iova | paddr, pgsize))
3513		return -EINVAL;
3514
3515	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3516	if (!ret && mapped)
3517		*mapped = size;
3518
3519	return ret;
3520}
3521
3522static size_t intel_iommu_unmap(struct iommu_domain *domain,
3523				unsigned long iova, size_t size,
3524				struct iommu_iotlb_gather *gather)
3525{
3526	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3527	unsigned long start_pfn, last_pfn;
3528	int level = 0;
3529
3530	/* Cope with horrid API which requires us to unmap more than the
3531	   size argument if it happens to be a large-page mapping. */
3532	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3533				     &level, GFP_ATOMIC)))
3534		return 0;
3535
3536	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3537		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3538
3539	start_pfn = iova >> VTD_PAGE_SHIFT;
3540	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3541
3542	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3543
3544	if (dmar_domain->max_addr == iova + size)
3545		dmar_domain->max_addr = iova;
3546
3547	/*
3548	 * We do not use page-selective IOTLB invalidation in flush queue,
3549	 * so there is no need to track page and sync iotlb.
3550	 */
3551	if (!iommu_iotlb_gather_queued(gather))
3552		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3553
3554	return size;
3555}
3556
3557static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3558				      unsigned long iova,
3559				      size_t pgsize, size_t pgcount,
3560				      struct iommu_iotlb_gather *gather)
3561{
3562	unsigned long pgshift = __ffs(pgsize);
3563	size_t size = pgcount << pgshift;
3564
3565	return intel_iommu_unmap(domain, iova, size, gather);
3566}
3567
3568static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3569				 struct iommu_iotlb_gather *gather)
3570{
3571	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3572			      gather->end, list_empty(&gather->freelist));
3573	iommu_put_pages_list(&gather->freelist);
3574}
3575
3576static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3577					    dma_addr_t iova)
3578{
3579	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3580	struct dma_pte *pte;
3581	int level = 0;
3582	u64 phys = 0;
3583
3584	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3585			     GFP_ATOMIC);
3586	if (pte && dma_pte_present(pte))
3587		phys = dma_pte_addr(pte) +
3588			(iova & (BIT_MASK(level_to_offset_bits(level) +
3589						VTD_PAGE_SHIFT) - 1));
3590
3591	return phys;
3592}
3593
3594static bool domain_support_force_snooping(struct dmar_domain *domain)
3595{
3596	struct device_domain_info *info;
3597	bool support = true;
3598
3599	assert_spin_locked(&domain->lock);
3600	list_for_each_entry(info, &domain->devices, link) {
3601		if (!ecap_sc_support(info->iommu->ecap)) {
3602			support = false;
3603			break;
3604		}
3605	}
3606
3607	return support;
3608}
3609
3610static void domain_set_force_snooping(struct dmar_domain *domain)
3611{
3612	struct device_domain_info *info;
3613
3614	assert_spin_locked(&domain->lock);
3615	/*
3616	 * Second level page table supports per-PTE snoop control. The
3617	 * iommu_map() interface will handle this by setting SNP bit.
3618	 */
3619	if (!domain->use_first_level) {
3620		domain->set_pte_snp = true;
3621		return;
3622	}
3623
3624	list_for_each_entry(info, &domain->devices, link)
3625		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3626						     IOMMU_NO_PASID);
3627}
3628
3629static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3630{
3631	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3632	unsigned long flags;
3633
3634	if (dmar_domain->force_snooping)
3635		return true;
3636
3637	spin_lock_irqsave(&dmar_domain->lock, flags);
3638	if (!domain_support_force_snooping(dmar_domain) ||
3639	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3640		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3641		return false;
3642	}
3643
3644	domain_set_force_snooping(dmar_domain);
3645	dmar_domain->force_snooping = true;
3646	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3647
3648	return true;
3649}
3650
3651static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3652{
3653	struct device_domain_info *info = dev_iommu_priv_get(dev);
3654
3655	switch (cap) {
3656	case IOMMU_CAP_CACHE_COHERENCY:
3657	case IOMMU_CAP_DEFERRED_FLUSH:
3658		return true;
3659	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3660		return dmar_platform_optin();
3661	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3662		return ecap_sc_support(info->iommu->ecap);
3663	case IOMMU_CAP_DIRTY_TRACKING:
3664		return ssads_supported(info->iommu);
3665	default:
3666		return false;
3667	}
3668}
3669
3670static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3671{
3672	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3673	struct device_domain_info *info;
3674	struct intel_iommu *iommu;
3675	u8 bus, devfn;
3676	int ret;
3677
3678	iommu = device_lookup_iommu(dev, &bus, &devfn);
3679	if (!iommu || !iommu->iommu.ops)
3680		return ERR_PTR(-ENODEV);
3681
3682	info = kzalloc(sizeof(*info), GFP_KERNEL);
3683	if (!info)
3684		return ERR_PTR(-ENOMEM);
3685
3686	if (dev_is_real_dma_subdevice(dev)) {
3687		info->bus = pdev->bus->number;
3688		info->devfn = pdev->devfn;
3689		info->segment = pci_domain_nr(pdev->bus);
3690	} else {
3691		info->bus = bus;
3692		info->devfn = devfn;
3693		info->segment = iommu->segment;
3694	}
3695
3696	info->dev = dev;
3697	info->iommu = iommu;
3698	if (dev_is_pci(dev)) {
3699		if (ecap_dev_iotlb_support(iommu->ecap) &&
3700		    pci_ats_supported(pdev) &&
3701		    dmar_ats_supported(pdev, iommu)) {
3702			info->ats_supported = 1;
3703			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3704
3705			/*
3706			 * For IOMMU that supports device IOTLB throttling
3707			 * (DIT), we assign PFSID to the invalidation desc
3708			 * of a VF such that IOMMU HW can gauge queue depth
3709			 * at PF level. If DIT is not set, PFSID will be
3710			 * treated as reserved, which should be set to 0.
3711			 */
3712			if (ecap_dit(iommu->ecap))
3713				info->pfsid = pci_dev_id(pci_physfn(pdev));
3714			info->ats_qdep = pci_ats_queue_depth(pdev);
3715		}
3716		if (sm_supported(iommu)) {
3717			if (pasid_supported(iommu)) {
3718				int features = pci_pasid_features(pdev);
3719
3720				if (features >= 0)
3721					info->pasid_supported = features | 1;
3722			}
3723
3724			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3725			    pci_pri_supported(pdev))
3726				info->pri_supported = 1;
3727		}
3728	}
3729
3730	dev_iommu_priv_set(dev, info);
3731	if (pdev && pci_ats_supported(pdev)) {
3732		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3733		ret = device_rbtree_insert(iommu, info);
3734		if (ret)
3735			goto free;
3736	}
3737
3738	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3739		ret = intel_pasid_alloc_table(dev);
3740		if (ret) {
3741			dev_err(dev, "PASID table allocation failed\n");
3742			goto clear_rbtree;
3743		}
3744
3745		if (!context_copied(iommu, info->bus, info->devfn)) {
3746			ret = intel_pasid_setup_sm_context(dev);
3747			if (ret)
3748				goto free_table;
3749		}
3750	}
3751
3752	intel_iommu_debugfs_create_dev(info);
3753
3754	/*
3755	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3756	 * device is undefined if you enable PASID support after ATS support.
3757	 * So always enable PASID support on devices which have it, even if
3758	 * we can't yet know if we're ever going to use it.
3759	 */
3760	if (info->pasid_supported &&
3761	    !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3762		info->pasid_enabled = 1;
3763
3764	return &iommu->iommu;
3765free_table:
3766	intel_pasid_free_table(dev);
3767clear_rbtree:
3768	device_rbtree_remove(info);
3769free:
3770	kfree(info);
3771
3772	return ERR_PTR(ret);
3773}
3774
3775static void intel_iommu_release_device(struct device *dev)
3776{
3777	struct device_domain_info *info = dev_iommu_priv_get(dev);
3778	struct intel_iommu *iommu = info->iommu;
3779
3780	if (info->pasid_enabled) {
3781		pci_disable_pasid(to_pci_dev(dev));
3782		info->pasid_enabled = 0;
3783	}
3784
3785	mutex_lock(&iommu->iopf_lock);
3786	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3787		device_rbtree_remove(info);
3788	mutex_unlock(&iommu->iopf_lock);
3789
3790	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3791	    !context_copied(iommu, info->bus, info->devfn))
3792		intel_pasid_teardown_sm_context(dev);
3793
3794	intel_pasid_free_table(dev);
3795	intel_iommu_debugfs_remove_dev(info);
3796	kfree(info);
3797	set_dma_ops(dev, NULL);
3798}
3799
3800static void intel_iommu_get_resv_regions(struct device *device,
3801					 struct list_head *head)
3802{
3803	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3804	struct iommu_resv_region *reg;
3805	struct dmar_rmrr_unit *rmrr;
3806	struct device *i_dev;
3807	int i;
3808
3809	rcu_read_lock();
3810	for_each_rmrr_units(rmrr) {
3811		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3812					  i, i_dev) {
3813			struct iommu_resv_region *resv;
3814			enum iommu_resv_type type;
3815			size_t length;
3816
3817			if (i_dev != device &&
3818			    !is_downstream_to_pci_bridge(device, i_dev))
3819				continue;
3820
3821			length = rmrr->end_address - rmrr->base_address + 1;
3822
3823			type = device_rmrr_is_relaxable(device) ?
3824				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3825
3826			resv = iommu_alloc_resv_region(rmrr->base_address,
3827						       length, prot, type,
3828						       GFP_ATOMIC);
3829			if (!resv)
3830				break;
3831
3832			list_add_tail(&resv->list, head);
3833		}
3834	}
3835	rcu_read_unlock();
3836
3837#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3838	if (dev_is_pci(device)) {
3839		struct pci_dev *pdev = to_pci_dev(device);
3840
3841		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3842			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3843					IOMMU_RESV_DIRECT_RELAXABLE,
3844					GFP_KERNEL);
3845			if (reg)
3846				list_add_tail(&reg->list, head);
3847		}
3848	}
3849#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3850
3851	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3852				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3853				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3854	if (!reg)
3855		return;
3856	list_add_tail(&reg->list, head);
3857}
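/*
 * A minimal usage sketch (illustrative only, not compiled): consumers do not
 * call intel_iommu_get_resv_regions() directly but go through the generic
 * IOMMU API, roughly as below. The helper name example_dump_resv_regions()
 * is made up for illustration.
 */
#if 0
static void example_dump_resv_regions(struct device *dev)
{
	struct iommu_resv_region *region;
	LIST_HEAD(resv_regions);

	iommu_get_resv_regions(dev, &resv_regions);
	list_for_each_entry(region, &resv_regions, list)
		dev_info(dev, "reserved [0x%llx - 0x%llx] type %d\n",
			 (u64)region->start,
			 (u64)(region->start + region->length - 1),
			 region->type);
	iommu_put_resv_regions(dev, &resv_regions);
}
#endif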
3858
3859static struct iommu_group *intel_iommu_device_group(struct device *dev)
3860{
3861	if (dev_is_pci(dev))
3862		return pci_device_group(dev);
3863	return generic_device_group(dev);
3864}
3865
3866static int intel_iommu_enable_sva(struct device *dev)
3867{
3868	struct device_domain_info *info = dev_iommu_priv_get(dev);
3869	struct intel_iommu *iommu;
3870
3871	if (!info || dmar_disabled)
3872		return -EINVAL;
3873
3874	iommu = info->iommu;
3875	if (!iommu)
3876		return -EINVAL;
3877
3878	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
3879		return -ENODEV;
3880
3881	if (!info->pasid_enabled || !info->ats_enabled)
3882		return -EINVAL;
3883
3884	/*
3885	 * Devices with device-specific I/O fault handling are not expected
3886	 * to support PCI PRI, and the IOMMU has no way to probe for such
3887	 * device-specific IOPF capability. The IOMMU therefore assumes that
3888	 * if a driver enables SVA on a non-PRI device, the driver handles
3889	 * I/O page faults in its own way.
3890	 */
3891	if (!info->pri_supported)
3892		return 0;
3893
3894	/* Devices supporting PRI should have it enabled. */
3895	if (!info->pri_enabled)
3896		return -EINVAL;
3897
3898	return 0;
3899}
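/*
 * A minimal driver-side sketch (illustrative only, not compiled) of the path
 * that ends up in intel_iommu_enable_sva() above and intel_iommu_enable_iopf()
 * below: an endpoint driver enables IOPF and SVA through the generic IOMMU
 * API and then binds the current process address space. The helper name
 * example_bind_current_mm() is made up for illustration.
 */
#if 0
static struct iommu_sva *example_bind_current_mm(struct device *dev)
{
	int ret;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
	if (ret)
		return ERR_PTR(ret);

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
	if (ret) {
		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
		return ERR_PTR(ret);
	}

	return iommu_sva_bind_device(dev, current->mm);
}
#endif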
3900
3901static int context_flip_pri(struct device_domain_info *info, bool enable)
3902{
3903	struct intel_iommu *iommu = info->iommu;
3904	u8 bus = info->bus, devfn = info->devfn;
3905	struct context_entry *context;
3906	u16 did;
3907
3908	spin_lock(&iommu->lock);
3909	if (context_copied(iommu, bus, devfn)) {
3910		spin_unlock(&iommu->lock);
3911		return -EINVAL;
3912	}
3913
3914	context = iommu_context_addr(iommu, bus, devfn, false);
3915	if (!context || !context_present(context)) {
3916		spin_unlock(&iommu->lock);
3917		return -ENODEV;
3918	}
3919	did = context_domain_id(context);
3920
3921	if (enable)
3922		context_set_sm_pre(context);
3923	else
3924		context_clear_sm_pre(context);
3925
3926	if (!ecap_coherent(iommu->ecap))
3927		clflush_cache_range(context, sizeof(*context));
3928	intel_context_flush_present(info, context, did, true);
3929	spin_unlock(&iommu->lock);
3930
3931	return 0;
3932}
3933
3934static int intel_iommu_enable_iopf(struct device *dev)
3935{
3936	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3937	struct device_domain_info *info = dev_iommu_priv_get(dev);
3938	struct intel_iommu *iommu;
3939	int ret;
3940
3941	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
3942		return -ENODEV;
3943
3944	if (info->pri_enabled)
3945		return -EBUSY;
3946
3947	iommu = info->iommu;
3948	if (!iommu)
3949		return -EINVAL;
3950
3951	/* PASID is required in PRG Response Message. */
3952	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
3953		return -EINVAL;
3954
3955	ret = pci_reset_pri(pdev);
3956	if (ret)
3957		return ret;
3958
3959	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3960	if (ret)
3961		return ret;
3962
3963	ret = context_flip_pri(info, true);
3964	if (ret)
3965		goto err_remove_device;
3966
3967	ret = pci_enable_pri(pdev, PRQ_DEPTH);
3968	if (ret)
3969		goto err_clear_pri;
3970
3971	info->pri_enabled = 1;
3972
3973	return 0;
3974err_clear_pri:
3975	context_flip_pri(info, false);
3976err_remove_device:
3977	iopf_queue_remove_device(iommu->iopf_queue, dev);
3978
3979	return ret;
3980}
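/*
 * Note the ordering above: the device is added to the IOPF queue and the PRE
 * bit is set in its context entry before PRI is enabled at the PCI level,
 * and the error unwind undoes those steps in the opposite order.
 */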
3981
3982static int intel_iommu_disable_iopf(struct device *dev)
3983{
3984	struct device_domain_info *info = dev_iommu_priv_get(dev);
3985	struct intel_iommu *iommu = info->iommu;
3986
3987	if (!info->pri_enabled)
3988		return -EINVAL;
3989
3990	/* Disable new PRI reception: */
3991	context_flip_pri(info, false);
3992
3993	/*
3994	 * Remove device from fault queue and acknowledge all outstanding
3995	 * PRQs to the device:
3996	 */
3997	iopf_queue_remove_device(iommu->iopf_queue, dev);
3998
3999	/*
4000	 * The PCIe spec states that clearing the PRI Enable bit stops the
4001	 * Page Request Interface from issuing new page requests, but it may
4002	 * still have outstanding requests that were transmitted or queued
4003	 * for transmission. This is therefore expected to be called only
4004	 * after the device driver has stopped DMA, all PASIDs have been
4005	 * unbound and the outstanding PRQs have been drained.
4006	 */
4007	pci_disable_pri(to_pci_dev(dev));
4008	info->pri_enabled = 0;
4009
4010	return 0;
4011}
4012
4013static int
4014intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4015{
4016	switch (feat) {
4017	case IOMMU_DEV_FEAT_IOPF:
4018		return intel_iommu_enable_iopf(dev);
4019
4020	case IOMMU_DEV_FEAT_SVA:
4021		return intel_iommu_enable_sva(dev);
4022
4023	default:
4024		return -ENODEV;
4025	}
4026}
4027
4028static int
4029intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4030{
4031	switch (feat) {
4032	case IOMMU_DEV_FEAT_IOPF:
4033		return intel_iommu_disable_iopf(dev);
4034
4035	case IOMMU_DEV_FEAT_SVA:
4036		return 0;
4037
4038	default:
4039		return -ENODEV;
4040	}
4041}
4042
4043static bool intel_iommu_is_attach_deferred(struct device *dev)
4044{
4045	struct device_domain_info *info = dev_iommu_priv_get(dev);
4046
4047	return translation_pre_enabled(info->iommu) && !info->domain;
4048}
4049
4050/*
4051 * Check that the device does not live on an external-facing PCI port that is
4052 * marked as untrusted. Quirks must not be applied to such devices, so that
4053 * they cannot use a quirk to bypass IOMMU restrictions.
4054 */
4055static bool risky_device(struct pci_dev *pdev)
4056{
4057	if (pdev->untrusted) {
4058		pci_info(pdev,
4059			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4060			 pdev->vendor, pdev->device);
4061		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4062		return true;
4063	}
4064	return false;
4065}
4066
4067static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4068				      unsigned long iova, size_t size)
4069{
4070	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4071
4072	return 0;
4073}
4074
4075void domain_remove_dev_pasid(struct iommu_domain *domain,
4076			     struct device *dev, ioasid_t pasid)
4077{
4078	struct device_domain_info *info = dev_iommu_priv_get(dev);
4079	struct dev_pasid_info *curr, *dev_pasid = NULL;
4080	struct intel_iommu *iommu = info->iommu;
4081	struct dmar_domain *dmar_domain;
4082	unsigned long flags;
4083
4084	if (!domain)
4085		return;
4086
4087	/* The identity domain keeps no per-PASID metadata. */
4088	if (domain->type == IOMMU_DOMAIN_IDENTITY)
4089		return;
4090
4091	dmar_domain = to_dmar_domain(domain);
4092	spin_lock_irqsave(&dmar_domain->lock, flags);
4093	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4094		if (curr->dev == dev && curr->pasid == pasid) {
4095			list_del(&curr->link_domain);
4096			dev_pasid = curr;
4097			break;
4098		}
4099	}
4100	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4101
4102	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4103	domain_detach_iommu(dmar_domain, iommu);
4104	if (!WARN_ON_ONCE(!dev_pasid)) {
4105		intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4106		kfree(dev_pasid);
4107	}
4108}
4109
4110static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4111					 struct iommu_domain *domain)
4112{
4113	struct device_domain_info *info = dev_iommu_priv_get(dev);
4114
4115	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4116	domain_remove_dev_pasid(domain, dev, pasid);
4117}
4118
4119struct dev_pasid_info *
4120domain_add_dev_pasid(struct iommu_domain *domain,
4121		     struct device *dev, ioasid_t pasid)
4122{
4123	struct device_domain_info *info = dev_iommu_priv_get(dev);
4124	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4125	struct intel_iommu *iommu = info->iommu;
4126	struct dev_pasid_info *dev_pasid;
4127	unsigned long flags;
4128	int ret;
4129
4130	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4131	if (!dev_pasid)
4132		return ERR_PTR(-ENOMEM);
4133
4134	ret = domain_attach_iommu(dmar_domain, iommu);
4135	if (ret)
4136		goto out_free;
4137
4138	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4139	if (ret)
4140		goto out_detach_iommu;
4141
4142	dev_pasid->dev = dev;
4143	dev_pasid->pasid = pasid;
4144	spin_lock_irqsave(&dmar_domain->lock, flags);
4145	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4146	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4147
4148	return dev_pasid;
4149out_detach_iommu:
4150	domain_detach_iommu(dmar_domain, iommu);
4151out_free:
4152	kfree(dev_pasid);
4153	return ERR_PTR(ret);
4154}
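/*
 * The dev_pasid_info allocated here is the object that
 * domain_remove_dev_pasid() later looks up on the domain's dev_pasids list
 * and frees.
 */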
4155
4156static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4157				     struct device *dev, ioasid_t pasid,
4158				     struct iommu_domain *old)
4159{
4160	struct device_domain_info *info = dev_iommu_priv_get(dev);
4161	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4162	struct intel_iommu *iommu = info->iommu;
4163	struct dev_pasid_info *dev_pasid;
4164	int ret;
4165
4166	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4167		return -EINVAL;
4168
4169	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4170		return -EOPNOTSUPP;
4171
4172	if (domain->dirty_ops)
4173		return -EINVAL;
4174
4175	if (context_copied(iommu, info->bus, info->devfn))
4176		return -EBUSY;
4177
4178	ret = paging_domain_compatible(domain, dev);
4179	if (ret)
4180		return ret;
4181
4182	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4183	if (IS_ERR(dev_pasid))
4184		return PTR_ERR(dev_pasid);
4185
4186	if (dmar_domain->use_first_level)
4187		ret = domain_setup_first_level(iommu, dmar_domain,
4188					       dev, pasid, old);
4189	else
4190		ret = domain_setup_second_level(iommu, dmar_domain,
4191						dev, pasid, old);
4192	if (ret)
4193		goto out_remove_dev_pasid;
4194
4195	domain_remove_dev_pasid(old, dev, pasid);
4196
4197	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4198
4199	return 0;
4200
4201out_remove_dev_pasid:
4202	domain_remove_dev_pasid(domain, dev, pasid);
4203	return ret;
4204}
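/*
 * Note that the new translation is installed via domain_setup_first_level()
 * or domain_setup_second_level() before the old domain's bookkeeping is
 * removed, so a PASID being replaced is never left without a valid
 * configuration on the success path.
 */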
4205
4206static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4207{
4208	struct device_domain_info *info = dev_iommu_priv_get(dev);
4209	struct intel_iommu *iommu = info->iommu;
4210	struct iommu_hw_info_vtd *vtd;
4211
4212	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4213	if (!vtd)
4214		return ERR_PTR(-ENOMEM);
4215
4216	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4217	vtd->cap_reg = iommu->cap;
4218	vtd->ecap_reg = iommu->ecap;
4219	*length = sizeof(*vtd);
4220	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4221	return vtd;
4222}
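/*
 * struct iommu_hw_info_vtd is defined in uapi/linux/iommufd.h; the data
 * filled in above is what userspace sees when it queries hardware information
 * through iommufd (the IOMMU_GET_HW_INFO ioctl at the time of writing),
 * exposing the raw capability and extended capability registers.
 */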
4223
4224/*
4225 * Set dirty tracking for the device list of a domain. The caller must
4226 * hold the domain->lock when calling it.
4227 */
4228static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4229{
4230	struct device_domain_info *info;
4231	int ret = 0;
4232
4233	list_for_each_entry(info, devices, link) {
4234		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4235						       IOMMU_NO_PASID, enable);
4236		if (ret)
4237			break;
4238	}
4239
4240	return ret;
4241}
4242
4243static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4244					    bool enable)
4245{
4246	struct dmar_domain *s1_domain;
4247	unsigned long flags;
4248	int ret;
4249
4250	spin_lock(&domain->s1_lock);
4251	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4252		spin_lock_irqsave(&s1_domain->lock, flags);
4253		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4254		spin_unlock_irqrestore(&s1_domain->lock, flags);
4255		if (ret)
4256			goto err_unwind;
4257	}
4258	spin_unlock(&domain->s1_lock);
4259	return 0;
4260
4261err_unwind:
4262	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4263		spin_lock_irqsave(&s1_domain->lock, flags);
4264		device_set_dirty_tracking(&s1_domain->devices,
4265					  domain->dirty_tracking);
4266		spin_unlock_irqrestore(&s1_domain->lock, flags);
4267	}
4268	spin_unlock(&domain->s1_lock);
4269	return ret;
4270}
4271
4272static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4273					  bool enable)
4274{
4275	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4276	int ret;
4277
4278	spin_lock(&dmar_domain->lock);
4279	if (dmar_domain->dirty_tracking == enable)
4280		goto out_unlock;
4281
4282	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4283	if (ret)
4284		goto err_unwind;
4285
4286	if (dmar_domain->nested_parent) {
4287		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4288		if (ret)
4289			goto err_unwind;
4290	}
4291
4292	dmar_domain->dirty_tracking = enable;
4293out_unlock:
4294	spin_unlock(&dmar_domain->lock);
4295
4296	return 0;
4297
4298err_unwind:
4299	device_set_dirty_tracking(&dmar_domain->devices,
4300				  dmar_domain->dirty_tracking);
4301	spin_unlock(&dmar_domain->lock);
4302	return ret;
4303}
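/*
 * On a partial failure above, the devices that were already switched are
 * rolled back to the previous dmar_domain->dirty_tracking value (nested
 * first-stage domains are rolled back inside
 * parent_domain_set_dirty_tracking() itself), leaving the domain in a
 * consistent state.
 */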
4304
4305static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4306					    unsigned long iova, size_t size,
4307					    unsigned long flags,
4308					    struct iommu_dirty_bitmap *dirty)
4309{
4310	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4311	unsigned long end = iova + size - 1;
4312	unsigned long pgsize;
4313
4314	/*
4315	 * The IOMMUFD core calls into a domain with dirty tracking disabled,
4316	 * and without an IOVA bitmap set, in order to clear any dirty bits
4317	 * that may have been set while dirty tracking was being stopped. This
4318	 * ensures that we never inherit dirtied bits from a previous cycle.
4319	 */
4320	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4321		return -EINVAL;
4322
4323	do {
4324		struct dma_pte *pte;
4325		int lvl = 0;
4326
4327		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4328				     GFP_ATOMIC);
4329		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4330		if (!pte || !dma_pte_present(pte)) {
4331			iova += pgsize;
4332			continue;
4333		}
4334
4335		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4336			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4337		iova += pgsize;
4338	} while (iova < end);
4339
4340	return 0;
4341}
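/*
 * The walk above advances by whatever page size backs each IOVA: for example,
 * when pfn_to_dma_pte() lands on a 2MiB superpage PTE, level_size(lvl) makes
 * pgsize 2MiB, so a single dirty bit covers the whole 2MiB range and the
 * range is recorded in one step.
 */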
4342
4343static const struct iommu_dirty_ops intel_dirty_ops = {
4344	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4345	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4346};
4347
4348static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4349{
4350	struct device_domain_info *info = dev_iommu_priv_get(dev);
4351	struct intel_iommu *iommu = info->iommu;
4352	struct context_entry *context;
4353
4354	spin_lock(&iommu->lock);
4355	context = iommu_context_addr(iommu, bus, devfn, 1);
4356	if (!context) {
4357		spin_unlock(&iommu->lock);
4358		return -ENOMEM;
4359	}
4360
4361	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4362		spin_unlock(&iommu->lock);
4363		return 0;
4364	}
4365
4366	copied_context_tear_down(iommu, context, bus, devfn);
4367	context_clear_entry(context);
4368	context_set_domain_id(context, FLPT_DEFAULT_DID);
4369
4370	/*
4371	 * In pass-through mode, AW must be programmed to indicate the largest
4372	 * AGAW value supported by hardware, while ASR is ignored by hardware.
4373	 */
4374	context_set_address_width(context, iommu->msagaw);
4375	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4376	context_set_fault_enable(context);
4377	context_set_present(context);
4378	if (!ecap_coherent(iommu->ecap))
4379		clflush_cache_range(context, sizeof(*context));
4380	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4381	spin_unlock(&iommu->lock);
4382
4383	return 0;
4384}
4385
4386static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4387{
4388	struct device *dev = data;
4389
4390	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4391}
4392
4393static int device_setup_pass_through(struct device *dev)
4394{
4395	struct device_domain_info *info = dev_iommu_priv_get(dev);
4396
4397	if (!dev_is_pci(dev))
4398		return context_setup_pass_through(dev, info->bus, info->devfn);
4399
4400	return pci_for_each_dma_alias(to_pci_dev(dev),
4401				      context_setup_pass_through_cb, dev);
4402}
4403
4404static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4405{
4406	struct device_domain_info *info = dev_iommu_priv_get(dev);
4407	struct intel_iommu *iommu = info->iommu;
4408	int ret;
4409
4410	device_block_translation(dev);
4411
4412	if (dev_is_real_dma_subdevice(dev))
4413		return 0;
4414
4415	if (sm_supported(iommu)) {
4416		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4417		if (!ret)
4418			iommu_enable_pci_caps(info);
4419	} else {
4420		ret = device_setup_pass_through(dev);
4421	}
4422
4423	return ret;
4424}
4425
4426static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4427					 struct device *dev, ioasid_t pasid,
4428					 struct iommu_domain *old)
4429{
4430	struct device_domain_info *info = dev_iommu_priv_get(dev);
4431	struct intel_iommu *iommu = info->iommu;
4432	int ret;
4433
4434	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4435		return -EOPNOTSUPP;
4436
4437	ret = domain_setup_passthrough(iommu, dev, pasid, old);
4438	if (ret)
4439		return ret;
4440
4441	domain_remove_dev_pasid(old, dev, pasid);
4442	return 0;
4443}
4444
4445static struct iommu_domain identity_domain = {
4446	.type = IOMMU_DOMAIN_IDENTITY,
4447	.ops = &(const struct iommu_domain_ops) {
4448		.attach_dev	= identity_domain_attach_dev,
4449		.set_dev_pasid	= identity_domain_set_dev_pasid,
4450	},
4451};
4452
4453static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev)
4454{
4455	struct device_domain_info *info = dev_iommu_priv_get(dev);
4456	struct intel_iommu *iommu = info->iommu;
4457	struct dmar_domain *dmar_domain;
4458	bool first_stage;
4459
4460	first_stage = first_level_by_default(iommu);
4461	dmar_domain = paging_domain_alloc(dev, first_stage);
4462	if (IS_ERR(dmar_domain))
4463		return ERR_CAST(dmar_domain);
4464
4465	return &dmar_domain->domain;
4466}
4467
4468const struct iommu_ops intel_iommu_ops = {
4469	.blocked_domain		= &blocking_domain,
4470	.release_domain		= &blocking_domain,
4471	.identity_domain	= &identity_domain,
4472	.capable		= intel_iommu_capable,
4473	.hw_info		= intel_iommu_hw_info,
4474	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4475	.domain_alloc_sva	= intel_svm_domain_alloc,
4476	.domain_alloc_paging	= intel_iommu_domain_alloc_paging,
4477	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
4478	.probe_device		= intel_iommu_probe_device,
4479	.release_device		= intel_iommu_release_device,
4480	.get_resv_regions	= intel_iommu_get_resv_regions,
4481	.device_group		= intel_iommu_device_group,
4482	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4483	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4484	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4485	.def_domain_type	= device_def_domain_type,
4486	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4487	.pgsize_bitmap		= SZ_4K,
4488	.page_response		= intel_iommu_page_response,
4489	.default_domain_ops = &(const struct iommu_domain_ops) {
4490		.attach_dev		= intel_iommu_attach_device,
4491		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4492		.map_pages		= intel_iommu_map_pages,
4493		.unmap_pages		= intel_iommu_unmap_pages,
4494		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4495		.flush_iotlb_all        = intel_flush_iotlb_all,
4496		.iotlb_sync		= intel_iommu_tlb_sync,
4497		.iova_to_phys		= intel_iommu_iova_to_phys,
4498		.free			= intel_iommu_domain_free,
4499		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4500	}
4501};
4502
4503static void quirk_iommu_igfx(struct pci_dev *dev)
4504{
4505	if (risky_device(dev))
4506		return;
4507
4508	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4509	disable_igfx_iommu = 1;
4510}
4511
4512/* G4x/GM45 integrated gfx dmar support is totally busted. */
4513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4514DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4515DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4516DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4517DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4518DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4519DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4520
4521/* Broadwell igfx malfunctions with dmar */
4522DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4523DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4524DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4525DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4526DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4527DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4528DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4529DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4530DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4531DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4532DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4533DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4534DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4535DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4536DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4537DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4538DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4539DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4540DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4541DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4542DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4543DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4544DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4545DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4546
4547static void quirk_iommu_rwbf(struct pci_dev *dev)
4548{
4549	if (risky_device(dev))
4550		return;
4551
4552	/*
4553	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4554	 * but needs it. Same seems to hold for the desktop versions.
4555	 */
4556	pci_info(dev, "Forcing write-buffer flush capability\n");
4557	rwbf_quirk = 1;
4558}
4559
4560DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4561DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4562DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4563DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4564DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4565DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4566DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4567
4568#define GGC 0x52
4569#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4570#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4571#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4572#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4573#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4574#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4575#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4576#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4577
4578static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4579{
4580	unsigned short ggc;
4581
4582	if (risky_device(dev))
4583		return;
4584
4585	if (pci_read_config_word(dev, GGC, &ggc))
4586		return;
4587
4588	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4589		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4590		disable_igfx_iommu = 1;
4591	} else if (!disable_igfx_iommu) {
4592		/* we have to ensure the gfx device is idle before we flush */
4593		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4594		iommu_set_dma_strict();
4595	}
4596}
4597DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4598DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4599DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4600DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4601
4602static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4603{
4604	unsigned short ver;
4605
4606	if (!IS_GFX_DEVICE(dev))
4607		return;
4608
4609	ver = (dev->device >> 8) & 0xff;
4610	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4611	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4612	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4613		return;
4614
4615	if (risky_device(dev))
4616		return;
4617
4618	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4619	iommu_skip_te_disable = 1;
4620}
4621DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4622
4623/*
4624 * On Tylersburg chipsets, some BIOSes have been known to enable the ISOCH
4625 * DMAR unit for the Azalia sound device, but not give it any TLB entries,
4626 * which causes it to deadlock. Check for that. We do this in a function
4627 * called from init_dmars(), instead of in a PCI quirk, because we don't want
4628 * to print the obnoxious "BIOS broken" message if VT-d is actually disabled.
4629 */
4630static void __init check_tylersburg_isoch(void)
4631{
4632	struct pci_dev *pdev;
4633	uint32_t vtisochctrl;
4634
4635	/* If there's no Azalia in the system anyway, forget it. */
4636	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4637	if (!pdev)
4638		return;
4639
4640	if (risky_device(pdev)) {
4641		pci_dev_put(pdev);
4642		return;
4643	}
4644
4645	pci_dev_put(pdev);
4646
4647	/* System Management Registers. Might be hidden, in which case
4648	   we can't do the sanity check. But that's OK, because the
4649	   known-broken BIOSes _don't_ actually hide it, so far. */
4650	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4651	if (!pdev)
4652		return;
4653
4654	if (risky_device(pdev)) {
4655		pci_dev_put(pdev);
4656		return;
4657	}
4658
4659	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4660		pci_dev_put(pdev);
4661		return;
4662	}
4663
4664	pci_dev_put(pdev);
4665
4666	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4667	if (vtisochctrl & 1)
4668		return;
4669
4670	/* Drop all bits other than the number of TLB entries */
4671	vtisochctrl &= 0x1c;
4672
4673	/* If we have the recommended number of TLB entries (16), fine. */
4674	if (vtisochctrl == 0x10)
4675		return;
4676
4677	/* Zero TLB entries? You get to ride the short bus to school. */
4678	if (!vtisochctrl) {
4679		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4680		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4681		     dmi_get_system_info(DMI_BIOS_VENDOR),
4682		     dmi_get_system_info(DMI_BIOS_VERSION),
4683		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4684		iommu_identity_mapping |= IDENTMAP_AZALIA;
4685		return;
4686	}
4687
4688	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4689	       vtisochctrl);
4690}
4691
4692/*
4693 * Here we deal with a device TLB defect where the device may inadvertently
4694 * issue an ATS invalidation completion before posted writes that were
4695 * initiated with a translated address and used translations matching the
4696 * invalidation address range, violating the invalidation completion ordering.
4697 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
4698 * is vulnerable to this defect. In other words, any dTLB invalidation that is
4699 * not initiated under the control of the trusted/privileged host device
4700 * driver must use this quirk.
4701 * Device TLBs are invalidated under the following six conditions:
4702 * 1. Device driver does a DMA API unmap of an IOVA
4703 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4704 * 3. PASID is torn down, after the PASID cache is flushed, e.g. on process
4705 *    exit_mmap() due to a crash
4706 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() when the
4707 *    VM has to free pages that were unmapped
4708 * 5. Userspace driver unmaps a DMA buffer
4709 * 6. Cache invalidation in vSVA usage (upcoming)
4710 *
4711 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4712 * before unmap/unbind. For #3, the IOMMU driver is called via mmu_notifier
4713 * to invalidate the TLB the same way as a normal user unmap, which uses this
4714 * quirk. The dTLB invalidation after a PASID cache flush does not need it.
4715 *
4716 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4717 */
4718void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4719			       unsigned long address, unsigned long mask,
4720			       u32 pasid, u16 qdep)
4721{
4722	u16 sid;
4723
4724	if (likely(!info->dtlb_extra_inval))
4725		return;
4726
4727	sid = PCI_DEVID(info->bus, info->devfn);
4728	if (pasid == IOMMU_NO_PASID) {
4729		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4730				   qdep, address, mask);
4731	} else {
4732		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4733					 pasid, qdep, address, mask);
4734	}
4735}
4736
4737#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4738
4739/*
4740 * Function to submit a command to the enhanced command interface. The
4741 * valid enhanced command descriptions are defined in Table 47 of the
4742 * VT-d spec. The VT-d hardware implementation may support some but not
4743 * all commands, which can be determined by checking the Enhanced
4744 * Command Capability Register.
4745 *
4746 * Return values:
4747 *  - 0: Command successful without any error;
4748 *  - Negative: software error value;
4749 *  - Nonzero positive: failure status code defined in Table 48.
4750 */
4751int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4752{
4753	unsigned long flags;
4754	u64 res;
4755	int ret;
4756
4757	if (!cap_ecmds(iommu->cap))
4758		return -ENODEV;
4759
4760	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4761
4762	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4763	if (res & DMA_ECMD_ECRSP_IP) {
4764		ret = -EBUSY;
4765		goto err;
4766	}
4767
4768	/*
4769	 * Unconditionally write operand B, because:
4770	 * - There is no side effect if an ecmd doesn't require an
4771	 *   operand B, but we set the register to some value anyway.
4772	 * - This path is not performance critical, so the extra MMIO
4773	 *   write is not a concern.
4774	 */
4775	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4776	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4777
4778	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4779		      !(res & DMA_ECMD_ECRSP_IP), res);
4780
4781	if (res & DMA_ECMD_ECRSP_IP) {
4782		ret = -ETIMEDOUT;
4783		goto err;
4784	}
4785
4786	ret = ecmd_get_status_code(res);
4787err:
4788	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4789
4790	return ret;
4791}
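/*
 * A minimal caller sketch (illustrative only, not compiled) showing how the
 * three classes of return value documented above would typically be handled.
 * ECMD_EXAMPLE_OP is a made-up placeholder; real opcodes come from Table 47
 * of the VT-d spec and must be checked against the Enhanced Command
 * Capability Register.
 */
#if 0
static void example_submit_ecmd(struct intel_iommu *iommu)
{
	int ret = ecmd_submit_sync(iommu, ECMD_EXAMPLE_OP, 0, 0);

	if (ret < 0)
		pr_err("ecmd not issued, software error %d\n", ret);
	else if (ret > 0)
		pr_err("ecmd failed, hardware status code %d\n", ret);
	else
		pr_debug("ecmd completed successfully\n");
}
#endif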