   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>,
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/crash_dump.h>
  17#include <linux/dma-direct.h>
  18#include <linux/dmi.h>
  19#include <linux/memory.h>
  20#include <linux/pci.h>
  21#include <linux/pci-ats.h>
  22#include <linux/spinlock.h>
  23#include <linux/syscore_ops.h>
  24#include <linux/tboot.h>
  25#include <uapi/linux/iommufd.h>
  26
  27#include "iommu.h"
  28#include "../dma-iommu.h"
  29#include "../irq_remapping.h"
  30#include "../iommu-sva.h"
  31#include "pasid.h"
  32#include "cap_audit.h"
  33#include "perfmon.h"
  34
  35#define ROOT_SIZE		VTD_PAGE_SIZE
  36#define CONTEXT_SIZE		VTD_PAGE_SIZE
  37
  38#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  39#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  40#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  41#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  42
  43#define IOAPIC_RANGE_START	(0xfee00000)
  44#define IOAPIC_RANGE_END	(0xfeefffff)
  45#define IOVA_START_ADDR		(0x1000)
  46
  47#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  48
  49#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
  50#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
  51
  52/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  53   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  54#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
  55				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  56#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
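     /*
      * For illustration (assuming 4KiB VT-d pages, i.e. VTD_PAGE_SHIFT == 12):
      * with the default 57-bit address width, __DOMAIN_MAX_PFN(57) is
      * (1ULL << 45) - 1 and __DOMAIN_MAX_ADDR(57) is (1ULL << 57) - 1, so on
      * 64-bit kernels the (unsigned long)-1 clamp in DOMAIN_MAX_PFN(57) has
      * no effect.
      */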
  57
  58/* IO virtual address start page frame number */
  59#define IOVA_START_PFN		(1)
  60
  61#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
  62
  63static void __init check_tylersburg_isoch(void);
  64static int rwbf_quirk;
  65
  66/*
   67 * set to 1 to panic the kernel if VT-d can't be successfully enabled
  68 * (used when kernel is launched w/ TXT)
  69 */
  70static int force_on = 0;
  71static int intel_iommu_tboot_noforce;
  72static int no_platform_optin;
  73
  74#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
  75
  76/*
  77 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
  78 * if marked present.
  79 */
  80static phys_addr_t root_entry_lctp(struct root_entry *re)
  81{
  82	if (!(re->lo & 1))
  83		return 0;
  84
  85	return re->lo & VTD_PAGE_MASK;
  86}
  87
  88/*
  89 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
  90 * if marked present.
  91 */
  92static phys_addr_t root_entry_uctp(struct root_entry *re)
  93{
  94	if (!(re->hi & 1))
  95		return 0;
  96
  97	return re->hi & VTD_PAGE_MASK;
  98}
  99
 100/*
 101 * This domain is a statically identity mapping domain.
  102 *	1. This domain creates a static 1:1 mapping to all usable memory.
  103 *	2. It maps to each iommu if successful.
  104 *	3. Each iommu maps to this domain if successful.
 105 */
 106static struct dmar_domain *si_domain;
 107static int hw_pass_through = 1;
 108
 109struct dmar_rmrr_unit {
 110	struct list_head list;		/* list of rmrr units	*/
 111	struct acpi_dmar_header *hdr;	/* ACPI header		*/
 112	u64	base_address;		/* reserved base address*/
 113	u64	end_address;		/* reserved end address */
 114	struct dmar_dev_scope *devices;	/* target devices */
 115	int	devices_cnt;		/* target device count */
 116};
 117
 118struct dmar_atsr_unit {
 119	struct list_head list;		/* list of ATSR units */
 120	struct acpi_dmar_header *hdr;	/* ACPI header */
 121	struct dmar_dev_scope *devices;	/* target devices */
 122	int devices_cnt;		/* target device count */
 123	u8 include_all:1;		/* include all ports */
 124};
 125
 126struct dmar_satc_unit {
 127	struct list_head list;		/* list of SATC units */
 128	struct acpi_dmar_header *hdr;	/* ACPI header */
 129	struct dmar_dev_scope *devices;	/* target devices */
 130	struct intel_iommu *iommu;	/* the corresponding iommu */
 131	int devices_cnt;		/* target device count */
 132	u8 atc_required:1;		/* ATS is required */
 133};
 134
 135static LIST_HEAD(dmar_atsr_units);
 136static LIST_HEAD(dmar_rmrr_units);
 137static LIST_HEAD(dmar_satc_units);
 138
 139#define for_each_rmrr_units(rmrr) \
 140	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 141
 142static void intel_iommu_domain_free(struct iommu_domain *domain);
 143
 144int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
 145int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
 146
 147int intel_iommu_enabled = 0;
 148EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 149
 150static int dmar_map_gfx = 1;
 151static int intel_iommu_superpage = 1;
 152static int iommu_identity_mapping;
 153static int iommu_skip_te_disable;
 154
 155#define IDENTMAP_GFX		2
 156#define IDENTMAP_AZALIA		4
 157
 158const struct iommu_ops intel_iommu_ops;
 159static const struct iommu_dirty_ops intel_dirty_ops;
 160
 161static bool translation_pre_enabled(struct intel_iommu *iommu)
 162{
 163	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 164}
 165
 166static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 167{
 168	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 169}
 170
 171static void init_translation_status(struct intel_iommu *iommu)
 172{
 173	u32 gsts;
 174
 175	gsts = readl(iommu->reg + DMAR_GSTS_REG);
 176	if (gsts & DMA_GSTS_TES)
 177		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 178}
 179
 180static int __init intel_iommu_setup(char *str)
 181{
 182	if (!str)
 183		return -EINVAL;
 184
 185	while (*str) {
 186		if (!strncmp(str, "on", 2)) {
 187			dmar_disabled = 0;
 188			pr_info("IOMMU enabled\n");
 189		} else if (!strncmp(str, "off", 3)) {
 190			dmar_disabled = 1;
 191			no_platform_optin = 1;
 192			pr_info("IOMMU disabled\n");
 193		} else if (!strncmp(str, "igfx_off", 8)) {
 194			dmar_map_gfx = 0;
 195			pr_info("Disable GFX device mapping\n");
 196		} else if (!strncmp(str, "forcedac", 8)) {
 197			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
 198			iommu_dma_forcedac = true;
 199		} else if (!strncmp(str, "strict", 6)) {
 200			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
 201			iommu_set_dma_strict();
 202		} else if (!strncmp(str, "sp_off", 6)) {
 203			pr_info("Disable supported super page\n");
 204			intel_iommu_superpage = 0;
 205		} else if (!strncmp(str, "sm_on", 5)) {
 206			pr_info("Enable scalable mode if hardware supports\n");
 207			intel_iommu_sm = 1;
 208		} else if (!strncmp(str, "sm_off", 6)) {
 209			pr_info("Scalable mode is disallowed\n");
 210			intel_iommu_sm = 0;
 211		} else if (!strncmp(str, "tboot_noforce", 13)) {
 212			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 213			intel_iommu_tboot_noforce = 1;
 214		} else {
 215			pr_notice("Unknown option - '%s'\n", str);
 216		}
 217
 218		str += strcspn(str, ",");
 219		while (*str == ',')
 220			str++;
 221	}
 222
 223	return 1;
 224}
 225__setup("intel_iommu=", intel_iommu_setup);
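     /*
      * For illustration: booting with "intel_iommu=on,sm_on" makes the loop
      * above take two passes, first clearing dmar_disabled and then setting
      * intel_iommu_sm; unrecognised tokens only trigger the pr_notice() and
      * are otherwise ignored.
      */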
 226
 227void *alloc_pgtable_page(int node, gfp_t gfp)
 228{
 229	struct page *page;
 230	void *vaddr = NULL;
 231
 232	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
 233	if (page)
 234		vaddr = page_address(page);
 235	return vaddr;
 236}
 237
 238void free_pgtable_page(void *vaddr)
 239{
 240	free_page((unsigned long)vaddr);
 241}
 242
 243static int domain_type_is_si(struct dmar_domain *domain)
 244{
 245	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
 246}
 247
 248static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
 249{
 250	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 251
 252	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 253}
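     /*
      * For illustration (assuming agaw_to_width() returns 30 + 9 * agaw, as
      * in the rest of the VT-d code): a domain using 4-level page tables
      * (48-bit width) has addr_width == 36 here, so any pfn at or above
      * 1UL << 36 is reported as unsupported.
      */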
 254
 255/*
 256 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 257 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 258 * the returned SAGAW.
 259 */
 260static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
 261{
 262	unsigned long fl_sagaw, sl_sagaw;
 263
 264	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
 265	sl_sagaw = cap_sagaw(iommu->cap);
 266
 267	/* Second level only. */
 268	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
 269		return sl_sagaw;
 270
 271	/* First level only. */
 272	if (!ecap_slts(iommu->ecap))
 273		return fl_sagaw;
 274
 275	return fl_sagaw & sl_sagaw;
 276}
 277
 278static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 279{
 280	unsigned long sagaw;
 281	int agaw;
 282
 283	sagaw = __iommu_calculate_sagaw(iommu);
 284	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
 285		if (test_bit(agaw, &sagaw))
 286			break;
 287	}
 288
 289	return agaw;
 290}
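     /*
      * For illustration (assuming the SAGAW encoding referenced above, where
      * bit 2 means 4-level/48-bit and bit 3 means 5-level/57-bit, and that
      * width_to_agaw() maps widths 48 and 57 to AGAW values 2 and 3): an
      * IOMMU reporting both levels yields sagaw == 0xc, so
      * __iommu_calculate_agaw(iommu, 57) returns 3 and
      * __iommu_calculate_agaw(iommu, 48) returns 2.
      */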
 291
 292/*
 293 * Calculate max SAGAW for each iommu.
 294 */
 295int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 296{
 297	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 298}
 299
 300/*
  301 * Calculate agaw for each iommu.
  302 * "SAGAW" may differ across iommus, so use a default agaw and fall back
  303 * to a smaller supported agaw for iommus that don't support the default.
 304 */
 305int iommu_calculate_agaw(struct intel_iommu *iommu)
 306{
 307	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 308}
 309
 310static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 311{
 312	return sm_supported(iommu) ?
 313			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 314}
 315
 316static void domain_update_iommu_coherency(struct dmar_domain *domain)
 317{
 318	struct iommu_domain_info *info;
 319	struct dmar_drhd_unit *drhd;
 320	struct intel_iommu *iommu;
 321	bool found = false;
 322	unsigned long i;
 323
 324	domain->iommu_coherency = true;
 325	xa_for_each(&domain->iommu_array, i, info) {
 326		found = true;
 327		if (!iommu_paging_structure_coherency(info->iommu)) {
 328			domain->iommu_coherency = false;
 329			break;
 330		}
 331	}
 332	if (found)
 333		return;
 334
 335	/* No hardware attached; use lowest common denominator */
 336	rcu_read_lock();
 337	for_each_active_iommu(iommu, drhd) {
 338		if (!iommu_paging_structure_coherency(iommu)) {
 339			domain->iommu_coherency = false;
 340			break;
 341		}
 342	}
 343	rcu_read_unlock();
 344}
 345
 346static int domain_update_iommu_superpage(struct dmar_domain *domain,
 347					 struct intel_iommu *skip)
 348{
 349	struct dmar_drhd_unit *drhd;
 350	struct intel_iommu *iommu;
 351	int mask = 0x3;
 352
 353	if (!intel_iommu_superpage)
 354		return 0;
 355
 356	/* set iommu_superpage to the smallest common denominator */
 357	rcu_read_lock();
 358	for_each_active_iommu(iommu, drhd) {
 359		if (iommu != skip) {
 360			if (domain && domain->use_first_level) {
 361				if (!cap_fl1gp_support(iommu->cap))
 362					mask = 0x1;
 363			} else {
 364				mask &= cap_super_page_val(iommu->cap);
 365			}
 366
 367			if (!mask)
 368				break;
 369		}
 370	}
 371	rcu_read_unlock();
 372
 373	return fls(mask);
 374}
 375
 376static int domain_update_device_node(struct dmar_domain *domain)
 377{
 378	struct device_domain_info *info;
 379	int nid = NUMA_NO_NODE;
 380	unsigned long flags;
 381
 382	spin_lock_irqsave(&domain->lock, flags);
 383	list_for_each_entry(info, &domain->devices, link) {
 384		/*
  385		 * There could be multiple device NUMA nodes, as devices within
  386		 * the same domain may sit behind different IOMMUs. There is no
  387		 * perfect answer in such a situation, so we use a first-come,
  388		 * first-served policy.
 389		 */
 390		nid = dev_to_node(info->dev);
 391		if (nid != NUMA_NO_NODE)
 392			break;
 393	}
 394	spin_unlock_irqrestore(&domain->lock, flags);
 395
 396	return nid;
 397}
 398
 399/* Return the super pagesize bitmap if supported. */
 400static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
 401{
 402	unsigned long bitmap = 0;
 403
 404	/*
 405	 * 1-level super page supports page size of 2MiB, 2-level super page
 406	 * supports page size of both 2MiB and 1GiB.
 407	 */
 408	if (domain->iommu_superpage == 1)
 409		bitmap |= SZ_2M;
 410	else if (domain->iommu_superpage == 2)
 411		bitmap |= SZ_2M | SZ_1G;
 412
 413	return bitmap;
 414}
 415
 416/* Some capabilities may be different across iommus */
 417void domain_update_iommu_cap(struct dmar_domain *domain)
 418{
 419	domain_update_iommu_coherency(domain);
 420	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 421
 422	/*
  423	 * If RHSA is missing, default to the device NUMA node
  424	 * as a fallback.
 425	 */
 426	if (domain->nid == NUMA_NO_NODE)
 427		domain->nid = domain_update_device_node(domain);
 428
 429	/*
 430	 * First-level translation restricts the input-address to a
 431	 * canonical address (i.e., address bits 63:N have the same
 432	 * value as address bit [N-1], where N is 48-bits with 4-level
 433	 * paging and 57-bits with 5-level paging). Hence, skip bit
 434	 * [N-1].
 435	 */
 436	if (domain->use_first_level)
 437		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
 438	else
 439		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
 440
 441	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
 442	domain_update_iotlb(domain);
 443}
 444
 445struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 446					 u8 devfn, int alloc)
 447{
 448	struct root_entry *root = &iommu->root_entry[bus];
 449	struct context_entry *context;
 450	u64 *entry;
 451
 452	/*
  453	 * Unless the caller requested to allocate a new entry,
  454	 * returning a copied context entry makes no sense.
 455	 */
 456	if (!alloc && context_copied(iommu, bus, devfn))
 457		return NULL;
 458
 459	entry = &root->lo;
 460	if (sm_supported(iommu)) {
 461		if (devfn >= 0x80) {
 462			devfn -= 0x80;
 463			entry = &root->hi;
 464		}
 465		devfn *= 2;
 466	}
 467	if (*entry & 1)
 468		context = phys_to_virt(*entry & VTD_PAGE_MASK);
 469	else {
 470		unsigned long phy_addr;
 471		if (!alloc)
 472			return NULL;
 473
 474		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
 475		if (!context)
 476			return NULL;
 477
 478		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 479		phy_addr = virt_to_phys((void *)context);
 480		*entry = phy_addr | 1;
 481		__iommu_flush_cache(iommu, entry, sizeof(*entry));
 482	}
 483	return &context[devfn];
 484}
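     /*
      * Note on the layout above (a sketch of the scalable-mode case): each
      * root entry covers 256 device functions, split between its lo half
      * (devfn 0x00-0x7f) and hi half (devfn 0x80-0xff), and a scalable-mode
      * context entry is twice the size of a legacy one, which is why devfn
      * is doubled before indexing the table.
      */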
 485
 486/**
 487 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 488 *				 sub-hierarchy of a candidate PCI-PCI bridge
 489 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 490 * @bridge: the candidate PCI-PCI bridge
 491 *
 492 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 493 */
 494static bool
 495is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 496{
 497	struct pci_dev *pdev, *pbridge;
 498
 499	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 500		return false;
 501
 502	pdev = to_pci_dev(dev);
 503	pbridge = to_pci_dev(bridge);
 504
 505	if (pbridge->subordinate &&
 506	    pbridge->subordinate->number <= pdev->bus->number &&
 507	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
 508		return true;
 509
 510	return false;
 511}
 512
 513static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 514{
 515	struct dmar_drhd_unit *drhd;
 516	u32 vtbar;
 517	int rc;
 518
 519	/* We know that this device on this chipset has its own IOMMU.
 520	 * If we find it under a different IOMMU, then the BIOS is lying
 521	 * to us. Hope that the IOMMU for this device is actually
 522	 * disabled, and it needs no translation...
 523	 */
 524	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 525	if (rc) {
 526		/* "can't" happen */
 527		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 528		return false;
 529	}
 530	vtbar &= 0xffff0000;
 531
  532	/* we know that this iommu should be at offset 0xa000 from vtbar */
 533	drhd = dmar_find_matched_drhd_unit(pdev);
 534	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 535		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 536		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 537		return true;
 538	}
 539
 540	return false;
 541}
 542
 543static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 544{
 545	if (!iommu || iommu->drhd->ignored)
 546		return true;
 547
 548	if (dev_is_pci(dev)) {
 549		struct pci_dev *pdev = to_pci_dev(dev);
 550
 551		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 552		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 553		    quirk_ioat_snb_local_iommu(pdev))
 554			return true;
 555	}
 556
 557	return false;
 558}
 559
 560static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
 561{
 562	struct dmar_drhd_unit *drhd = NULL;
 563	struct pci_dev *pdev = NULL;
 564	struct intel_iommu *iommu;
 565	struct device *tmp;
 566	u16 segment = 0;
 567	int i;
 568
 569	if (!dev)
 570		return NULL;
 571
 572	if (dev_is_pci(dev)) {
 573		struct pci_dev *pf_pdev;
 574
 575		pdev = pci_real_dma_dev(to_pci_dev(dev));
 576
 577		/* VFs aren't listed in scope tables; we need to look up
 578		 * the PF instead to find the IOMMU. */
 579		pf_pdev = pci_physfn(pdev);
 580		dev = &pf_pdev->dev;
 581		segment = pci_domain_nr(pdev->bus);
 582	} else if (has_acpi_companion(dev))
 583		dev = &ACPI_COMPANION(dev)->dev;
 584
 585	rcu_read_lock();
 586	for_each_iommu(iommu, drhd) {
 587		if (pdev && segment != drhd->segment)
 588			continue;
 589
 590		for_each_active_dev_scope(drhd->devices,
 591					  drhd->devices_cnt, i, tmp) {
 592			if (tmp == dev) {
 593				/* For a VF use its original BDF# not that of the PF
 594				 * which we used for the IOMMU lookup. Strictly speaking
 595				 * we could do this for all PCI devices; we only need to
 596				 * get the BDF# from the scope table for ACPI matches. */
 597				if (pdev && pdev->is_virtfn)
 598					goto got_pdev;
 599
 600				if (bus && devfn) {
 601					*bus = drhd->devices[i].bus;
 602					*devfn = drhd->devices[i].devfn;
 603				}
 604				goto out;
 605			}
 606
 607			if (is_downstream_to_pci_bridge(dev, tmp))
 608				goto got_pdev;
 609		}
 610
 611		if (pdev && drhd->include_all) {
 612got_pdev:
 613			if (bus && devfn) {
 614				*bus = pdev->bus->number;
 615				*devfn = pdev->devfn;
 616			}
 617			goto out;
 618		}
 619	}
 620	iommu = NULL;
 621out:
 622	if (iommu_is_dummy(iommu, dev))
 623		iommu = NULL;
 624
 625	rcu_read_unlock();
 626
 627	return iommu;
 628}
 629
 630static void domain_flush_cache(struct dmar_domain *domain,
 631			       void *addr, int size)
 632{
 633	if (!domain->iommu_coherency)
 634		clflush_cache_range(addr, size);
 635}
 636
 637static void free_context_table(struct intel_iommu *iommu)
 638{
 639	struct context_entry *context;
 640	int i;
 641
 642	if (!iommu->root_entry)
 643		return;
 644
 645	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 646		context = iommu_context_addr(iommu, i, 0, 0);
 647		if (context)
 648			free_pgtable_page(context);
 649
 650		if (!sm_supported(iommu))
 651			continue;
 652
 653		context = iommu_context_addr(iommu, i, 0x80, 0);
 654		if (context)
 655			free_pgtable_page(context);
 656	}
 657
 658	free_pgtable_page(iommu->root_entry);
 659	iommu->root_entry = NULL;
 660}
 661
 662#ifdef CONFIG_DMAR_DEBUG
 663static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
 664			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
 665{
 666	struct dma_pte *pte;
 667	int offset;
 668
 669	while (1) {
 670		offset = pfn_level_offset(pfn, level);
 671		pte = &parent[offset];
 672		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
 673			pr_info("PTE not present at level %d\n", level);
 674			break;
 675		}
 676
 677		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
 678
 679		if (level == 1)
 680			break;
 681
 682		parent = phys_to_virt(dma_pte_addr(pte));
 683		level--;
 684	}
 685}
 686
 687void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
 688			  unsigned long long addr, u32 pasid)
 689{
 690	struct pasid_dir_entry *dir, *pde;
 691	struct pasid_entry *entries, *pte;
 692	struct context_entry *ctx_entry;
 693	struct root_entry *rt_entry;
 694	int i, dir_index, index, level;
 695	u8 devfn = source_id & 0xff;
 696	u8 bus = source_id >> 8;
 697	struct dma_pte *pgtable;
 698
 699	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
 700
 701	/* root entry dump */
 702	rt_entry = &iommu->root_entry[bus];
 703	if (!rt_entry) {
 704		pr_info("root table entry is not present\n");
 705		return;
 706	}
 707
 708	if (sm_supported(iommu))
 709		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
 710			rt_entry->hi, rt_entry->lo);
 711	else
 712		pr_info("root entry: 0x%016llx", rt_entry->lo);
 713
 714	/* context entry dump */
 715	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
 716	if (!ctx_entry) {
 717		pr_info("context table entry is not present\n");
 718		return;
 719	}
 720
 721	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
 722		ctx_entry->hi, ctx_entry->lo);
 723
 724	/* legacy mode does not require PASID entries */
 725	if (!sm_supported(iommu)) {
 726		level = agaw_to_level(ctx_entry->hi & 7);
 727		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 728		goto pgtable_walk;
 729	}
 730
 731	/* get the pointer to pasid directory entry */
 732	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 733	if (!dir) {
 734		pr_info("pasid directory entry is not present\n");
 735		return;
 736	}
 737	/* For request-without-pasid, get the pasid from context entry */
 738	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
 739		pasid = IOMMU_NO_PASID;
 740
 741	dir_index = pasid >> PASID_PDE_SHIFT;
 742	pde = &dir[dir_index];
 743	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
 744
 745	/* get the pointer to the pasid table entry */
 746	entries = get_pasid_table_from_pde(pde);
 747	if (!entries) {
 748		pr_info("pasid table entry is not present\n");
 749		return;
 750	}
 751	index = pasid & PASID_PTE_MASK;
 752	pte = &entries[index];
 753	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
 754		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
 755
 756	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
 757		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
 758		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
 759	} else {
 760		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
 761		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
 762	}
 763
 764pgtable_walk:
 765	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
 766}
 767#endif
 768
 769static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 770				      unsigned long pfn, int *target_level,
 771				      gfp_t gfp)
 772{
 773	struct dma_pte *parent, *pte;
 774	int level = agaw_to_level(domain->agaw);
 775	int offset;
 776
 777	if (!domain_pfn_supported(domain, pfn))
 778		/* Address beyond IOMMU's addressing capabilities. */
 779		return NULL;
 780
 781	parent = domain->pgd;
 782
 783	while (1) {
 784		void *tmp_page;
 785
 786		offset = pfn_level_offset(pfn, level);
 787		pte = &parent[offset];
 788		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 789			break;
 790		if (level == *target_level)
 791			break;
 792
 793		if (!dma_pte_present(pte)) {
 794			uint64_t pteval;
 795
 796			tmp_page = alloc_pgtable_page(domain->nid, gfp);
 797
 798			if (!tmp_page)
 799				return NULL;
 800
 801			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 802			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 803			if (domain->use_first_level)
 804				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
 805
 806			if (cmpxchg64(&pte->val, 0ULL, pteval))
 807				/* Someone else set it while we were thinking; use theirs. */
 808				free_pgtable_page(tmp_page);
 809			else
 810				domain_flush_cache(domain, pte, sizeof(*pte));
 811		}
 812		if (level == 1)
 813			break;
 814
 815		parent = phys_to_virt(dma_pte_addr(pte));
 816		level--;
 817	}
 818
 819	if (!*target_level)
 820		*target_level = level;
 821
 822	return pte;
 823}
 824
  825/* return the address's pte at a specific level */
 826static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 827					 unsigned long pfn,
 828					 int level, int *large_page)
 829{
 830	struct dma_pte *parent, *pte;
 831	int total = agaw_to_level(domain->agaw);
 832	int offset;
 833
 834	parent = domain->pgd;
 835	while (level <= total) {
 836		offset = pfn_level_offset(pfn, total);
 837		pte = &parent[offset];
 838		if (level == total)
 839			return pte;
 840
 841		if (!dma_pte_present(pte)) {
 842			*large_page = total;
 843			break;
 844		}
 845
 846		if (dma_pte_superpage(pte)) {
 847			*large_page = total;
 848			return pte;
 849		}
 850
 851		parent = phys_to_virt(dma_pte_addr(pte));
 852		total--;
 853	}
 854	return NULL;
 855}
 856
  857/* clear last level pte, a tlb flush should follow */
 858static void dma_pte_clear_range(struct dmar_domain *domain,
 859				unsigned long start_pfn,
 860				unsigned long last_pfn)
 861{
 862	unsigned int large_page;
 863	struct dma_pte *first_pte, *pte;
 864
 865	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
 866	    WARN_ON(start_pfn > last_pfn))
 867		return;
 868
 869	/* we don't need lock here; nobody else touches the iova range */
 870	do {
 871		large_page = 1;
 872		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 873		if (!pte) {
 874			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 875			continue;
 876		}
 877		do {
 878			dma_clear_pte(pte);
 879			start_pfn += lvl_to_nr_pages(large_page);
 880			pte++;
 881		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 882
 883		domain_flush_cache(domain, first_pte,
 884				   (void *)pte - (void *)first_pte);
 885
 886	} while (start_pfn && start_pfn <= last_pfn);
 887}
 888
 889static void dma_pte_free_level(struct dmar_domain *domain, int level,
 890			       int retain_level, struct dma_pte *pte,
 891			       unsigned long pfn, unsigned long start_pfn,
 892			       unsigned long last_pfn)
 893{
 894	pfn = max(start_pfn, pfn);
 895	pte = &pte[pfn_level_offset(pfn, level)];
 896
 897	do {
 898		unsigned long level_pfn;
 899		struct dma_pte *level_pte;
 900
 901		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
 902			goto next;
 903
 904		level_pfn = pfn & level_mask(level);
 905		level_pte = phys_to_virt(dma_pte_addr(pte));
 906
 907		if (level > 2) {
 908			dma_pte_free_level(domain, level - 1, retain_level,
 909					   level_pte, level_pfn, start_pfn,
 910					   last_pfn);
 911		}
 912
 913		/*
 914		 * Free the page table if we're below the level we want to
 915		 * retain and the range covers the entire table.
 916		 */
 917		if (level < retain_level && !(start_pfn > level_pfn ||
 918		      last_pfn < level_pfn + level_size(level) - 1)) {
 919			dma_clear_pte(pte);
 920			domain_flush_cache(domain, pte, sizeof(*pte));
 921			free_pgtable_page(level_pte);
 922		}
 923next:
 924		pfn += level_size(level);
 925	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
 926}
 927
 928/*
 929 * clear last level (leaf) ptes and free page table pages below the
 930 * level we wish to keep intact.
 931 */
 932static void dma_pte_free_pagetable(struct dmar_domain *domain,
 933				   unsigned long start_pfn,
 934				   unsigned long last_pfn,
 935				   int retain_level)
 936{
 937	dma_pte_clear_range(domain, start_pfn, last_pfn);
 938
 939	/* We don't need lock here; nobody else touches the iova range */
 940	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
 941			   domain->pgd, 0, start_pfn, last_pfn);
 942
 943	/* free pgd */
 944	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 945		free_pgtable_page(domain->pgd);
 946		domain->pgd = NULL;
 947	}
 948}
 949
 950/* When a page at a given level is being unlinked from its parent, we don't
 951   need to *modify* it at all. All we need to do is make a list of all the
 952   pages which can be freed just as soon as we've flushed the IOTLB and we
 953   know the hardware page-walk will no longer touch them.
 954   The 'pte' argument is the *parent* PTE, pointing to the page that is to
 955   be freed. */
 956static void dma_pte_list_pagetables(struct dmar_domain *domain,
 957				    int level, struct dma_pte *pte,
 958				    struct list_head *freelist)
 959{
 960	struct page *pg;
 961
 962	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
 963	list_add_tail(&pg->lru, freelist);
 964
 965	if (level == 1)
 966		return;
 967
 968	pte = page_address(pg);
 969	do {
 970		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
 971			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
 972		pte++;
 973	} while (!first_pte_in_page(pte));
 974}
 975
 976static void dma_pte_clear_level(struct dmar_domain *domain, int level,
 977				struct dma_pte *pte, unsigned long pfn,
 978				unsigned long start_pfn, unsigned long last_pfn,
 979				struct list_head *freelist)
 980{
 981	struct dma_pte *first_pte = NULL, *last_pte = NULL;
 982
 983	pfn = max(start_pfn, pfn);
 984	pte = &pte[pfn_level_offset(pfn, level)];
 985
 986	do {
 987		unsigned long level_pfn = pfn & level_mask(level);
 988
 989		if (!dma_pte_present(pte))
 990			goto next;
 991
 992		/* If range covers entire pagetable, free it */
 993		if (start_pfn <= level_pfn &&
 994		    last_pfn >= level_pfn + level_size(level) - 1) {
  995			/* These subordinate page tables are going away entirely. Don't
 996			   bother to clear them; we're just going to *free* them. */
 997			if (level > 1 && !dma_pte_superpage(pte))
 998				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
 999
1000			dma_clear_pte(pte);
1001			if (!first_pte)
1002				first_pte = pte;
1003			last_pte = pte;
1004		} else if (level > 1) {
1005			/* Recurse down into a level that isn't *entirely* obsolete */
1006			dma_pte_clear_level(domain, level - 1,
1007					    phys_to_virt(dma_pte_addr(pte)),
1008					    level_pfn, start_pfn, last_pfn,
1009					    freelist);
1010		}
1011next:
1012		pfn = level_pfn + level_size(level);
1013	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1014
1015	if (first_pte)
1016		domain_flush_cache(domain, first_pte,
1017				   (void *)++last_pte - (void *)first_pte);
1018}
1019
1020/* We can't just free the pages because the IOMMU may still be walking
1021   the page tables, and may have cached the intermediate levels. The
1022   pages can only be freed after the IOTLB flush has been done. */
1023static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1024			 unsigned long last_pfn, struct list_head *freelist)
1025{
1026	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1027	    WARN_ON(start_pfn > last_pfn))
1028		return;
1029
1030	/* we don't need lock here; nobody else touches the iova range */
1031	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1032			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1033
1034	/* free pgd */
1035	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1036		struct page *pgd_page = virt_to_page(domain->pgd);
1037		list_add_tail(&pgd_page->lru, freelist);
1038		domain->pgd = NULL;
1039	}
1040}
1041
1042/* iommu handling */
1043static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1044{
1045	struct root_entry *root;
1046
1047	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1048	if (!root) {
1049		pr_err("Allocating root entry for %s failed\n",
1050			iommu->name);
1051		return -ENOMEM;
1052	}
1053
1054	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1055	iommu->root_entry = root;
1056
1057	return 0;
1058}
1059
1060static void iommu_set_root_entry(struct intel_iommu *iommu)
1061{
1062	u64 addr;
1063	u32 sts;
1064	unsigned long flag;
1065
1066	addr = virt_to_phys(iommu->root_entry);
1067	if (sm_supported(iommu))
1068		addr |= DMA_RTADDR_SMT;
1069
1070	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1071	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1072
1073	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1074
 1075	/* Make sure hardware completes it */
1076	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1077		      readl, (sts & DMA_GSTS_RTPS), sts);
1078
1079	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1080
1081	/*
1082	 * Hardware invalidates all DMA remapping hardware translation
1083	 * caches as part of SRTP flow.
1084	 */
1085	if (cap_esrtps(iommu->cap))
1086		return;
1087
1088	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1089	if (sm_supported(iommu))
1090		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1091	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1092}
1093
1094void iommu_flush_write_buffer(struct intel_iommu *iommu)
1095{
1096	u32 val;
1097	unsigned long flag;
1098
1099	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1100		return;
1101
1102	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1103	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1104
 1105	/* Make sure hardware completes it */
1106	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1107		      readl, (!(val & DMA_GSTS_WBFS)), val);
1108
1109	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1110}
1111
 1112/* return value determines if we need a write buffer flush */
1113static void __iommu_flush_context(struct intel_iommu *iommu,
1114				  u16 did, u16 source_id, u8 function_mask,
1115				  u64 type)
1116{
1117	u64 val = 0;
1118	unsigned long flag;
1119
1120	switch (type) {
1121	case DMA_CCMD_GLOBAL_INVL:
1122		val = DMA_CCMD_GLOBAL_INVL;
1123		break;
1124	case DMA_CCMD_DOMAIN_INVL:
1125		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1126		break;
1127	case DMA_CCMD_DEVICE_INVL:
1128		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1129			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1130		break;
1131	default:
1132		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1133			iommu->name, type);
1134		return;
1135	}
1136	val |= DMA_CCMD_ICC;
1137
1138	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1139	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1140
 1141	/* Make sure hardware completes it */
1142	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1143		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1144
1145	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1146}
1147
 1148/* return value determines if we need a write buffer flush */
1149static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1150				u64 addr, unsigned int size_order, u64 type)
1151{
1152	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1153	u64 val = 0, val_iva = 0;
1154	unsigned long flag;
1155
1156	switch (type) {
1157	case DMA_TLB_GLOBAL_FLUSH:
 1158		/* global flush doesn't need to set IVA_REG */
1159		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1160		break;
1161	case DMA_TLB_DSI_FLUSH:
1162		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1163		break;
1164	case DMA_TLB_PSI_FLUSH:
1165		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1166		/* IH bit is passed in as part of address */
1167		val_iva = size_order | addr;
1168		break;
1169	default:
1170		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1171			iommu->name, type);
1172		return;
1173	}
1174
1175	if (cap_write_drain(iommu->cap))
1176		val |= DMA_TLB_WRITE_DRAIN;
1177
1178	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1179	/* Note: Only uses first TLB reg currently */
1180	if (val_iva)
1181		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1182	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1183
 1184	/* Make sure hardware completes it */
1185	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1186		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1187
1188	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1189
1190	/* check IOTLB invalidation granularity */
1191	if (DMA_TLB_IAIG(val) == 0)
1192		pr_err("Flush IOTLB failed\n");
1193	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1194		pr_debug("TLB flush request %Lx, actual %Lx\n",
1195			(unsigned long long)DMA_TLB_IIRG(type),
1196			(unsigned long long)DMA_TLB_IAIG(val));
1197}
1198
1199static struct device_domain_info *
1200domain_lookup_dev_info(struct dmar_domain *domain,
1201		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1202{
1203	struct device_domain_info *info;
1204	unsigned long flags;
1205
1206	spin_lock_irqsave(&domain->lock, flags);
1207	list_for_each_entry(info, &domain->devices, link) {
1208		if (info->iommu == iommu && info->bus == bus &&
1209		    info->devfn == devfn) {
1210			spin_unlock_irqrestore(&domain->lock, flags);
1211			return info;
1212		}
1213	}
1214	spin_unlock_irqrestore(&domain->lock, flags);
1215
1216	return NULL;
1217}
1218
1219void domain_update_iotlb(struct dmar_domain *domain)
1220{
1221	struct dev_pasid_info *dev_pasid;
1222	struct device_domain_info *info;
1223	bool has_iotlb_device = false;
1224	unsigned long flags;
1225
1226	spin_lock_irqsave(&domain->lock, flags);
1227	list_for_each_entry(info, &domain->devices, link) {
1228		if (info->ats_enabled) {
1229			has_iotlb_device = true;
1230			break;
1231		}
1232	}
1233
1234	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1235		info = dev_iommu_priv_get(dev_pasid->dev);
1236		if (info->ats_enabled) {
1237			has_iotlb_device = true;
1238			break;
1239		}
1240	}
1241	domain->has_iotlb_device = has_iotlb_device;
1242	spin_unlock_irqrestore(&domain->lock, flags);
1243}
1244
1245/*
1246 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1247 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1248 * check because it applies only to the built-in QAT devices and it doesn't
1249 * grant additional privileges.
1250 */
1251#define BUGGY_QAT_DEVID_MASK 0x4940
1252static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1253{
1254	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1255		return false;
1256
1257	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1258		return false;
1259
1260	return true;
1261}
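     /*
      * For illustration: the (device & 0xfffc) test above matches exactly
      * the four device IDs 0x4940, 0x4941, 0x4942 and 0x4943 named in the
      * comment, since masking the two low bits maps each of them to
      * BUGGY_QAT_DEVID_MASK.
      */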
1262
1263static void iommu_enable_pci_caps(struct device_domain_info *info)
1264{
1265	struct pci_dev *pdev;
1266
1267	if (!dev_is_pci(info->dev))
1268		return;
1269
1270	pdev = to_pci_dev(info->dev);
1271
1272	/* The PCIe spec, in its wisdom, declares that the behaviour of
1273	   the device if you enable PASID support after ATS support is
1274	   undefined. So always enable PASID support on devices which
1275	   have it, even if we can't yet know if we're ever going to
1276	   use it. */
1277	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1278		info->pasid_enabled = 1;
1279
1280	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1281	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1282		info->ats_enabled = 1;
1283		domain_update_iotlb(info->domain);
1284	}
1285}
1286
1287static void iommu_disable_pci_caps(struct device_domain_info *info)
1288{
1289	struct pci_dev *pdev;
1290
1291	if (!dev_is_pci(info->dev))
1292		return;
1293
1294	pdev = to_pci_dev(info->dev);
1295
1296	if (info->ats_enabled) {
1297		pci_disable_ats(pdev);
1298		info->ats_enabled = 0;
1299		domain_update_iotlb(info->domain);
1300	}
1301
1302	if (info->pasid_enabled) {
1303		pci_disable_pasid(pdev);
1304		info->pasid_enabled = 0;
1305	}
1306}
1307
1308static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1309				    u64 addr, unsigned int mask)
1310{
1311	u16 sid, qdep;
1312
1313	if (!info || !info->ats_enabled)
1314		return;
1315
1316	sid = info->bus << 8 | info->devfn;
1317	qdep = info->ats_qdep;
1318	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1319			   qdep, addr, mask);
1320	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1321}
1322
1323static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1324				  u64 addr, unsigned mask)
1325{
1326	struct dev_pasid_info *dev_pasid;
1327	struct device_domain_info *info;
1328	unsigned long flags;
1329
1330	if (!domain->has_iotlb_device)
1331		return;
1332
1333	spin_lock_irqsave(&domain->lock, flags);
1334	list_for_each_entry(info, &domain->devices, link)
1335		__iommu_flush_dev_iotlb(info, addr, mask);
1336
1337	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1338		info = dev_iommu_priv_get(dev_pasid->dev);
1339
1340		if (!info->ats_enabled)
1341			continue;
1342
1343		qi_flush_dev_iotlb_pasid(info->iommu,
1344					 PCI_DEVID(info->bus, info->devfn),
1345					 info->pfsid, dev_pasid->pasid,
1346					 info->ats_qdep, addr,
1347					 mask);
1348	}
1349	spin_unlock_irqrestore(&domain->lock, flags);
1350}
1351
1352static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1353				     struct dmar_domain *domain, u64 addr,
1354				     unsigned long npages, bool ih)
1355{
1356	u16 did = domain_id_iommu(domain, iommu);
1357	struct dev_pasid_info *dev_pasid;
1358	unsigned long flags;
1359
1360	spin_lock_irqsave(&domain->lock, flags);
1361	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1362		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1363
1364	if (!list_empty(&domain->devices))
1365		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1366	spin_unlock_irqrestore(&domain->lock, flags);
1367}
1368
1369static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1370				    unsigned long pfn, unsigned int pages,
1371				    int ih)
1372{
1373	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1374	unsigned long bitmask = aligned_pages - 1;
1375	unsigned int mask = ilog2(aligned_pages);
1376	u64 addr = (u64)pfn << VTD_PAGE_SHIFT;
1377
1378	/*
1379	 * PSI masks the low order bits of the base address. If the
1380	 * address isn't aligned to the mask, then compute a mask value
1381	 * needed to ensure the target range is flushed.
1382	 */
1383	if (unlikely(bitmask & pfn)) {
1384		unsigned long end_pfn = pfn + pages - 1, shared_bits;
1385
1386		/*
1387		 * Since end_pfn <= pfn + bitmask, the only way bits
1388		 * higher than bitmask can differ in pfn and end_pfn is
1389		 * by carrying. This means after masking out bitmask,
1390		 * high bits starting with the first set bit in
1391		 * shared_bits are all equal in both pfn and end_pfn.
1392		 */
1393		shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1394		mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1395	}
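     	/*
     	 * Worked example of the fix-up above: pfn == 0x1003 and pages == 4
     	 * give aligned_pages == 4, bitmask == 0x3 and end_pfn == 0x1006;
     	 * pfn ^ end_pfn == 0x5, so shared_bits == ~0x5UL & ~0x3UL and
     	 * __ffs(shared_bits) == 3. The resulting 8-page flush at base pfn
     	 * 0x1000 covers the whole 0x1003-0x1006 range.
     	 */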
1396
1397	/*
 1398	 * Fall back to domain-selective flush if there is no PSI support or
 1399	 * the size is too big.
1400	 */
1401	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1402		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1403					 DMA_TLB_DSI_FLUSH);
1404	else
1405		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1406					 DMA_TLB_PSI_FLUSH);
1407}
1408
1409static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1410				  struct dmar_domain *domain,
1411				  unsigned long pfn, unsigned int pages,
1412				  int ih, int map)
1413{
1414	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1415	unsigned int mask = ilog2(aligned_pages);
1416	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1417	u16 did = domain_id_iommu(domain, iommu);
1418
1419	if (WARN_ON(!pages))
1420		return;
1421
1422	if (ih)
1423		ih = 1 << 6;
1424
1425	if (domain->use_first_level)
1426		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1427	else
1428		__iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1429
1430	/*
 1431	 * In caching mode, changes of pages from non-present to present require a
 1432	 * flush. However, the device IOTLB doesn't need to be flushed in this case.
1433	 */
1434	if (!cap_caching_mode(iommu->cap) || !map)
1435		iommu_flush_dev_iotlb(domain, addr, mask);
1436}
1437
1438/* Notification for newly created mappings */
1439static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1440				 unsigned long pfn, unsigned int pages)
1441{
1442	/*
 1443	 * It's a non-present to present mapping. Only flush if caching mode
 1444	 * is enabled and second-level translation is in use.
1445	 */
1446	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1447		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1448	else
1449		iommu_flush_write_buffer(iommu);
1450}
1451
1452/*
1453 * Flush the relevant caches in nested translation if the domain
1454 * also serves as a parent
1455 */
1456static void parent_domain_flush(struct dmar_domain *domain,
1457				unsigned long pfn,
1458				unsigned long pages, int ih)
1459{
1460	struct dmar_domain *s1_domain;
1461
1462	spin_lock(&domain->s1_lock);
1463	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
1464		struct device_domain_info *device_info;
1465		struct iommu_domain_info *info;
1466		unsigned long flags;
1467		unsigned long i;
1468
1469		xa_for_each(&s1_domain->iommu_array, i, info)
1470			__iommu_flush_iotlb_psi(info->iommu, info->did,
1471						pfn, pages, ih);
1472
1473		if (!s1_domain->has_iotlb_device)
1474			continue;
1475
1476		spin_lock_irqsave(&s1_domain->lock, flags);
1477		list_for_each_entry(device_info, &s1_domain->devices, link)
1478			/*
 1479			 * The address translation cache on the device side caches the
1480			 * result of nested translation. There is no easy way
1481			 * to identify the exact set of nested translations
1482			 * affected by a change in S2. So just flush the entire
1483			 * device cache.
1484			 */
1485			__iommu_flush_dev_iotlb(device_info, 0,
1486						MAX_AGAW_PFN_WIDTH);
1487		spin_unlock_irqrestore(&s1_domain->lock, flags);
1488	}
1489	spin_unlock(&domain->s1_lock);
1490}
1491
1492static void intel_flush_iotlb_all(struct iommu_domain *domain)
1493{
1494	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1495	struct iommu_domain_info *info;
1496	unsigned long idx;
1497
1498	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1499		struct intel_iommu *iommu = info->iommu;
1500		u16 did = domain_id_iommu(dmar_domain, iommu);
1501
1502		if (dmar_domain->use_first_level)
1503			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1504		else
1505			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1506						 DMA_TLB_DSI_FLUSH);
1507
1508		if (!cap_caching_mode(iommu->cap))
1509			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1510	}
1511
1512	if (dmar_domain->nested_parent)
1513		parent_domain_flush(dmar_domain, 0, -1, 0);
1514}
1515
1516static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1517{
1518	u32 pmen;
1519	unsigned long flags;
1520
1521	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1522		return;
1523
1524	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1525	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1526	pmen &= ~DMA_PMEN_EPM;
1527	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1528
1529	/* wait for the protected region status bit to clear */
1530	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1531		readl, !(pmen & DMA_PMEN_PRS), pmen);
1532
1533	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1534}
1535
1536static void iommu_enable_translation(struct intel_iommu *iommu)
1537{
1538	u32 sts;
1539	unsigned long flags;
1540
1541	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1542	iommu->gcmd |= DMA_GCMD_TE;
1543	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1544
1545	/* Make sure hardware complete it */
1546	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1547		      readl, (sts & DMA_GSTS_TES), sts);
1548
1549	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1550}
1551
1552static void iommu_disable_translation(struct intel_iommu *iommu)
1553{
1554	u32 sts;
1555	unsigned long flag;
1556
1557	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1558	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1559		return;
1560
1561	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1562	iommu->gcmd &= ~DMA_GCMD_TE;
1563	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1564
1565	/* Make sure hardware complete it */
1566	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1567		      readl, (!(sts & DMA_GSTS_TES)), sts);
1568
1569	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1570}
1571
1572static int iommu_init_domains(struct intel_iommu *iommu)
1573{
1574	u32 ndomains;
1575
1576	ndomains = cap_ndoms(iommu->cap);
1577	pr_debug("%s: Number of Domains supported <%d>\n",
1578		 iommu->name, ndomains);
1579
1580	spin_lock_init(&iommu->lock);
1581
1582	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1583	if (!iommu->domain_ids)
1584		return -ENOMEM;
1585
1586	/*
1587	 * If Caching mode is set, then invalid translations are tagged
1588	 * with domain-id 0, hence we need to pre-allocate it. We also
1589	 * use domain-id 0 as a marker for non-allocated domain-id, so
1590	 * make sure it is not used for a real domain.
1591	 */
1592	set_bit(0, iommu->domain_ids);
1593
1594	/*
1595	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1596	 * entry for first-level or pass-through translation modes should
1597	 * be programmed with a domain id different from those used for
1598	 * second-level or nested translation. We reserve a domain id for
1599	 * this purpose.
1600	 */
1601	if (sm_supported(iommu))
1602		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1603
1604	return 0;
1605}
1606
1607static void disable_dmar_iommu(struct intel_iommu *iommu)
1608{
1609	if (!iommu->domain_ids)
1610		return;
1611
1612	/*
1613	 * All iommu domains must have been detached from the devices,
1614	 * hence there should be no domain IDs in use.
1615	 */
1616	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1617		    > NUM_RESERVED_DID))
1618		return;
1619
1620	if (iommu->gcmd & DMA_GCMD_TE)
1621		iommu_disable_translation(iommu);
1622}
1623
1624static void free_dmar_iommu(struct intel_iommu *iommu)
1625{
1626	if (iommu->domain_ids) {
1627		bitmap_free(iommu->domain_ids);
1628		iommu->domain_ids = NULL;
1629	}
1630
1631	if (iommu->copied_tables) {
1632		bitmap_free(iommu->copied_tables);
1633		iommu->copied_tables = NULL;
1634	}
1635
1636	/* free context mapping */
1637	free_context_table(iommu);
1638
1639#ifdef CONFIG_INTEL_IOMMU_SVM
1640	if (pasid_supported(iommu)) {
1641		if (ecap_prs(iommu->ecap))
1642			intel_svm_finish_prq(iommu);
1643	}
1644#endif
1645}
1646
1647/*
1648 * Check and return whether first level is used by default for
1649 * DMA translation.
1650 */
1651static bool first_level_by_default(unsigned int type)
1652{
1653	/* Only SL is available in legacy mode */
1654	if (!scalable_mode_support())
1655		return false;
1656
 1657	/* Only one level (either FL or SL) is available, just use it */
1658	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1659		return intel_cap_flts_sanity();
1660
1661	/* Both levels are available, decide it based on domain type */
1662	return type != IOMMU_DOMAIN_UNMANAGED;
1663}
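     /*
      * For illustration: on hardware where both levels are usable, default
      * DMA-API domains get first-level page tables, while
      * IOMMU_DOMAIN_UNMANAGED domains (e.g. those used for device
      * assignment) fall back to the second level.
      */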
1664
1665static struct dmar_domain *alloc_domain(unsigned int type)
1666{
1667	struct dmar_domain *domain;
1668
1669	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1670	if (!domain)
1671		return NULL;
1672
1673	domain->nid = NUMA_NO_NODE;
1674	if (first_level_by_default(type))
1675		domain->use_first_level = true;
1676	domain->has_iotlb_device = false;
1677	INIT_LIST_HEAD(&domain->devices);
1678	INIT_LIST_HEAD(&domain->dev_pasids);
1679	spin_lock_init(&domain->lock);
1680	xa_init(&domain->iommu_array);
1681
1682	return domain;
1683}
1684
1685int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1686{
1687	struct iommu_domain_info *info, *curr;
1688	unsigned long ndomains;
1689	int num, ret = -ENOSPC;
1690
1691	info = kzalloc(sizeof(*info), GFP_KERNEL);
1692	if (!info)
1693		return -ENOMEM;
1694
1695	spin_lock(&iommu->lock);
1696	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1697	if (curr) {
1698		curr->refcnt++;
1699		spin_unlock(&iommu->lock);
1700		kfree(info);
1701		return 0;
1702	}
1703
1704	ndomains = cap_ndoms(iommu->cap);
1705	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1706	if (num >= ndomains) {
1707		pr_err("%s: No free domain ids\n", iommu->name);
1708		goto err_unlock;
1709	}
1710
1711	set_bit(num, iommu->domain_ids);
1712	info->refcnt	= 1;
1713	info->did	= num;
1714	info->iommu	= iommu;
1715	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1716			  NULL, info, GFP_ATOMIC);
1717	if (curr) {
1718		ret = xa_err(curr) ? : -EBUSY;
1719		goto err_clear;
1720	}
1721	domain_update_iommu_cap(domain);
1722
1723	spin_unlock(&iommu->lock);
1724	return 0;
1725
1726err_clear:
1727	clear_bit(info->did, iommu->domain_ids);
1728err_unlock:
1729	spin_unlock(&iommu->lock);
1730	kfree(info);
1731	return ret;
1732}
1733
1734void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1735{
1736	struct iommu_domain_info *info;
1737
1738	spin_lock(&iommu->lock);
1739	info = xa_load(&domain->iommu_array, iommu->seq_id);
1740	if (--info->refcnt == 0) {
1741		clear_bit(info->did, iommu->domain_ids);
1742		xa_erase(&domain->iommu_array, iommu->seq_id);
1743		domain->nid = NUMA_NO_NODE;
1744		domain_update_iommu_cap(domain);
1745		kfree(info);
1746	}
1747	spin_unlock(&iommu->lock);
1748}
1749
1750static int guestwidth_to_adjustwidth(int gaw)
1751{
1752	int agaw;
1753	int r = (gaw - 12) % 9;
1754
1755	if (r == 0)
1756		agaw = gaw;
1757	else
1758		agaw = gaw + 9 - r;
1759	if (agaw > 64)
1760		agaw = 64;
1761	return agaw;
1762}
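     /*
      * For illustration: the guest width is rounded up to the next width
      * expressible as a 12-bit page offset plus whole 9-bit page-table
      * levels, so guestwidth_to_adjustwidth(48) returns 48 (already
      * aligned) and guestwidth_to_adjustwidth(50) returns 57.
      */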
1763
1764static void domain_exit(struct dmar_domain *domain)
1765{
1766	if (domain->pgd) {
1767		LIST_HEAD(freelist);
1768
1769		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1770		put_pages_list(&freelist);
1771	}
1772
1773	if (WARN_ON(!list_empty(&domain->devices)))
1774		return;
1775
1776	kfree(domain);
1777}
1778
1779/*
 1780 * Get the PASID directory size for a scalable mode context entry.
 1781 * A value of X in the PDTS field of a scalable mode context entry
 1782 * indicates a PASID directory with 2^(X + 7) entries.
1783 */
1784static unsigned long context_get_sm_pds(struct pasid_table *table)
1785{
1786	unsigned long pds, max_pde;
1787
1788	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1789	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1790	if (pds < 7)
1791		return 0;
1792
1793	return pds - 7;
1794}
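     /*
      * For illustration (assuming PASID_PDE_SHIFT == 6, i.e. 64 PASID-table
      * entries per directory entry): a table with max_pasid == 1 << 20 has
      * max_pde == 1 << 14, so pds == 14 - 7 == 7 and the PDTS field encodes
      * a PASID directory with 2^(7 + 7) == 16384 entries.
      */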
1795
1796static int domain_context_mapping_one(struct dmar_domain *domain,
1797				      struct intel_iommu *iommu,
1798				      struct pasid_table *table,
1799				      u8 bus, u8 devfn)
1800{
1801	struct device_domain_info *info =
1802			domain_lookup_dev_info(domain, iommu, bus, devfn);
1803	u16 did = domain_id_iommu(domain, iommu);
1804	int translation = CONTEXT_TT_MULTI_LEVEL;
1805	struct context_entry *context;
1806	int ret;
1807
1808	if (hw_pass_through && domain_type_is_si(domain))
1809		translation = CONTEXT_TT_PASS_THROUGH;
1810
1811	pr_debug("Set context mapping for %02x:%02x.%d\n",
1812		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1813
1814	spin_lock(&iommu->lock);
1815	ret = -ENOMEM;
1816	context = iommu_context_addr(iommu, bus, devfn, 1);
1817	if (!context)
1818		goto out_unlock;
1819
1820	ret = 0;
1821	if (context_present(context) && !context_copied(iommu, bus, devfn))
1822		goto out_unlock;
1823
1824	/*
1825	 * For kdump cases, old valid entries may be cached due to the
1826	 * in-flight DMA and copied pgtable, but there is no unmapping
1827	 * behaviour for them, thus we need an explicit cache flush for
1828	 * the newly-mapped device. For kdump, at this point, the device
 1829	 * is supposed to have finished reset at its driver probe stage, so no
 1830	 * in-flight DMA will exist, and we don't need to worry about it
 1831	 * hereafter.
1832	 */
1833	if (context_copied(iommu, bus, devfn)) {
1834		u16 did_old = context_domain_id(context);
1835
1836		if (did_old < cap_ndoms(iommu->cap)) {
1837			iommu->flush.flush_context(iommu, did_old,
1838						   (((u16)bus) << 8) | devfn,
1839						   DMA_CCMD_MASK_NOBIT,
1840						   DMA_CCMD_DEVICE_INVL);
1841			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1842						 DMA_TLB_DSI_FLUSH);
1843		}
1844
1845		clear_context_copied(iommu, bus, devfn);
1846	}
1847
1848	context_clear_entry(context);
1849
1850	if (sm_supported(iommu)) {
1851		unsigned long pds;
1852
1853		/* Setup the PASID DIR pointer: */
1854		pds = context_get_sm_pds(table);
1855		context->lo = (u64)virt_to_phys(table->table) |
1856				context_pdts(pds);
1857
1858		/* Setup the RID_PASID field: */
1859		context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1860
1861		/*
1862		 * Setup the Device-TLB enable bit and Page request
1863		 * Enable bit:
1864		 */
1865		if (info && info->ats_supported)
1866			context_set_sm_dte(context);
1867		if (info && info->pri_supported)
1868			context_set_sm_pre(context);
1869		if (info && info->pasid_supported)
1870			context_set_pasid(context);
1871	} else {
1872		struct dma_pte *pgd = domain->pgd;
1873		int agaw;
1874
1875		context_set_domain_id(context, did);
1876
1877		if (translation != CONTEXT_TT_PASS_THROUGH) {
1878			/*
1879			 * Skip top levels of page tables for an IOMMU whose
1880			 * AGAW is smaller than the default. Unnecessary for PT mode.
1881			 */
1882			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1883				ret = -ENOMEM;
1884				pgd = phys_to_virt(dma_pte_addr(pgd));
1885				if (!dma_pte_present(pgd))
1886					goto out_unlock;
1887			}
1888
1889			if (info && info->ats_supported)
1890				translation = CONTEXT_TT_DEV_IOTLB;
1891			else
1892				translation = CONTEXT_TT_MULTI_LEVEL;
1893
1894			context_set_address_root(context, virt_to_phys(pgd));
1895			context_set_address_width(context, agaw);
1896		} else {
1897			/*
1898			 * In pass-through mode, AW must be programmed to
1899			 * indicate the largest AGAW value supported by
1900			 * hardware, and the ASR is ignored by hardware.
1901			 */
1902			context_set_address_width(context, iommu->msagaw);
1903		}
1904
1905		context_set_translation_type(context, translation);
1906	}
1907
1908	context_set_fault_enable(context);
1909	context_set_present(context);
1910	if (!ecap_coherent(iommu->ecap))
1911		clflush_cache_range(context, sizeof(*context));
1912
1913	/*
1914	 * It's a non-present to present mapping. If hardware doesn't cache
1915	 * non-present entries we only need to flush the write-buffer. If it
1916	 * _does_ cache non-present entries, then it does so in the special
1917	 * domain #0, which we have to flush:
1918	 */
1919	if (cap_caching_mode(iommu->cap)) {
1920		iommu->flush.flush_context(iommu, 0,
1921					   (((u16)bus) << 8) | devfn,
1922					   DMA_CCMD_MASK_NOBIT,
1923					   DMA_CCMD_DEVICE_INVL);
1924		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1925	} else {
1926		iommu_flush_write_buffer(iommu);
1927	}
1928
1929	ret = 0;
1930
1931out_unlock:
1932	spin_unlock(&iommu->lock);
1933
1934	return ret;
1935}
1936
1937struct domain_context_mapping_data {
1938	struct dmar_domain *domain;
1939	struct intel_iommu *iommu;
1940	struct pasid_table *table;
1941};
1942
1943static int domain_context_mapping_cb(struct pci_dev *pdev,
1944				     u16 alias, void *opaque)
1945{
1946	struct domain_context_mapping_data *data = opaque;
1947
1948	return domain_context_mapping_one(data->domain, data->iommu,
1949					  data->table, PCI_BUS_NUM(alias),
1950					  alias & 0xff);
1951}
1952
1953static int
1954domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1955{
1956	struct device_domain_info *info = dev_iommu_priv_get(dev);
1957	struct domain_context_mapping_data data;
1958	struct intel_iommu *iommu = info->iommu;
1959	u8 bus = info->bus, devfn = info->devfn;
1960	struct pasid_table *table;
 
 
 
 
 
 
1961
1962	table = intel_pasid_get_table(dev);
1963
1964	if (!dev_is_pci(dev))
1965		return domain_context_mapping_one(domain, iommu, table,
1966						  bus, devfn);
1967
1968	data.domain = domain;
1969	data.iommu = iommu;
1970	data.table = table;
1971
1972	return pci_for_each_dma_alias(to_pci_dev(dev),
1973				      &domain_context_mapping_cb, &data);
1974}
1975
1976/* Returns the number of VT-d pages, but aligned to the MM page size */
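/*
 * For example, with 4KiB MM and VT-d pages, host_addr = 0x1234 and
 * size = 0x2000 cover the byte range [0x1234, 0x3233], which spans
 * three pages, so this returns 3.
 */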
1977static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1978{
1979	host_addr &= ~PAGE_MASK;
1980	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1981}
1982
1983/* Return largest possible superpage level for a given mapping */
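/*
 * For example (with the usual 9-bit stride): iov_pfn = 0x200 and
 * phy_pfn = 0x1400 are both 2MiB-aligned, so for a request of at least
 * 512 pages this returns level 2 (a 2MiB superpage), provided
 * domain->iommu_superpage allows it; a misaligned low bit in either PFN
 * forces level 1 (4KiB pages).
 */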
1984static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1985				   unsigned long phy_pfn, unsigned long pages)
1986{
1987	int support, level = 1;
1988	unsigned long pfnmerge;
1989
1990	support = domain->iommu_superpage;
1991
1992	/* To use a large page, the virtual *and* physical addresses
1993	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1994	   of them will mean we have to use smaller pages. So just
1995	   merge them and check both at once. */
1996	pfnmerge = iov_pfn | phy_pfn;
1997
1998	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1999		pages >>= VTD_STRIDE_SHIFT;
2000		if (!pages)
2001			break;
2002		pfnmerge >>= VTD_STRIDE_SHIFT;
2003		level++;
2004		support--;
2005	}
2006	return level;
2007}
2008
2009/*
2010 * Ensure that old small page tables are removed to make room for superpage(s).
2011 * We're going to add new large pages, so make sure we don't remove their parent
2012 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2013 */
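/*
 * For example, when a 2MiB-aligned, 2MiB-sized range is about to be
 * mapped with a single level-2 PTE, any existing level-1 page table
 * covering that range is freed here and the corresponding IOTLB
 * entries are invalidated before the superpage PTE is written.
 */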
2014static void switch_to_super_page(struct dmar_domain *domain,
2015				 unsigned long start_pfn,
2016				 unsigned long end_pfn, int level)
2017{
2018	unsigned long lvl_pages = lvl_to_nr_pages(level);
2019	struct iommu_domain_info *info;
2020	struct dma_pte *pte = NULL;
2021	unsigned long i;
2022
2023	while (start_pfn <= end_pfn) {
2024		if (!pte)
2025			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2026					     GFP_ATOMIC);
2027
2028		if (dma_pte_present(pte)) {
2029			dma_pte_free_pagetable(domain, start_pfn,
2030					       start_pfn + lvl_pages - 1,
2031					       level + 1);
2032
2033			xa_for_each(&domain->iommu_array, i, info)
2034				iommu_flush_iotlb_psi(info->iommu, domain,
2035						      start_pfn, lvl_pages,
2036						      0, 0);
2037			if (domain->nested_parent)
2038				parent_domain_flush(domain, start_pfn,
2039						    lvl_pages, 0);
2040		}
2041
2042		pte++;
2043		start_pfn += lvl_pages;
2044		if (first_pte_in_page(pte))
2045			pte = NULL;
2046	}
2047}
2048
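/*
 * Install mappings for @nr_pages pages starting at @iov_pfn -> @phys_pfn.
 * The loop below opportunistically uses superpages whenever both PFNs
 * and the remaining size are suitably aligned, and it batches cache
 * flushes of the written PTEs whenever a page-table page boundary is
 * crossed or the mapping ends.
 */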
2049static int
2050__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2051		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2052		 gfp_t gfp)
2053{
2054	struct dma_pte *first_pte = NULL, *pte = NULL;
2055	unsigned int largepage_lvl = 0;
2056	unsigned long lvl_pages = 0;
2057	phys_addr_t pteval;
2058	u64 attr;
2059
2060	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2061		return -EINVAL;
2062
2063	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2064		return -EINVAL;
2065
2066	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2067		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2068		return -EINVAL;
2069	}
2070
2071	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2072	attr |= DMA_FL_PTE_PRESENT;
2073	if (domain->use_first_level) {
2074		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2075		if (prot & DMA_PTE_WRITE)
2076			attr |= DMA_FL_PTE_DIRTY;
2077	}
2078
2079	domain->has_mappings = true;
2080
2081	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2082
2083	while (nr_pages > 0) {
2084		uint64_t tmp;
2085
2086		if (!pte) {
2087			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2088					phys_pfn, nr_pages);
2089
2090			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2091					     gfp);
2092			if (!pte)
2093				return -ENOMEM;
2094			first_pte = pte;
2095
2096			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2097
2098			/* It is a large page */
2099			if (largepage_lvl > 1) {
2100				unsigned long end_pfn;
2101				unsigned long pages_to_remove;
2102
2103				pteval |= DMA_PTE_LARGE_PAGE;
2104				pages_to_remove = min_t(unsigned long, nr_pages,
2105							nr_pte_to_next_page(pte) * lvl_pages);
2106				end_pfn = iov_pfn + pages_to_remove - 1;
2107				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2108			} else {
2109				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2110			}
2111
2112		}
2113		/* We don't need a lock here; nobody else
2114		 * touches this IOVA range.
2115		 */
2116		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2117		if (tmp) {
2118			static int dumps = 5;
2119			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2120				iov_pfn, tmp, (unsigned long long)pteval);
2121			if (dumps) {
2122				dumps--;
2123				debug_dma_dump_mappings(NULL);
2124			}
2125			WARN_ON(1);
2126		}
2127
2128		nr_pages -= lvl_pages;
2129		iov_pfn += lvl_pages;
2130		phys_pfn += lvl_pages;
2131		pteval += lvl_pages * VTD_PAGE_SIZE;
2132
2133		/* If the next PTE would be the first in a new page, then we
2134		 * need to flush the cache on the entries we've just written.
2135		 * And then we'll need to recalculate 'pte', so clear it and
2136		 * let it get set again in the if (!pte) block above.
2137		 *
2138		 * If we're done (!nr_pages) we need to flush the cache too.
2139		 *
2140		 * Also if we've been setting superpages, we may need to
2141		 * recalculate 'pte' and switch back to smaller pages for the
2142		 * end of the mapping, if the trailing size is not enough to
2143		 * use another superpage (i.e. nr_pages < lvl_pages).
2144		 */
2145		pte++;
2146		if (!nr_pages || first_pte_in_page(pte) ||
2147		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2148			domain_flush_cache(domain, first_pte,
2149					   (void *)pte - (void *)first_pte);
2150			pte = NULL;
2151		}
2152	}
2153
2154	return 0;
2155}
2156
2157static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2158{
2159	struct intel_iommu *iommu = info->iommu;
2160	struct context_entry *context;
2161	u16 did_old;
2162
2163	if (!iommu)
2164		return;
2165
2166	spin_lock(&iommu->lock);
2167	context = iommu_context_addr(iommu, bus, devfn, 0);
2168	if (!context) {
2169		spin_unlock(&iommu->lock);
2170		return;
2171	}
2172
2173	if (sm_supported(iommu)) {
2174		if (hw_pass_through && domain_type_is_si(info->domain))
2175			did_old = FLPT_DEFAULT_DID;
2176		else
2177			did_old = domain_id_iommu(info->domain, iommu);
2178	} else {
2179		did_old = context_domain_id(context);
2180	}
2181
2182	context_clear_entry(context);
2183	__iommu_flush_cache(iommu, context, sizeof(*context));
2184	spin_unlock(&iommu->lock);
2185	iommu->flush.flush_context(iommu,
2186				   did_old,
2187				   (((u16)bus) << 8) | devfn,
2188				   DMA_CCMD_MASK_NOBIT,
2189				   DMA_CCMD_DEVICE_INVL);
2190
2191	if (sm_supported(iommu))
2192		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2193
2194	iommu->flush.flush_iotlb(iommu,
2195				 did_old,
2196				 0,
2197				 0,
2198				 DMA_TLB_DSI_FLUSH);
2199
2200	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2201}
2202
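/*
 * Set up a first-stage (first-level) translation for @pasid. The
 * first-stage page table supports only 4- or 5-level paging, so the
 * walk below skips any extra top levels of a wider-than-necessary
 * domain AGAW, and PASID_FLAG_FL5LP is set when 5-level paging is used.
 */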
2203static int domain_setup_first_level(struct intel_iommu *iommu,
2204				    struct dmar_domain *domain,
2205				    struct device *dev,
2206				    u32 pasid)
2207{
2208	struct dma_pte *pgd = domain->pgd;
2209	int agaw, level;
2210	int flags = 0;
2211
2212	/*
2213	 * Skip top levels of page tables for an IOMMU whose AGAW is
2214	 * smaller than the default. Unnecessary for PT mode.
2215	 */
2216	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2217		pgd = phys_to_virt(dma_pte_addr(pgd));
2218		if (!dma_pte_present(pgd))
2219			return -ENOMEM;
2220	}
2221
2222	level = agaw_to_level(agaw);
2223	if (level != 4 && level != 5)
2224		return -EINVAL;
2225
2226	if (level == 5)
2227		flags |= PASID_FLAG_FL5LP;
2228
2229	if (domain->force_snooping)
2230		flags |= PASID_FLAG_PAGE_SNOOP;
2231
2232	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2233					     domain_id_iommu(domain, iommu),
2234					     flags);
2235}
2236
2237static bool dev_is_real_dma_subdevice(struct device *dev)
2238{
2239	return dev && dev_is_pci(dev) &&
2240	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2241}
2242
2243static int iommu_domain_identity_map(struct dmar_domain *domain,
2244				     unsigned long first_vpfn,
2245				     unsigned long last_vpfn)
2246{
2247	/*
2248	 * The RMRR range might overlap with a physical memory range,
2249	 * so clear it first.
2250	 */
2251	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2252
2253	return __domain_mapping(domain, first_vpfn,
2254				first_vpfn, last_vpfn - first_vpfn + 1,
2255				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2256}
2257
2258static int md_domain_init(struct dmar_domain *domain, int guest_width);
2259
2260static int __init si_domain_init(int hw)
2261{
2262	struct dmar_rmrr_unit *rmrr;
2263	struct device *dev;
2264	int i, nid, ret;
2265
2266	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2267	if (!si_domain)
2268		return -EFAULT;
2269
2270	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2271		domain_exit(si_domain);
2272		si_domain = NULL;
2273		return -EFAULT;
2274	}
2275
2276	if (hw)
2277		return 0;
2278
2279	for_each_online_node(nid) {
2280		unsigned long start_pfn, end_pfn;
2281		int i;
2282
2283		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2284			ret = iommu_domain_identity_map(si_domain,
2285					mm_to_dma_pfn_start(start_pfn),
2286					mm_to_dma_pfn_end(end_pfn));
2287			if (ret)
2288				return ret;
2289		}
2290	}
2291
2292	/*
2293	 * Identity map the RMRRs so that devices with RMRRs could also use
2294	 * the si_domain.
2295	 */
2296	for_each_rmrr_units(rmrr) {
2297		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2298					  i, dev) {
2299			unsigned long long start = rmrr->base_address;
2300			unsigned long long end = rmrr->end_address;
2301
2302			if (WARN_ON(end < start ||
2303				    end >> agaw_to_width(si_domain->agaw)))
2304				continue;
2305
2306			ret = iommu_domain_identity_map(si_domain,
2307					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2308					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2309			if (ret)
2310				return ret;
2311		}
2312	}
2313
2314	return 0;
2315}
2316
2317static int dmar_domain_attach_device(struct dmar_domain *domain,
2318				     struct device *dev)
2319{
2320	struct device_domain_info *info = dev_iommu_priv_get(dev);
2321	struct intel_iommu *iommu = info->iommu;
2322	unsigned long flags;
2323	int ret;
2324
2325	ret = domain_attach_iommu(domain, iommu);
2326	if (ret)
2327		return ret;
2328	info->domain = domain;
2329	spin_lock_irqsave(&domain->lock, flags);
2330	list_add(&info->link, &domain->devices);
2331	spin_unlock_irqrestore(&domain->lock, flags);
2332
2333	/* PASID table is mandatory for a PCI device in scalable mode. */
2334	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2335		/* Setup the PASID entry for requests without PASID: */
2336		if (hw_pass_through && domain_type_is_si(domain))
2337			ret = intel_pasid_setup_pass_through(iommu,
2338					dev, IOMMU_NO_PASID);
2339		else if (domain->use_first_level)
2340			ret = domain_setup_first_level(iommu, domain, dev,
2341					IOMMU_NO_PASID);
2342		else
2343			ret = intel_pasid_setup_second_level(iommu, domain,
2344					dev, IOMMU_NO_PASID);
2345		if (ret) {
2346			dev_err(dev, "Setup RID2PASID failed\n");
2347			device_block_translation(dev);
2348			return ret;
2349		}
2350	}
2351
2352	ret = domain_context_mapping(domain, dev);
2353	if (ret) {
2354		dev_err(dev, "Domain context map failed\n");
2355		device_block_translation(dev);
2356		return ret;
2357	}
2358
2359	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2360		iommu_enable_pci_caps(info);
2361
2362	return 0;
2363}
2364
2365/**
2366 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2367 * is relaxable (ie. is allowed to be not enforced under some conditions)
2368 * @dev: device handle
2369 *
2370 * We assume that PCI USB devices with RMRRs have them largely
2371 * for historical reasons and that the RMRR space is not actively used post
2372 * boot.  This exclusion may change if vendors begin to abuse it.
2373 *
2374 * The same exception is made for graphics devices, with the requirement that
2375 * any use of the RMRR regions will be torn down before assigning the device
2376 * to a guest.
2377 *
2378 * Return: true if the RMRR is relaxable, false otherwise
2379 */
2380static bool device_rmrr_is_relaxable(struct device *dev)
2381{
2382	struct pci_dev *pdev;
2383
2384	if (!dev_is_pci(dev))
2385		return false;
2386
2387	pdev = to_pci_dev(dev);
2388	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2389		return true;
2390	else
2391		return false;
2392}
2393
2394/*
2395 * Return the required default domain type for a specific device.
2396 *
2397 * @dev: the device to query
2399 *
2400 * Returns:
2401 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2402 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2403 *  - 0: both identity and dynamic domains work for this device
2404 */
2405static int device_def_domain_type(struct device *dev)
2406{
2407	if (dev_is_pci(dev)) {
2408		struct pci_dev *pdev = to_pci_dev(dev);
2409
2410		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2411			return IOMMU_DOMAIN_IDENTITY;
2412
2413		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2414			return IOMMU_DOMAIN_IDENTITY;
2415	}
2416
2417	return 0;
2418}
2419
2420static void intel_iommu_init_qi(struct intel_iommu *iommu)
2421{
2422	/*
2423	 * Start from a sane IOMMU hardware state.
2424	 * If queued invalidation has already been initialized by us
2425	 * (for example, while enabling interrupt remapping), then
2426	 * things are already rolling from a sane state.
2427	 */
2428	if (!iommu->qi) {
2429		/*
2430		 * Clear any previous faults.
2431		 */
2432		dmar_fault(-1, iommu);
2433		/*
2434		 * Disable queued invalidation if supported and already enabled
2435		 * before OS handover.
2436		 */
2437		dmar_disable_qi(iommu);
2438	}
2439
2440	if (dmar_enable_qi(iommu)) {
2441		/*
2442		 * Queued invalidation is not enabled; use register-based invalidation
2443		 */
2444		iommu->flush.flush_context = __iommu_flush_context;
2445		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2446		pr_info("%s: Using Register based invalidation\n",
2447			iommu->name);
2448	} else {
2449		iommu->flush.flush_context = qi_flush_context;
2450		iommu->flush.flush_iotlb = qi_flush_iotlb;
2451		pr_info("%s: Using Queued invalidation\n", iommu->name);
2452	}
2453}
2454
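/*
 * Copy one bus's worth of context entries from the previous (kdump'd)
 * kernel. When the old tables are in scalable mode (@ext), each root
 * entry carries two context-table pointers - the lower table for devfn
 * 0-127 and the upper table for devfn 128-255 - and each 256-bit
 * context entry spans two 128-bit context_entry slots, hence the
 * doubled indices below.
 */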
2455static int copy_context_table(struct intel_iommu *iommu,
2456			      struct root_entry *old_re,
2457			      struct context_entry **tbl,
2458			      int bus, bool ext)
2459{
2460	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2461	struct context_entry *new_ce = NULL, ce;
2462	struct context_entry *old_ce = NULL;
2463	struct root_entry re;
2464	phys_addr_t old_ce_phys;
2465
2466	tbl_idx = ext ? bus * 2 : bus;
2467	memcpy(&re, old_re, sizeof(re));
2468
2469	for (devfn = 0; devfn < 256; devfn++) {
2470		/* First calculate the correct index */
2471		idx = (ext ? devfn * 2 : devfn) % 256;
2472
2473		if (idx == 0) {
2474			/* First save what we may have and clean up */
2475			if (new_ce) {
2476				tbl[tbl_idx] = new_ce;
2477				__iommu_flush_cache(iommu, new_ce,
2478						    VTD_PAGE_SIZE);
2479				pos = 1;
2480			}
2481
2482			if (old_ce)
2483				memunmap(old_ce);
2484
2485			ret = 0;
2486			if (devfn < 0x80)
2487				old_ce_phys = root_entry_lctp(&re);
2488			else
2489				old_ce_phys = root_entry_uctp(&re);
2490
2491			if (!old_ce_phys) {
2492				if (ext && devfn == 0) {
2493					/* No LCTP, try UCTP */
2494					devfn = 0x7f;
2495					continue;
2496				} else {
2497					goto out;
2498				}
2499			}
2500
2501			ret = -ENOMEM;
2502			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2503					MEMREMAP_WB);
2504			if (!old_ce)
2505				goto out;
2506
2507			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2508			if (!new_ce)
2509				goto out_unmap;
2510
2511			ret = 0;
2512		}
2513
2514		/* Now copy the context entry */
2515		memcpy(&ce, old_ce + idx, sizeof(ce));
2516
2517		if (!context_present(&ce))
2518			continue;
2519
2520		did = context_domain_id(&ce);
2521		if (did >= 0 && did < cap_ndoms(iommu->cap))
2522			set_bit(did, iommu->domain_ids);
2523
2524		set_context_copied(iommu, bus, devfn);
2525		new_ce[idx] = ce;
2526	}
2527
2528	tbl[tbl_idx + pos] = new_ce;
2529
2530	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2531
2532out_unmap:
2533	memunmap(old_ce);
2534
2535out:
2536	return ret;
2537}
2538
2539static int copy_translation_tables(struct intel_iommu *iommu)
2540{
2541	struct context_entry **ctxt_tbls;
2542	struct root_entry *old_rt;
2543	phys_addr_t old_rt_phys;
2544	int ctxt_table_entries;
2545	u64 rtaddr_reg;
2546	int bus, ret;
2547	bool new_ext, ext;
2548
2549	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2550	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2551	new_ext    = !!sm_supported(iommu);
2552
2553	/*
2554	 * The RTT bit can only be changed when translation is disabled,
2555	 * but disabling translation would open a window for data
2556	 * corruption. So bail out and don't copy anything if we would
2557	 * have to change the bit.
2558	 */
2559	if (new_ext != ext)
2560		return -EINVAL;
2561
2562	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2563	if (!iommu->copied_tables)
2564		return -ENOMEM;
2565
2566	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2567	if (!old_rt_phys)
2568		return -EINVAL;
2569
2570	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2571	if (!old_rt)
2572		return -ENOMEM;
2573
2574	/* This is too big for the stack - allocate it from slab */
2575	ctxt_table_entries = ext ? 512 : 256;
2576	ret = -ENOMEM;
2577	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2578	if (!ctxt_tbls)
2579		goto out_unmap;
2580
2581	for (bus = 0; bus < 256; bus++) {
2582		ret = copy_context_table(iommu, &old_rt[bus],
2583					 ctxt_tbls, bus, ext);
2584		if (ret) {
2585			pr_err("%s: Failed to copy context table for bus %d\n",
2586				iommu->name, bus);
2587			continue;
2588		}
2589	}
2590
2591	spin_lock(&iommu->lock);
2592
2593	/* Context tables are copied, now write them to the root_entry table */
2594	for (bus = 0; bus < 256; bus++) {
2595		int idx = ext ? bus * 2 : bus;
2596		u64 val;
2597
2598		if (ctxt_tbls[idx]) {
2599			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2600			iommu->root_entry[bus].lo = val;
2601		}
2602
2603		if (!ext || !ctxt_tbls[idx + 1])
2604			continue;
2605
2606		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2607		iommu->root_entry[bus].hi = val;
2608	}
2609
2610	spin_unlock(&iommu->lock);
2611
2612	kfree(ctxt_tbls);
2613
2614	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2615
2616	ret = 0;
2617
2618out_unmap:
2619	memunmap(old_rt);
2620
2621	return ret;
2622}
2623
2624static int __init init_dmars(void)
2625{
2626	struct dmar_drhd_unit *drhd;
2627	struct intel_iommu *iommu;
2628	int ret;
2629
2630	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2631	if (ret)
2632		goto free_iommu;
2633
2634	for_each_iommu(iommu, drhd) {
2635		if (drhd->ignored) {
2636			iommu_disable_translation(iommu);
2637			continue;
2638		}
2639
2640		/*
2641		 * Find the maximum PASID size supported by all IOMMUs in the
2642		 * system. We need to ensure the system-wide PASID table is no
2643		 * bigger than the smallest supported size.
2644		 */
2645		if (pasid_supported(iommu)) {
2646			u32 temp = 2 << ecap_pss(iommu->ecap);
2647
2648			intel_pasid_max_id = min_t(u32, temp,
2649						   intel_pasid_max_id);
2650		}
2651
2652		intel_iommu_init_qi(iommu);
2653
2654		ret = iommu_init_domains(iommu);
2655		if (ret)
2656			goto free_iommu;
2657
2658		init_translation_status(iommu);
2659
2660		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2661			iommu_disable_translation(iommu);
2662			clear_translation_pre_enabled(iommu);
2663			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2664				iommu->name);
2665		}
2666
2667		/*
2668		 * TBD:
2669		 * we could share the same root & context tables
2670		 * among all IOMMUs; this should be split out later.
2671		 */
2672		ret = iommu_alloc_root_entry(iommu);
2673		if (ret)
2674			goto free_iommu;
2675
2676		if (translation_pre_enabled(iommu)) {
2677			pr_info("Translation already enabled - trying to copy translation structures\n");
2678
2679			ret = copy_translation_tables(iommu);
2680			if (ret) {
2681				/*
2682				 * We found the IOMMU with translation
2683				 * enabled - but failed to copy over the
2684				 * old root-entry table. Try to proceed
2685				 * by disabling translation now and
2686				 * allocating a clean root-entry table.
2687				 * This might cause DMAR faults, but
2688				 * probably the dump will still succeed.
2689				 */
2690				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2691				       iommu->name);
2692				iommu_disable_translation(iommu);
2693				clear_translation_pre_enabled(iommu);
2694			} else {
2695				pr_info("Copied translation tables from previous kernel for %s\n",
2696					iommu->name);
2697			}
2698		}
2699
2700		if (!ecap_pass_through(iommu->ecap))
2701			hw_pass_through = 0;
2702		intel_svm_check(iommu);
2703	}
2704
2705	/*
2706	 * Now that qi is enabled on all iommus, set the root entry and flush
2707	 * caches. This is required on some Intel X58 chipsets, otherwise the
2708	 * flush_context function will loop forever and the boot hangs.
2709	 */
2710	for_each_active_iommu(iommu, drhd) {
2711		iommu_flush_write_buffer(iommu);
2712		iommu_set_root_entry(iommu);
2713	}
2714
2715#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2716	dmar_map_gfx = 0;
2717#endif
2718
2719	if (!dmar_map_gfx)
2720		iommu_identity_mapping |= IDENTMAP_GFX;
2721
2722	check_tylersburg_isoch();
2723
2724	ret = si_domain_init(hw_pass_through);
2725	if (ret)
2726		goto free_iommu;
2727
2728	/*
2729	 * for each drhd
2730	 *   enable fault log
2731	 *   global invalidate context cache
2732	 *   global invalidate iotlb
2733	 *   enable translation
2734	 */
2735	for_each_iommu(iommu, drhd) {
2736		if (drhd->ignored) {
2737			/*
2738			 * we always have to disable PMRs or DMA may fail on
2739			 * this device
2740			 */
2741			if (force_on)
2742				iommu_disable_protect_mem_regions(iommu);
2743			continue;
2744		}
2745
2746		iommu_flush_write_buffer(iommu);
2747
2748#ifdef CONFIG_INTEL_IOMMU_SVM
2749		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2750			/*
2751			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2752			 * could cause a lock race condition.
2753			 */
2754			up_write(&dmar_global_lock);
2755			ret = intel_svm_enable_prq(iommu);
2756			down_write(&dmar_global_lock);
2757			if (ret)
2758				goto free_iommu;
2759		}
2760#endif
2761		ret = dmar_set_interrupt(iommu);
2762		if (ret)
2763			goto free_iommu;
2764	}
2765
2766	return 0;
2767
2768free_iommu:
2769	for_each_active_iommu(iommu, drhd) {
2770		disable_dmar_iommu(iommu);
2771		free_dmar_iommu(iommu);
2772	}
2773	if (si_domain) {
2774		domain_exit(si_domain);
2775		si_domain = NULL;
2776	}
2777
2778	return ret;
2779}
2780
2781static void __init init_no_remapping_devices(void)
2782{
2783	struct dmar_drhd_unit *drhd;
2784	struct device *dev;
2785	int i;
2786
2787	for_each_drhd_unit(drhd) {
2788		if (!drhd->include_all) {
2789			for_each_active_dev_scope(drhd->devices,
2790						  drhd->devices_cnt, i, dev)
2791				break;
2792			/* ignore DMAR unit if no devices exist */
2793			if (i == drhd->devices_cnt)
2794				drhd->ignored = 1;
2795		}
2796	}
2797
2798	for_each_active_drhd_unit(drhd) {
2799		if (drhd->include_all)
2800			continue;
2801
2802		for_each_active_dev_scope(drhd->devices,
2803					  drhd->devices_cnt, i, dev)
2804			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2805				break;
2806		if (i < drhd->devices_cnt)
2807			continue;
2808
2809		/* This IOMMU has *only* gfx devices. Either bypass it or
2810		   set the gfx_dedicated flag, as appropriate */
2811		drhd->gfx_dedicated = 1;
2812		if (!dmar_map_gfx)
2813			drhd->ignored = 1;
2814	}
2815}
2816
2817#ifdef CONFIG_SUSPEND
2818static int init_iommu_hw(void)
2819{
2820	struct dmar_drhd_unit *drhd;
2821	struct intel_iommu *iommu = NULL;
2822	int ret;
2823
2824	for_each_active_iommu(iommu, drhd) {
2825		if (iommu->qi) {
2826			ret = dmar_reenable_qi(iommu);
2827			if (ret)
2828				return ret;
2829		}
2830	}
2831
2832	for_each_iommu(iommu, drhd) {
2833		if (drhd->ignored) {
2834			/*
2835			 * we always have to disable PMRs or DMA may fail on
2836			 * this device
2837			 */
2838			if (force_on)
2839				iommu_disable_protect_mem_regions(iommu);
2840			continue;
2841		}
2842
2843		iommu_flush_write_buffer(iommu);
2844		iommu_set_root_entry(iommu);
2845		iommu_enable_translation(iommu);
2846		iommu_disable_protect_mem_regions(iommu);
2847	}
2848
2849	return 0;
2850}
2851
2852static void iommu_flush_all(void)
2853{
2854	struct dmar_drhd_unit *drhd;
2855	struct intel_iommu *iommu;
2856
2857	for_each_active_iommu(iommu, drhd) {
2858		iommu->flush.flush_context(iommu, 0, 0, 0,
2859					   DMA_CCMD_GLOBAL_INVL);
2860		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2861					 DMA_TLB_GLOBAL_FLUSH);
2862	}
2863}
2864
2865static int iommu_suspend(void)
2866{
2867	struct dmar_drhd_unit *drhd;
2868	struct intel_iommu *iommu = NULL;
2869	unsigned long flag;
2870
2871	iommu_flush_all();
2872
2873	for_each_active_iommu(iommu, drhd) {
2874		iommu_disable_translation(iommu);
2875
2876		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2877
2878		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2879			readl(iommu->reg + DMAR_FECTL_REG);
2880		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2881			readl(iommu->reg + DMAR_FEDATA_REG);
2882		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2883			readl(iommu->reg + DMAR_FEADDR_REG);
2884		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2885			readl(iommu->reg + DMAR_FEUADDR_REG);
2886
2887		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2888	}
2889	return 0;
2890}
2891
2892static void iommu_resume(void)
2893{
2894	struct dmar_drhd_unit *drhd;
2895	struct intel_iommu *iommu = NULL;
2896	unsigned long flag;
2897
2898	if (init_iommu_hw()) {
2899		if (force_on)
2900			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2901		else
2902			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2903		return;
2904	}
2905
2906	for_each_active_iommu(iommu, drhd) {
2907
2908		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2909
2910		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2911			iommu->reg + DMAR_FECTL_REG);
2912		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2913			iommu->reg + DMAR_FEDATA_REG);
2914		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2915			iommu->reg + DMAR_FEADDR_REG);
2916		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2917			iommu->reg + DMAR_FEUADDR_REG);
2918
2919		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2920	}
2921}
2922
2923static struct syscore_ops iommu_syscore_ops = {
2924	.resume		= iommu_resume,
2925	.suspend	= iommu_suspend,
2926};
2927
2928static void __init init_iommu_pm_ops(void)
2929{
2930	register_syscore_ops(&iommu_syscore_ops);
2931}
2932
2933#else
2934static inline void init_iommu_pm_ops(void) {}
2935#endif	/* CONFIG_SUSPEND */
2936
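/*
 * An RMRR must describe a non-empty, page-aligned range with an
 * inclusive end address. For example, base 0x000e0000 with end
 * 0x000e3fff is acceptable, while end 0x000e3ffe (not ending on a page
 * boundary) or end <= base is rejected.
 */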
2937static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2938{
2939	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2940	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2941	    rmrr->end_address <= rmrr->base_address ||
2942	    arch_rmrr_sanity_check(rmrr))
2943		return -EINVAL;
2944
2945	return 0;
2946}
2947
2948int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2949{
2950	struct acpi_dmar_reserved_memory *rmrr;
2951	struct dmar_rmrr_unit *rmrru;
2952
2953	rmrr = (struct acpi_dmar_reserved_memory *)header;
2954	if (rmrr_sanity_check(rmrr)) {
2955		pr_warn(FW_BUG
2956			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2957			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2958			   rmrr->base_address, rmrr->end_address,
2959			   dmi_get_system_info(DMI_BIOS_VENDOR),
2960			   dmi_get_system_info(DMI_BIOS_VERSION),
2961			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2962		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2963	}
2964
2965	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2966	if (!rmrru)
2967		goto out;
2968
2969	rmrru->hdr = header;
2970
2971	rmrru->base_address = rmrr->base_address;
2972	rmrru->end_address = rmrr->end_address;
2973
2974	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2975				((void *)rmrr) + rmrr->header.length,
2976				&rmrru->devices_cnt);
2977	if (rmrru->devices_cnt && rmrru->devices == NULL)
2978		goto free_rmrru;
2979
2980	list_add(&rmrru->list, &dmar_rmrr_units);
2981
2982	return 0;
2983free_rmrru:
2984	kfree(rmrru);
2985out:
2986	return -ENOMEM;
2987}
2988
2989static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2990{
2991	struct dmar_atsr_unit *atsru;
2992	struct acpi_dmar_atsr *tmp;
2993
2994	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2995				dmar_rcu_check()) {
2996		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2997		if (atsr->segment != tmp->segment)
2998			continue;
2999		if (atsr->header.length != tmp->header.length)
3000			continue;
3001		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3002			return atsru;
3003	}
3004
3005	return NULL;
3006}
3007
3008int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3009{
3010	struct acpi_dmar_atsr *atsr;
3011	struct dmar_atsr_unit *atsru;
3012
3013	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3014		return 0;
3015
3016	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3017	atsru = dmar_find_atsr(atsr);
3018	if (atsru)
3019		return 0;
3020
3021	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3022	if (!atsru)
3023		return -ENOMEM;
3024
3025	/*
3026	 * If memory is allocated from slab by ACPI _DSM method, we need to
3027	 * copy the memory content because the memory buffer will be freed
3028	 * on return.
3029	 */
3030	atsru->hdr = (void *)(atsru + 1);
3031	memcpy(atsru->hdr, hdr, hdr->length);
3032	atsru->include_all = atsr->flags & 0x1;
3033	if (!atsru->include_all) {
3034		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3035				(void *)atsr + atsr->header.length,
3036				&atsru->devices_cnt);
3037		if (atsru->devices_cnt && atsru->devices == NULL) {
3038			kfree(atsru);
3039			return -ENOMEM;
3040		}
3041	}
3042
3043	list_add_rcu(&atsru->list, &dmar_atsr_units);
3044
3045	return 0;
3046}
3047
3048static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3049{
3050	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3051	kfree(atsru);
3052}
3053
3054int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3055{
3056	struct acpi_dmar_atsr *atsr;
3057	struct dmar_atsr_unit *atsru;
3058
3059	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3060	atsru = dmar_find_atsr(atsr);
3061	if (atsru) {
3062		list_del_rcu(&atsru->list);
3063		synchronize_rcu();
3064		intel_iommu_free_atsr(atsru);
3065	}
3066
3067	return 0;
3068}
3069
3070int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3071{
3072	int i;
3073	struct device *dev;
3074	struct acpi_dmar_atsr *atsr;
3075	struct dmar_atsr_unit *atsru;
3076
3077	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3078	atsru = dmar_find_atsr(atsr);
3079	if (!atsru)
3080		return 0;
3081
3082	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3083		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3084					  i, dev)
3085			return -EBUSY;
3086	}
3087
3088	return 0;
3089}
3090
3091static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3092{
3093	struct dmar_satc_unit *satcu;
3094	struct acpi_dmar_satc *tmp;
3095
3096	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3097				dmar_rcu_check()) {
3098		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3099		if (satc->segment != tmp->segment)
3100			continue;
3101		if (satc->header.length != tmp->header.length)
3102			continue;
3103		if (memcmp(satc, tmp, satc->header.length) == 0)
3104			return satcu;
3105	}
3106
3107	return NULL;
3108}
3109
3110int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3111{
3112	struct acpi_dmar_satc *satc;
3113	struct dmar_satc_unit *satcu;
3114
3115	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3116		return 0;
3117
3118	satc = container_of(hdr, struct acpi_dmar_satc, header);
3119	satcu = dmar_find_satc(satc);
3120	if (satcu)
3121		return 0;
3122
3123	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3124	if (!satcu)
3125		return -ENOMEM;
3126
3127	satcu->hdr = (void *)(satcu + 1);
3128	memcpy(satcu->hdr, hdr, hdr->length);
3129	satcu->atc_required = satc->flags & 0x1;
3130	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3131					      (void *)satc + satc->header.length,
3132					      &satcu->devices_cnt);
3133	if (satcu->devices_cnt && !satcu->devices) {
3134		kfree(satcu);
3135		return -ENOMEM;
3136	}
3137	list_add_rcu(&satcu->list, &dmar_satc_units);
3138
3139	return 0;
3140}
3141
3142static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3143{
3144	int sp, ret;
3145	struct intel_iommu *iommu = dmaru->iommu;
3146
3147	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3148	if (ret)
3149		goto out;
3150
3151	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3152		pr_warn("%s: Doesn't support hardware pass through.\n",
3153			iommu->name);
3154		return -ENXIO;
3155	}
3156
3157	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3158	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3159		pr_warn("%s: Doesn't support large page.\n",
3160			iommu->name);
3161		return -ENXIO;
3162	}
3163
3164	/*
3165	 * Disable translation if already enabled prior to OS handover.
3166	 */
3167	if (iommu->gcmd & DMA_GCMD_TE)
3168		iommu_disable_translation(iommu);
3169
3170	ret = iommu_init_domains(iommu);
3171	if (ret == 0)
3172		ret = iommu_alloc_root_entry(iommu);
3173	if (ret)
3174		goto out;
3175
3176	intel_svm_check(iommu);
3177
3178	if (dmaru->ignored) {
3179		/*
3180		 * we always have to disable PMRs or DMA may fail on this device
3181		 */
3182		if (force_on)
3183			iommu_disable_protect_mem_regions(iommu);
3184		return 0;
3185	}
3186
3187	intel_iommu_init_qi(iommu);
3188	iommu_flush_write_buffer(iommu);
3189
3190#ifdef CONFIG_INTEL_IOMMU_SVM
3191	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3192		ret = intel_svm_enable_prq(iommu);
3193		if (ret)
3194			goto disable_iommu;
3195	}
3196#endif
3197	ret = dmar_set_interrupt(iommu);
3198	if (ret)
3199		goto disable_iommu;
3200
3201	iommu_set_root_entry(iommu);
3202	iommu_enable_translation(iommu);
3203
3204	iommu_disable_protect_mem_regions(iommu);
3205	return 0;
3206
3207disable_iommu:
3208	disable_dmar_iommu(iommu);
3209out:
3210	free_dmar_iommu(iommu);
3211	return ret;
3212}
3213
3214int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3215{
3216	int ret = 0;
3217	struct intel_iommu *iommu = dmaru->iommu;
3218
3219	if (!intel_iommu_enabled)
3220		return 0;
3221	if (iommu == NULL)
3222		return -EINVAL;
3223
3224	if (insert) {
3225		ret = intel_iommu_add(dmaru);
3226	} else {
3227		disable_dmar_iommu(iommu);
3228		free_dmar_iommu(iommu);
3229	}
3230
3231	return ret;
3232}
3233
3234static void intel_iommu_free_dmars(void)
3235{
3236	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3237	struct dmar_atsr_unit *atsru, *atsr_n;
3238	struct dmar_satc_unit *satcu, *satc_n;
3239
3240	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3241		list_del(&rmrru->list);
3242		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3243		kfree(rmrru);
3244	}
3245
3246	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3247		list_del(&atsru->list);
3248		intel_iommu_free_atsr(atsru);
3249	}
3250	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3251		list_del(&satcu->list);
3252		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3253		kfree(satcu);
3254	}
3255}
3256
3257static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3258{
3259	struct dmar_satc_unit *satcu;
3260	struct acpi_dmar_satc *satc;
3261	struct device *tmp;
3262	int i;
3263
3264	dev = pci_physfn(dev);
3265	rcu_read_lock();
3266
3267	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3268		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3269		if (satc->segment != pci_domain_nr(dev->bus))
3270			continue;
3271		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3272			if (to_pci_dev(tmp) == dev)
3273				goto out;
3274	}
3275	satcu = NULL;
3276out:
3277	rcu_read_unlock();
3278	return satcu;
3279}
3280
3281static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3282{
3283	int i, ret = 1;
3284	struct pci_bus *bus;
3285	struct pci_dev *bridge = NULL;
3286	struct device *tmp;
3287	struct acpi_dmar_atsr *atsr;
3288	struct dmar_atsr_unit *atsru;
3289	struct dmar_satc_unit *satcu;
3290
3291	dev = pci_physfn(dev);
3292	satcu = dmar_find_matched_satc_unit(dev);
3293	if (satcu)
3294		/*
3295		 * This device supports ATS as it is listed in the SATC table.
3296		 * When the IOMMU is in legacy mode, enabling ATS is done
3297		 * automatically by HW for any device that requires it, hence
3298		 * the OS should not enable ATS for this device, to avoid
3299		 * duplicated TLB invalidations.
3300		 */
3301		return !(satcu->atc_required && !sm_supported(iommu));
3302
3303	for (bus = dev->bus; bus; bus = bus->parent) {
3304		bridge = bus->self;
3305		/* If it's an integrated device, allow ATS */
3306		if (!bridge)
3307			return 1;
3308		/* Connected via non-PCIe: no ATS */
3309		if (!pci_is_pcie(bridge) ||
3310		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3311			return 0;
3312		/* If we found the root port, look it up in the ATSR */
3313		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3314			break;
3315	}
3316
3317	rcu_read_lock();
3318	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3319		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3320		if (atsr->segment != pci_domain_nr(dev->bus))
3321			continue;
3322
3323		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3324			if (tmp == &bridge->dev)
3325				goto out;
3326
3327		if (atsru->include_all)
3328			goto out;
3329	}
3330	ret = 0;
3331out:
3332	rcu_read_unlock();
3333
3334	return ret;
3335}
3336
3337int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3338{
3339	int ret;
3340	struct dmar_rmrr_unit *rmrru;
3341	struct dmar_atsr_unit *atsru;
3342	struct dmar_satc_unit *satcu;
3343	struct acpi_dmar_atsr *atsr;
3344	struct acpi_dmar_reserved_memory *rmrr;
3345	struct acpi_dmar_satc *satc;
3346
3347	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3348		return 0;
3349
3350	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3351		rmrr = container_of(rmrru->hdr,
3352				    struct acpi_dmar_reserved_memory, header);
3353		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3354			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3355				((void *)rmrr) + rmrr->header.length,
3356				rmrr->segment, rmrru->devices,
3357				rmrru->devices_cnt);
3358			if (ret < 0)
3359				return ret;
3360		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3361			dmar_remove_dev_scope(info, rmrr->segment,
3362				rmrru->devices, rmrru->devices_cnt);
3363		}
3364	}
3365
3366	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3367		if (atsru->include_all)
3368			continue;
3369
3370		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3371		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3372			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3373					(void *)atsr + atsr->header.length,
3374					atsr->segment, atsru->devices,
3375					atsru->devices_cnt);
3376			if (ret > 0)
3377				break;
3378			else if (ret < 0)
3379				return ret;
3380		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3381			if (dmar_remove_dev_scope(info, atsr->segment,
3382					atsru->devices, atsru->devices_cnt))
3383				break;
3384		}
3385	}
3386	list_for_each_entry(satcu, &dmar_satc_units, list) {
3387		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3388		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3389			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3390					(void *)satc + satc->header.length,
3391					satc->segment, satcu->devices,
3392					satcu->devices_cnt);
3393			if (ret > 0)
3394				break;
3395			else if (ret < 0)
3396				return ret;
3397		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3398			if (dmar_remove_dev_scope(info, satc->segment,
3399					satcu->devices, satcu->devices_cnt))
3400				break;
3401		}
3402	}
3403
3404	return 0;
3405}
3406
3407static int intel_iommu_memory_notifier(struct notifier_block *nb,
3408				       unsigned long val, void *v)
3409{
3410	struct memory_notify *mhp = v;
3411	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3412	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3413			mhp->nr_pages - 1);
3414
3415	switch (val) {
3416	case MEM_GOING_ONLINE:
3417		if (iommu_domain_identity_map(si_domain,
3418					      start_vpfn, last_vpfn)) {
3419			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3420				start_vpfn, last_vpfn);
3421			return NOTIFY_BAD;
3422		}
3423		break;
3424
3425	case MEM_OFFLINE:
3426	case MEM_CANCEL_ONLINE:
3427		{
3428			struct dmar_drhd_unit *drhd;
3429			struct intel_iommu *iommu;
3430			LIST_HEAD(freelist);
3431
3432			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3433
3434			rcu_read_lock();
3435			for_each_active_iommu(iommu, drhd)
3436				iommu_flush_iotlb_psi(iommu, si_domain,
3437					start_vpfn, mhp->nr_pages,
3438					list_empty(&freelist), 0);
3439			rcu_read_unlock();
3440			put_pages_list(&freelist);
3441		}
3442		break;
3443	}
3444
3445	return NOTIFY_OK;
3446}
3447
3448static struct notifier_block intel_iommu_memory_nb = {
3449	.notifier_call = intel_iommu_memory_notifier,
3450	.priority = 0
3451};
3452
3453static void intel_disable_iommus(void)
3454{
3455	struct intel_iommu *iommu = NULL;
3456	struct dmar_drhd_unit *drhd;
3457
3458	for_each_iommu(iommu, drhd)
3459		iommu_disable_translation(iommu);
3460}
3461
3462void intel_iommu_shutdown(void)
3463{
3464	struct dmar_drhd_unit *drhd;
3465	struct intel_iommu *iommu = NULL;
3466
3467	if (no_iommu || dmar_disabled)
3468		return;
3469
3470	down_write(&dmar_global_lock);
3471
3472	/* Disable PMRs explicitly here. */
3473	for_each_iommu(iommu, drhd)
3474		iommu_disable_protect_mem_regions(iommu);
3475
3476	/* Make sure the IOMMUs are switched off */
3477	intel_disable_iommus();
3478
3479	up_write(&dmar_global_lock);
3480}
3481
3482static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3483{
3484	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3485
3486	return container_of(iommu_dev, struct intel_iommu, iommu);
3487}
3488
3489static ssize_t version_show(struct device *dev,
3490			    struct device_attribute *attr, char *buf)
3491{
3492	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3493	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3494	return sysfs_emit(buf, "%d:%d\n",
3495			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3496}
3497static DEVICE_ATTR_RO(version);
3498
3499static ssize_t address_show(struct device *dev,
3500			    struct device_attribute *attr, char *buf)
3501{
3502	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3503	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3504}
3505static DEVICE_ATTR_RO(address);
3506
3507static ssize_t cap_show(struct device *dev,
3508			struct device_attribute *attr, char *buf)
3509{
3510	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3511	return sysfs_emit(buf, "%llx\n", iommu->cap);
3512}
3513static DEVICE_ATTR_RO(cap);
3514
3515static ssize_t ecap_show(struct device *dev,
3516			 struct device_attribute *attr, char *buf)
3517{
3518	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3519	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3520}
3521static DEVICE_ATTR_RO(ecap);
3522
3523static ssize_t domains_supported_show(struct device *dev,
3524				      struct device_attribute *attr, char *buf)
3525{
3526	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3527	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3528}
3529static DEVICE_ATTR_RO(domains_supported);
3530
3531static ssize_t domains_used_show(struct device *dev,
3532				 struct device_attribute *attr, char *buf)
3533{
3534	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3535	return sysfs_emit(buf, "%d\n",
3536			  bitmap_weight(iommu->domain_ids,
3537					cap_ndoms(iommu->cap)));
3538}
3539static DEVICE_ATTR_RO(domains_used);
3540
3541static struct attribute *intel_iommu_attrs[] = {
3542	&dev_attr_version.attr,
3543	&dev_attr_address.attr,
3544	&dev_attr_cap.attr,
3545	&dev_attr_ecap.attr,
3546	&dev_attr_domains_supported.attr,
3547	&dev_attr_domains_used.attr,
3548	NULL,
3549};
3550
3551static struct attribute_group intel_iommu_group = {
3552	.name = "intel-iommu",
3553	.attrs = intel_iommu_attrs,
3554};
3555
3556const struct attribute_group *intel_iommu_groups[] = {
3557	&intel_iommu_group,
3558	NULL,
3559};
3560
3561static bool has_external_pci(void)
3562{
3563	struct pci_dev *pdev = NULL;
3564
3565	for_each_pci_dev(pdev)
3566		if (pdev->external_facing) {
3567			pci_dev_put(pdev);
3568			return true;
3569		}
3570
3571	return false;
3572}
3573
3574static int __init platform_optin_force_iommu(void)
3575{
3576	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3577		return 0;
3578
3579	if (no_iommu || dmar_disabled)
3580		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3581
3582	/*
3583	 * If Intel-IOMMU is disabled by default, we will apply identity
3584	 * map for all devices except those marked as being untrusted.
3585	 */
3586	if (dmar_disabled)
3587		iommu_set_default_passthrough(false);
3588
3589	dmar_disabled = 0;
3590	no_iommu = 0;
3591
3592	return 1;
3593}
3594
3595static int __init probe_acpi_namespace_devices(void)
3596{
3597	struct dmar_drhd_unit *drhd;
3598	/* To avoid a -Wunused-but-set-variable warning. */
3599	struct intel_iommu *iommu __maybe_unused;
3600	struct device *dev;
3601	int i, ret = 0;
3602
3603	for_each_active_iommu(iommu, drhd) {
3604		for_each_active_dev_scope(drhd->devices,
3605					  drhd->devices_cnt, i, dev) {
3606			struct acpi_device_physical_node *pn;
3607			struct acpi_device *adev;
3608
3609			if (dev->bus != &acpi_bus_type)
3610				continue;
3611
3612			adev = to_acpi_device(dev);
3613			mutex_lock(&adev->physical_node_lock);
3614			list_for_each_entry(pn,
3615					    &adev->physical_node_list, node) {
3616				ret = iommu_probe_device(pn->dev);
3617				if (ret)
3618					break;
3619			}
3620			mutex_unlock(&adev->physical_node_lock);
3621
3622			if (ret)
3623				return ret;
3624		}
3625	}
3626
3627	return 0;
3628}
3629
3630static __init int tboot_force_iommu(void)
3631{
3632	if (!tboot_enabled())
3633		return 0;
3634
3635	if (no_iommu || dmar_disabled)
3636		pr_warn("Forcing Intel-IOMMU to enabled\n");
3637
3638	dmar_disabled = 0;
3639	no_iommu = 0;
3640
3641	return 1;
3642}
3643
3644int __init intel_iommu_init(void)
3645{
3646	int ret = -ENODEV;
3647	struct dmar_drhd_unit *drhd;
3648	struct intel_iommu *iommu;
3649
3650	/*
3651	 * Intel IOMMU is required for a TXT/tboot launch or platform
3652	 * opt in, so enforce that.
3653	 */
3654	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3655		    platform_optin_force_iommu();
3656
3657	down_write(&dmar_global_lock);
3658	if (dmar_table_init()) {
3659		if (force_on)
3660			panic("tboot: Failed to initialize DMAR table\n");
3661		goto out_free_dmar;
3662	}
3663
3664	if (dmar_dev_scope_init() < 0) {
3665		if (force_on)
3666			panic("tboot: Failed to initialize DMAR device scope\n");
3667		goto out_free_dmar;
3668	}
3669
3670	up_write(&dmar_global_lock);
3671
3672	/*
3673	 * The bus notifier takes the dmar_global_lock, so lockdep will
3674	 * complain later when we register it under the lock.
3675	 */
3676	dmar_register_bus_notifier();
3677
3678	down_write(&dmar_global_lock);
3679
3680	if (!no_iommu)
3681		intel_iommu_debugfs_init();
3682
3683	if (no_iommu || dmar_disabled) {
3684		/*
3685		 * We exit the function here to ensure the IOMMU's remapping and
3686		 * mempool aren't set up, which means that the IOMMU's PMRs
3687		 * won't be disabled via the call to init_dmars(). So disable
3688		 * them explicitly here. The PMRs were set up by tboot prior to
3689		 * calling SENTER, but the kernel is expected to reset/tear
3690		 * down the PMRs.
3691		 */
3692		if (intel_iommu_tboot_noforce) {
3693			for_each_iommu(iommu, drhd)
3694				iommu_disable_protect_mem_regions(iommu);
3695		}
3696
3697		/*
3698		 * Make sure the IOMMUs are switched off, even when we
3699		 * boot into a kexec kernel and the previous kernel left
3700		 * them enabled
3701		 */
3702		intel_disable_iommus();
3703		goto out_free_dmar;
3704	}
3705
3706	if (list_empty(&dmar_rmrr_units))
3707		pr_info("No RMRR found\n");
3708
3709	if (list_empty(&dmar_atsr_units))
3710		pr_info("No ATSR found\n");
3711
3712	if (list_empty(&dmar_satc_units))
3713		pr_info("No SATC found\n");
3714
3715	init_no_remapping_devices();
3716
3717	ret = init_dmars();
3718	if (ret) {
3719		if (force_on)
3720			panic("tboot: Failed to initialize DMARs\n");
3721		pr_err("Initialization failed\n");
3722		goto out_free_dmar;
3723	}
3724	up_write(&dmar_global_lock);
3725
3726	init_iommu_pm_ops();
3727
3728	down_read(&dmar_global_lock);
3729	for_each_active_iommu(iommu, drhd) {
3730		/*
3731		 * The flush queue implementation does not perform
3732		 * page-selective invalidations that are required for efficient
3733		 * TLB flushes in virtual environments.  The benefit of batching
3734		 * is likely to be much lower than the overhead of synchronizing
3735		 * the virtual and physical IOMMU page-tables.
3736		 */
3737		if (cap_caching_mode(iommu->cap) &&
3738		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3739			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3740			iommu_set_dma_strict();
3741		}
3742		iommu_device_sysfs_add(&iommu->iommu, NULL,
3743				       intel_iommu_groups,
3744				       "%s", iommu->name);
3745		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3746
3747		iommu_pmu_register(iommu);
3748	}
3749	up_read(&dmar_global_lock);
3750
3751	if (si_domain && !hw_pass_through)
3752		register_memory_notifier(&intel_iommu_memory_nb);
3753
3754	down_read(&dmar_global_lock);
3755	if (probe_acpi_namespace_devices())
3756		pr_warn("ACPI name space devices didn't probe correctly\n");
3757
3758	/* Finally, we enable the DMA remapping hardware. */
3759	for_each_iommu(iommu, drhd) {
3760		if (!drhd->ignored && !translation_pre_enabled(iommu))
3761			iommu_enable_translation(iommu);
3762
3763		iommu_disable_protect_mem_regions(iommu);
3764	}
3765	up_read(&dmar_global_lock);
3766
3767	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3768
3769	intel_iommu_enabled = 1;
3770
3771	return 0;
3772
3773out_free_dmar:
3774	intel_iommu_free_dmars();
3775	up_write(&dmar_global_lock);
3776	return ret;
3777}
3778
3779static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3780{
3781	struct device_domain_info *info = opaque;
3782
3783	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3784	return 0;
3785}
3786
3787/*
3788 * NB - intel-iommu lacks any sort of reference counting for the users of
3789 * dependent devices.  If multiple endpoints have intersecting dependent
3790 * devices, unbinding the driver from any one of them will possibly leave
3791 * the others unable to operate.
3792 */
3793static void domain_context_clear(struct device_domain_info *info)
3794{
3795	if (!dev_is_pci(info->dev)) {
3796		domain_context_clear_one(info, info->bus, info->devfn);
		return;
	}
3797
3798	pci_for_each_dma_alias(to_pci_dev(info->dev),
3799			       &domain_context_clear_one_cb, info);
3800}
3801
3802static void dmar_remove_one_dev_info(struct device *dev)
3803{
3804	struct device_domain_info *info = dev_iommu_priv_get(dev);
3805	struct dmar_domain *domain = info->domain;
3806	struct intel_iommu *iommu = info->iommu;
3807	unsigned long flags;
3808
3809	if (!dev_is_real_dma_subdevice(info->dev)) {
3810		if (dev_is_pci(info->dev) && sm_supported(iommu))
3811			intel_pasid_tear_down_entry(iommu, info->dev,
3812					IOMMU_NO_PASID, false);
3813
3814		iommu_disable_pci_caps(info);
3815		domain_context_clear(info);
3816	}
3817
3818	spin_lock_irqsave(&domain->lock, flags);
3819	list_del(&info->link);
3820	spin_unlock_irqrestore(&domain->lock, flags);
3821
3822	domain_detach_iommu(domain, iommu);
3823	info->domain = NULL;
3824}
3825
3826/*
3827 * Clear the page table pointer in context or pasid table entries so that
3828 * all DMA requests without PASID from the device are blocked. If the page
3829 * table has been set, clean up the data structures.
3830 */
3831void device_block_translation(struct device *dev)
3832{
3833	struct device_domain_info *info = dev_iommu_priv_get(dev);
3834	struct intel_iommu *iommu = info->iommu;
3835	unsigned long flags;
3836
3837	iommu_disable_pci_caps(info);
3838	if (!dev_is_real_dma_subdevice(dev)) {
3839		if (sm_supported(iommu))
3840			intel_pasid_tear_down_entry(iommu, dev,
3841						    IOMMU_NO_PASID, false);
3842		else
3843			domain_context_clear(info);
3844	}
3845
3846	if (!info->domain)
3847		return;
3848
3849	spin_lock_irqsave(&info->domain->lock, flags);
3850	list_del(&info->link);
3851	spin_unlock_irqrestore(&info->domain->lock, flags);
3852
3853	domain_detach_iommu(info->domain, iommu);
3854	info->domain = NULL;
3855}
3856
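/*
 * Initialize the address width and allocate the top-level page table of
 * a domain created through the iommu core domain_alloc path.
 */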
3857static int md_domain_init(struct dmar_domain *domain, int guest_width)
3858{
3859	int adjust_width;
3860
3861	/* calculate AGAW */
3862	domain->gaw = guest_width;
3863	adjust_width = guestwidth_to_adjustwidth(guest_width);
3864	domain->agaw = width_to_agaw(adjust_width);
3865
3866	domain->iommu_coherency = false;
3867	domain->iommu_superpage = 0;
3868	domain->max_addr = 0;
3869
3870	/* always allocate the top pgd */
3871	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3872	if (!domain->pgd)
3873		return -ENOMEM;
3874	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3875	return 0;
3876}
3877
3878static int blocking_domain_attach_dev(struct iommu_domain *domain,
3879				      struct device *dev)
3880{
3881	device_block_translation(dev);
3882	return 0;
3883}
3884
3885static struct iommu_domain blocking_domain = {
3886	.type = IOMMU_DOMAIN_BLOCKED,
3887	.ops = &(const struct iommu_domain_ops) {
3888		.attach_dev	= blocking_domain_attach_dev,
3889	}
3890};
3891
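/*
 * Allocate and initialize a paging domain for DMA or unmanaged use,
 * return the static identity domain for identity requests, or hand SVA
 * requests over to the SVM code.
 */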
3892static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3893{
3894	struct dmar_domain *dmar_domain;
3895	struct iommu_domain *domain;
3896
3897	switch (type) {
3898	case IOMMU_DOMAIN_DMA:
3899	case IOMMU_DOMAIN_UNMANAGED:
3900		dmar_domain = alloc_domain(type);
3901		if (!dmar_domain) {
3902			pr_err("Can't allocate dmar_domain\n");
3903			return NULL;
3904		}
3905		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3906			pr_err("Domain initialization failed\n");
3907			domain_exit(dmar_domain);
3908			return NULL;
3909		}
3910
3911		domain = &dmar_domain->domain;
3912		domain->geometry.aperture_start = 0;
3913		domain->geometry.aperture_end   =
3914				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3915		domain->geometry.force_aperture = true;
3916
3917		return domain;
3918	case IOMMU_DOMAIN_IDENTITY:
3919		return &si_domain->domain;
3920	case IOMMU_DOMAIN_SVA:
3921		return intel_svm_domain_alloc();
3922	default:
3923		return NULL;
3924	}
3925
3926	return NULL;
3927}
3928
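/*
 * Allocate a domain on behalf of user space (iommufd). Depending on the
 * flags, the domain may be created as a nesting parent or with dirty
 * tracking enabled; if a parent is supplied, a nested domain is
 * allocated instead.
 */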
3929static struct iommu_domain *
3930intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3931			      struct iommu_domain *parent,
3932			      const struct iommu_user_data *user_data)
3933{
3934	struct device_domain_info *info = dev_iommu_priv_get(dev);
3935	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3936	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3937	struct intel_iommu *iommu = info->iommu;
3938	struct dmar_domain *dmar_domain;
3939	struct iommu_domain *domain;
3940
3941	/* Must be NESTING domain */
3942	if (parent) {
3943		if (!nested_supported(iommu) || flags)
3944			return ERR_PTR(-EOPNOTSUPP);
3945		return intel_nested_domain_alloc(parent, user_data);
3946	}
3947
3948	if (flags &
3949	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3950		return ERR_PTR(-EOPNOTSUPP);
3951	if (nested_parent && !nested_supported(iommu))
3952		return ERR_PTR(-EOPNOTSUPP);
3953	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3954		return ERR_PTR(-EOPNOTSUPP);
3955
3956	/*
3957	 * The domain_alloc_user op must return a fully initialized domain,
3958	 * so use iommu_domain_alloc() here for simplicity.
3959	 */
3960	domain = iommu_domain_alloc(dev->bus);
3961	if (!domain)
3962		return ERR_PTR(-ENOMEM);
3963
3964	dmar_domain = to_dmar_domain(domain);
3965
3966	if (nested_parent) {
3967		dmar_domain->nested_parent = true;
3968		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3969		spin_lock_init(&dmar_domain->s1_lock);
3970	}
3971
3972	if (dirty_tracking) {
3973		if (dmar_domain->use_first_level) {
3974			iommu_domain_free(domain);
3975			return ERR_PTR(-EOPNOTSUPP);
3976		}
3977		domain->dirty_ops = &intel_dirty_ops;
3978	}
3979
3980	return domain;
3981}
3982
3983static void intel_iommu_domain_free(struct iommu_domain *domain)
3984{
3985	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3986
3987	WARN_ON(dmar_domain->nested_parent &&
3988		!list_empty(&dmar_domain->s1_domains));
3989	if (domain != &si_domain->domain)
3990		domain_exit(dmar_domain);
3991}
3992
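/*
 * Verify that the domain is compatible with the attaching device's IOMMU
 * and, if necessary, shrink the domain's address width and drop extra
 * page table levels so that the IOMMU can walk them.
 */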
3993int prepare_domain_attach_device(struct iommu_domain *domain,
3994				 struct device *dev)
3995{
3996	struct device_domain_info *info = dev_iommu_priv_get(dev);
3997	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3998	struct intel_iommu *iommu = info->iommu;
3999	int addr_width;
4000
4001	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4002		return -EINVAL;
4003
4004	if (domain->dirty_ops && !ssads_supported(iommu))
4005		return -EINVAL;
4006
4007	/* check if this iommu agaw is sufficient for max mapped address */
4008	addr_width = agaw_to_width(iommu->agaw);
4009	if (addr_width > cap_mgaw(iommu->cap))
4010		addr_width = cap_mgaw(iommu->cap);
4011
4012	if (dmar_domain->max_addr > (1LL << addr_width))
4013		return -EINVAL;
4014	dmar_domain->gaw = addr_width;
4015
4016	/*
4017	 * Knock out extra levels of page tables if necessary
4018	 */
4019	while (iommu->agaw < dmar_domain->agaw) {
4020		struct dma_pte *pte;
4021
4022		pte = dmar_domain->pgd;
4023		if (dma_pte_present(pte)) {
4024			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4025			free_pgtable_page(pte);
4026		}
4027		dmar_domain->agaw--;
4028	}
4029
4030	return 0;
4031}
4032
4033static int intel_iommu_attach_device(struct iommu_domain *domain,
4034				     struct device *dev)
4035{
4036	struct device_domain_info *info = dev_iommu_priv_get(dev);
4037	int ret;
4038
4039	if (info->domain)
4040		device_block_translation(dev);
4041
4042	ret = prepare_domain_attach_device(domain, dev);
4043	if (ret)
4044		return ret;
4045
4046	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4047}
4048
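/*
 * Map a physically contiguous range at @iova into the domain's page
 * table after checking that it fits within the domain's address width.
 */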
4049static int intel_iommu_map(struct iommu_domain *domain,
4050			   unsigned long iova, phys_addr_t hpa,
4051			   size_t size, int iommu_prot, gfp_t gfp)
4052{
4053	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4054	u64 max_addr;
4055	int prot = 0;
4056
4057	if (iommu_prot & IOMMU_READ)
4058		prot |= DMA_PTE_READ;
4059	if (iommu_prot & IOMMU_WRITE)
4060		prot |= DMA_PTE_WRITE;
4061	if (dmar_domain->set_pte_snp)
4062		prot |= DMA_PTE_SNP;
4063
4064	max_addr = iova + size;
4065	if (dmar_domain->max_addr < max_addr) {
4066		u64 end;
4067
4068		/* check if minimum agaw is sufficient for mapped address */
4069		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4070		if (end < max_addr) {
4071			pr_err("%s: iommu width (%d) is not "
4072			       "sufficient for the mapped address (%llx)\n",
4073			       __func__, dmar_domain->gaw, max_addr);
4074			return -EFAULT;
4075		}
4076		dmar_domain->max_addr = max_addr;
4077	}
4078	/* Round up the size to the next multiple of VTD_PAGE_SIZE if it and
4079	   the low bits of hpa would take us onto the next page */
4080	size = aligned_nrpages(hpa, size);
4081	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4082				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4083}
4084
4085static int intel_iommu_map_pages(struct iommu_domain *domain,
4086				 unsigned long iova, phys_addr_t paddr,
4087				 size_t pgsize, size_t pgcount,
4088				 int prot, gfp_t gfp, size_t *mapped)
4089{
4090	unsigned long pgshift = __ffs(pgsize);
4091	size_t size = pgcount << pgshift;
4092	int ret;
4093
4094	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4095		return -EINVAL;
4096
4097	if (!IS_ALIGNED(iova | paddr, pgsize))
4098		return -EINVAL;
4099
4100	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4101	if (!ret && mapped)
4102		*mapped = size;
4103
4104	return ret;
4105}
4106
4107static size_t intel_iommu_unmap(struct iommu_domain *domain,
4108				unsigned long iova, size_t size,
4109				struct iommu_iotlb_gather *gather)
4110{
4111	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4112	unsigned long start_pfn, last_pfn;
4113	int level = 0;
4114
4115	/* Cope with horrid API which requires us to unmap more than the
4116	   size argument if it happens to be a large-page mapping. */
4117	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4118				     &level, GFP_ATOMIC)))
4119		return 0;
4120
4121	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4122		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4123
4124	start_pfn = iova >> VTD_PAGE_SHIFT;
4125	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4126
4127	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4128
4129	if (dmar_domain->max_addr == iova + size)
4130		dmar_domain->max_addr = iova;
4131
4132	/*
4133	 * We do not use page-selective IOTLB invalidation in flush queue,
4134	 * so there is no need to track page and sync iotlb.
4135	 */
4136	if (!iommu_iotlb_gather_queued(gather))
4137		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4138
4139	return size;
4140}
4141
4142static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4143				      unsigned long iova,
4144				      size_t pgsize, size_t pgcount,
4145				      struct iommu_iotlb_gather *gather)
4146{
4147	unsigned long pgshift = __ffs(pgsize);
4148	size_t size = pgcount << pgshift;
4149
4150	return intel_iommu_unmap(domain, iova, size, gather);
4151}
4152
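/*
 * Flush the IOTLBs of all IOMMUs that the domain is attached to for the
 * range collected in @gather, then free the page-table pages queued on
 * its freelist.
 */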
4153static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4154				 struct iommu_iotlb_gather *gather)
4155{
4156	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4157	unsigned long iova_pfn = IOVA_PFN(gather->start);
4158	size_t size = gather->end - gather->start;
4159	struct iommu_domain_info *info;
4160	unsigned long start_pfn;
4161	unsigned long nrpages;
4162	unsigned long i;
4163
4164	nrpages = aligned_nrpages(gather->start, size);
4165	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4166
4167	xa_for_each(&dmar_domain->iommu_array, i, info)
4168		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4169				      start_pfn, nrpages,
4170				      list_empty(&gather->freelist), 0);
4171
4172	if (dmar_domain->nested_parent)
4173		parent_domain_flush(dmar_domain, start_pfn, nrpages,
4174				    list_empty(&gather->freelist));
4175	put_pages_list(&gather->freelist);
4176}
4177
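/* Walk the page table and translate an IOVA into a physical address. */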
4178static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4179					    dma_addr_t iova)
4180{
4181	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4182	struct dma_pte *pte;
4183	int level = 0;
4184	u64 phys = 0;
4185
4186	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4187			     GFP_ATOMIC);
4188	if (pte && dma_pte_present(pte))
4189		phys = dma_pte_addr(pte) +
4190			(iova & (BIT_MASK(level_to_offset_bits(level) +
4191						VTD_PAGE_SHIFT) - 1));
4192
4193	return phys;
4194}
4195
4196static bool domain_support_force_snooping(struct dmar_domain *domain)
4197{
4198	struct device_domain_info *info;
4199	bool support = true;
4200
4201	assert_spin_locked(&domain->lock);
4202	list_for_each_entry(info, &domain->devices, link) {
4203		if (!ecap_sc_support(info->iommu->ecap)) {
4204			support = false;
4205			break;
4206		}
4207	}
4208
4209	return support;
4210}
4211
4212static void domain_set_force_snooping(struct dmar_domain *domain)
4213{
4214	struct device_domain_info *info;
4215
4216	assert_spin_locked(&domain->lock);
4217	/*
4218	 * Second level page table supports per-PTE snoop control. The
4219	 * iommu_map() interface will handle this by setting SNP bit.
4220	 */
4221	if (!domain->use_first_level) {
4222		domain->set_pte_snp = true;
4223		return;
4224	}
4225
4226	list_for_each_entry(info, &domain->devices, link)
4227		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4228						     IOMMU_NO_PASID);
4229}
4230
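/*
 * Force snooping of DMA accesses for the domain: second-level page
 * tables get the SNP bit set in each new PTE, while first-level usage
 * relies on page snoop control in the PASID entries.
 */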
4231static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4232{
4233	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4234	unsigned long flags;
4235
4236	if (dmar_domain->force_snooping)
4237		return true;
4238
4239	spin_lock_irqsave(&dmar_domain->lock, flags);
4240	if (!domain_support_force_snooping(dmar_domain) ||
4241	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4242		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4243		return false;
4244	}
4245
4246	domain_set_force_snooping(dmar_domain);
4247	dmar_domain->force_snooping = true;
4248	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4249
4250	return true;
4251}
4252
4253static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4254{
4255	struct device_domain_info *info = dev_iommu_priv_get(dev);
4256
4257	switch (cap) {
4258	case IOMMU_CAP_CACHE_COHERENCY:
4259	case IOMMU_CAP_DEFERRED_FLUSH:
4260		return true;
4261	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4262		return dmar_platform_optin();
4263	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4264		return ecap_sc_support(info->iommu->ecap);
4265	case IOMMU_CAP_DIRTY_TRACKING:
4266		return ssads_supported(info->iommu);
4267	default:
4268		return false;
4269	}
4270}
4271
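/*
 * Set up the per-device IOMMU private data: look up the IOMMU that owns
 * the device, record its bus/devfn/segment, probe ATS, PASID and PRI
 * capabilities, and allocate a PASID table when scalable mode is in use.
 */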
4272static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4273{
4274	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4275	struct device_domain_info *info;
4276	struct intel_iommu *iommu;
4277	u8 bus, devfn;
4278	int ret;
4279
4280	iommu = device_lookup_iommu(dev, &bus, &devfn);
4281	if (!iommu || !iommu->iommu.ops)
4282		return ERR_PTR(-ENODEV);
4283
4284	info = kzalloc(sizeof(*info), GFP_KERNEL);
4285	if (!info)
4286		return ERR_PTR(-ENOMEM);
4287
4288	if (dev_is_real_dma_subdevice(dev)) {
4289		info->bus = pdev->bus->number;
4290		info->devfn = pdev->devfn;
4291		info->segment = pci_domain_nr(pdev->bus);
4292	} else {
4293		info->bus = bus;
4294		info->devfn = devfn;
4295		info->segment = iommu->segment;
4296	}
4297
4298	info->dev = dev;
4299	info->iommu = iommu;
4300	if (dev_is_pci(dev)) {
4301		if (ecap_dev_iotlb_support(iommu->ecap) &&
4302		    pci_ats_supported(pdev) &&
4303		    dmar_ats_supported(pdev, iommu)) {
4304			info->ats_supported = 1;
4305			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4306
4307			/*
4308			 * For IOMMU that supports device IOTLB throttling
4309			 * (DIT), we assign PFSID to the invalidation desc
4310			 * of a VF such that IOMMU HW can gauge queue depth
4311			 * at PF level. If DIT is not set, PFSID will be
4312			 * treated as reserved, which should be set to 0.
4313			 */
4314			if (ecap_dit(iommu->ecap))
4315				info->pfsid = pci_dev_id(pci_physfn(pdev));
4316			info->ats_qdep = pci_ats_queue_depth(pdev);
4317		}
4318		if (sm_supported(iommu)) {
4319			if (pasid_supported(iommu)) {
4320				int features = pci_pasid_features(pdev);
4321
4322				if (features >= 0)
4323					info->pasid_supported = features | 1;
4324			}
4325
4326			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4327			    pci_pri_supported(pdev))
4328				info->pri_supported = 1;
4329		}
4330	}
4331
4332	dev_iommu_priv_set(dev, info);
4333
4334	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4335		ret = intel_pasid_alloc_table(dev);
4336		if (ret) {
4337			dev_err(dev, "PASID table allocation failed\n");
4338			kfree(info);
4339			return ERR_PTR(ret);
4340		}
4341	}
4342
4343	intel_iommu_debugfs_create_dev(info);
4344
4345	return &iommu->iommu;
4346}
4347
4348static void intel_iommu_release_device(struct device *dev)
4349{
4350	struct device_domain_info *info = dev_iommu_priv_get(dev);
4351
4352	dmar_remove_one_dev_info(dev);
4353	intel_pasid_free_table(dev);
4354	intel_iommu_debugfs_remove_dev(info);
4355	kfree(info);
4356	set_dma_ops(dev, NULL);
4357}
4358
4359static void intel_iommu_probe_finalize(struct device *dev)
4360{
4361	set_dma_ops(dev, NULL);
4362	iommu_setup_dma_ops(dev, 0, U64_MAX);
4363}
4364
4365static void intel_iommu_get_resv_regions(struct device *device,
4366					 struct list_head *head)
4367{
4368	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4369	struct iommu_resv_region *reg;
4370	struct dmar_rmrr_unit *rmrr;
4371	struct device *i_dev;
4372	int i;
4373
4374	rcu_read_lock();
4375	for_each_rmrr_units(rmrr) {
4376		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4377					  i, i_dev) {
4378			struct iommu_resv_region *resv;
4379			enum iommu_resv_type type;
4380			size_t length;
4381
4382			if (i_dev != device &&
4383			    !is_downstream_to_pci_bridge(device, i_dev))
4384				continue;
4385
4386			length = rmrr->end_address - rmrr->base_address + 1;
4387
4388			type = device_rmrr_is_relaxable(device) ?
4389				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4390
4391			resv = iommu_alloc_resv_region(rmrr->base_address,
4392						       length, prot, type,
4393						       GFP_ATOMIC);
4394			if (!resv)
4395				break;
4396
4397			list_add_tail(&resv->list, head);
4398		}
4399	}
4400	rcu_read_unlock();
4401
4402#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4403	if (dev_is_pci(device)) {
4404		struct pci_dev *pdev = to_pci_dev(device);
4405
4406		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4407			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4408					IOMMU_RESV_DIRECT_RELAXABLE,
4409					GFP_KERNEL);
4410			if (reg)
4411				list_add_tail(&reg->list, head);
4412		}
4413	}
4414#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4415
4416	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4417				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4418				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4419	if (!reg)
4420		return;
4421	list_add_tail(&reg->list, head);
4422}
4423
4424static struct iommu_group *intel_iommu_device_group(struct device *dev)
4425{
4426	if (dev_is_pci(dev))
4427		return pci_device_group(dev);
4428	return generic_device_group(dev);
4429}
4430
4431static int intel_iommu_enable_sva(struct device *dev)
4432{
4433	struct device_domain_info *info = dev_iommu_priv_get(dev);
4434	struct intel_iommu *iommu;
4435
4436	if (!info || dmar_disabled)
4437		return -EINVAL;
4438
4439	iommu = info->iommu;
4440	if (!iommu)
4441		return -EINVAL;
4442
4443	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4444		return -ENODEV;
4445
4446	if (!info->pasid_enabled || !info->ats_enabled)
4447		return -EINVAL;
4448
4449	/*
4450	 * Devices that have device-specific I/O fault handling should not
4451	 * support PCI/PRI. The IOMMU side has no means to check the
4452	 * capability of device-specific IOPF.  Therefore, the IOMMU can only
4453	 * assume that if the device driver enables SVA on a non-PRI
4454	 * device, it will handle IOPF in its own way.
4455	 */
4456	if (!info->pri_supported)
4457		return 0;
4458
4459	/* Devices supporting PRI should have it enabled. */
4460	if (!info->pri_enabled)
4461		return -EINVAL;
4462
4463	return 0;
4464}
4465
4466static int intel_iommu_enable_iopf(struct device *dev)
4467{
4468	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4469	struct device_domain_info *info = dev_iommu_priv_get(dev);
4470	struct intel_iommu *iommu;
4471	int ret;
4472
4473	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4474		return -ENODEV;
4475
4476	if (info->pri_enabled)
4477		return -EBUSY;
4478
4479	iommu = info->iommu;
4480	if (!iommu)
4481		return -EINVAL;
4482
4483	/* PASID is required in PRG Response Message. */
4484	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4485		return -EINVAL;
4486
4487	ret = pci_reset_pri(pdev);
4488	if (ret)
4489		return ret;
4490
4491	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4492	if (ret)
4493		return ret;
4494
4495	ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4496	if (ret)
4497		goto iopf_remove_device;
4498
4499	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4500	if (ret)
4501		goto iopf_unregister_handler;
4502	info->pri_enabled = 1;
4503
4504	return 0;
4505
4506iopf_unregister_handler:
4507	iommu_unregister_device_fault_handler(dev);
4508iopf_remove_device:
4509	iopf_queue_remove_device(iommu->iopf_queue, dev);
4510
4511	return ret;
4512}
4513
4514static int intel_iommu_disable_iopf(struct device *dev)
4515{
4516	struct device_domain_info *info = dev_iommu_priv_get(dev);
4517	struct intel_iommu *iommu = info->iommu;
4518
4519	if (!info->pri_enabled)
4520		return -EINVAL;
4521
4522	/*
4523	 * The PCIe spec states that once the PRI enable bit is cleared, the
4524	 * Page Request Interface issues no new page requests but may still
4525	 * have outstanding page requests that have been transmitted or are
4526	 * queued for transmission. This is supposed to be called after
4527	 * the device driver has stopped DMA, all PASIDs have been
4528	 * unbound and the outstanding PRQs have been drained.
4529	 */
4530	pci_disable_pri(to_pci_dev(dev));
4531	info->pri_enabled = 0;
4532
4533	/*
4534	 * With PRI disabled and outstanding PRQs drained, unregistering
4535	 * fault handler and removing device from iopf queue should never
4536	 * fail.
4537	 */
4538	WARN_ON(iommu_unregister_device_fault_handler(dev));
4539	WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4540
4541	return 0;
4542}
4543
4544static int
4545intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4546{
4547	switch (feat) {
4548	case IOMMU_DEV_FEAT_IOPF:
4549		return intel_iommu_enable_iopf(dev);
4550
4551	case IOMMU_DEV_FEAT_SVA:
4552		return intel_iommu_enable_sva(dev);
4553
4554	default:
4555		return -ENODEV;
4556	}
4557}
4558
4559static int
4560intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4561{
4562	switch (feat) {
4563	case IOMMU_DEV_FEAT_IOPF:
4564		return intel_iommu_disable_iopf(dev);
4565
4566	case IOMMU_DEV_FEAT_SVA:
4567		return 0;
4568
4569	default:
4570		return -ENODEV;
4571	}
4572}
4573
4574static bool intel_iommu_is_attach_deferred(struct device *dev)
4575{
4576	struct device_domain_info *info = dev_iommu_priv_get(dev);
4577
4578	return translation_pre_enabled(info->iommu) && !info->domain;
4579}
4580
4581/*
4582 * Check that the device does not live on an external facing PCI port that is
4583 * marked as untrusted. Such devices should not be able to apply quirks and
4584 * thus not be able to bypass the IOMMU restrictions.
4585 */
4586static bool risky_device(struct pci_dev *pdev)
4587{
4588	if (pdev->untrusted) {
4589		pci_info(pdev,
4590			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4591			 pdev->vendor, pdev->device);
4592		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4593		return true;
4594	}
4595	return false;
4596}
4597
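/*
 * Make newly created mappings visible to the hardware, flushing caches
 * or write buffers as required by each IOMMU the domain is attached to.
 */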
4598static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4599				      unsigned long iova, size_t size)
4600{
4601	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4602	unsigned long pages = aligned_nrpages(iova, size);
4603	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4604	struct iommu_domain_info *info;
4605	unsigned long i;
4606
4607	xa_for_each(&dmar_domain->iommu_array, i, info)
4608		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4609	return 0;
4610}
4611
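/*
 * Detach the domain from a {device, PASID} pair: remove the bookkeeping
 * for the pasid, tear down its PASID table entry and drain any pending
 * page requests for it.
 */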
4612static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4613{
4614	struct device_domain_info *info = dev_iommu_priv_get(dev);
4615	struct dev_pasid_info *curr, *dev_pasid = NULL;
4616	struct intel_iommu *iommu = info->iommu;
4617	struct dmar_domain *dmar_domain;
4618	struct iommu_domain *domain;
4619	unsigned long flags;
4620
4621	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4622	if (WARN_ON_ONCE(!domain))
4623		goto out_tear_down;
4624
4625	/*
4626	 * The SVA implementation needs to handle its own bookkeeping, such as
4627	 * the mm notification. Until that code is consolidated into the iommu
4628	 * core, let the intel sva code handle it.
4629	 */
4630	if (domain->type == IOMMU_DOMAIN_SVA) {
4631		intel_svm_remove_dev_pasid(dev, pasid);
4632		goto out_tear_down;
4633	}
4634
4635	dmar_domain = to_dmar_domain(domain);
4636	spin_lock_irqsave(&dmar_domain->lock, flags);
4637	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4638		if (curr->dev == dev && curr->pasid == pasid) {
4639			list_del(&curr->link_domain);
4640			dev_pasid = curr;
4641			break;
4642		}
4643	}
4644	WARN_ON_ONCE(!dev_pasid);
4645	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4646
4647	domain_detach_iommu(dmar_domain, iommu);
4648	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4649	kfree(dev_pasid);
4650out_tear_down:
4651	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4652	intel_drain_pasid_prq(dev, pasid);
4653}
4654
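/*
 * Attach the domain to a {device, PASID} pair by programming the PASID
 * table entry for pass-through, first-level or second-level translation
 * according to the domain type.
 */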
4655static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4656				     struct device *dev, ioasid_t pasid)
4657{
4658	struct device_domain_info *info = dev_iommu_priv_get(dev);
4659	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4660	struct intel_iommu *iommu = info->iommu;
4661	struct dev_pasid_info *dev_pasid;
4662	unsigned long flags;
4663	int ret;
4664
4665	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4666		return -EOPNOTSUPP;
4667
4668	if (domain->dirty_ops)
4669		return -EINVAL;
4670
4671	if (context_copied(iommu, info->bus, info->devfn))
4672		return -EBUSY;
4673
4674	ret = prepare_domain_attach_device(domain, dev);
4675	if (ret)
4676		return ret;
4677
4678	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4679	if (!dev_pasid)
4680		return -ENOMEM;
4681
4682	ret = domain_attach_iommu(dmar_domain, iommu);
4683	if (ret)
4684		goto out_free;
4685
4686	if (domain_type_is_si(dmar_domain))
4687		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4688	else if (dmar_domain->use_first_level)
4689		ret = domain_setup_first_level(iommu, dmar_domain,
4690					       dev, pasid);
4691	else
4692		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4693						     dev, pasid);
4694	if (ret)
4695		goto out_detach_iommu;
4696
4697	dev_pasid->dev = dev;
4698	dev_pasid->pasid = pasid;
4699	spin_lock_irqsave(&dmar_domain->lock, flags);
4700	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4701	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4702
4703	if (domain->type & __IOMMU_DOMAIN_PAGING)
4704		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4705
4706	return 0;
4707out_detach_iommu:
4708	domain_detach_iommu(dmar_domain, iommu);
4709out_free:
4710	kfree(dev_pasid);
4711	return ret;
4712}
4713
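/*
 * Report the IOMMU's capability and extended capability registers to
 * user space through the iommufd hw_info interface.
 */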
4714static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4715{
4716	struct device_domain_info *info = dev_iommu_priv_get(dev);
4717	struct intel_iommu *iommu = info->iommu;
4718	struct iommu_hw_info_vtd *vtd;
4719
4720	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4721	if (!vtd)
4722		return ERR_PTR(-ENOMEM);
4723
4724	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4725	vtd->cap_reg = iommu->cap;
4726	vtd->ecap_reg = iommu->ecap;
4727	*length = sizeof(*vtd);
4728	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4729	return vtd;
4730}
4731
4732/*
4733 * Set dirty tracking for the device list of a domain. The caller must
4734 * hold the domain->lock when calling it.
4735 */
4736static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4737{
4738	struct device_domain_info *info;
4739	int ret = 0;
4740
4741	list_for_each_entry(info, devices, link) {
4742		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4743						       IOMMU_NO_PASID, enable);
4744		if (ret)
4745			break;
4746	}
4747
4748	return ret;
4749}
4750
4751static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4752					    bool enable)
4753{
4754	struct dmar_domain *s1_domain;
4755	unsigned long flags;
4756	int ret;
4757
4758	spin_lock(&domain->s1_lock);
4759	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4760		spin_lock_irqsave(&s1_domain->lock, flags);
4761		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4762		spin_unlock_irqrestore(&s1_domain->lock, flags);
4763		if (ret)
4764			goto err_unwind;
4765	}
4766	spin_unlock(&domain->s1_lock);
4767	return 0;
4768
4769err_unwind:
4770	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4771		spin_lock_irqsave(&s1_domain->lock, flags);
4772		device_set_dirty_tracking(&s1_domain->devices,
4773					  domain->dirty_tracking);
4774		spin_unlock_irqrestore(&s1_domain->lock, flags);
4775	}
4776	spin_unlock(&domain->s1_lock);
4777	return ret;
4778}
4779
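/*
 * Enable or disable dirty tracking on all devices attached to the
 * domain, including devices attached through nested domains that use
 * this domain as their parent.
 */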
4780static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4781					  bool enable)
4782{
4783	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4784	int ret;
4785
4786	spin_lock(&dmar_domain->lock);
4787	if (dmar_domain->dirty_tracking == enable)
4788		goto out_unlock;
4789
4790	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4791	if (ret)
4792		goto err_unwind;
4793
4794	if (dmar_domain->nested_parent) {
4795		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4796		if (ret)
4797			goto err_unwind;
4798	}
4799
4800	dmar_domain->dirty_tracking = enable;
4801out_unlock:
4802	spin_unlock(&dmar_domain->lock);
4803
4804	return 0;
4805
4806err_unwind:
4807	device_set_dirty_tracking(&dmar_domain->devices,
4808				  dmar_domain->dirty_tracking);
4809	spin_unlock(&dmar_domain->lock);
4810	return ret;
4811}
4812
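/*
 * Walk the second-level page table for the given IOVA range, record any
 * dirty pages in @dirty and clear the dirty bits unless the caller asked
 * for them to be left set.
 */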
4813static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4814					    unsigned long iova, size_t size,
4815					    unsigned long flags,
4816					    struct iommu_dirty_bitmap *dirty)
4817{
4818	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4819	unsigned long end = iova + size - 1;
4820	unsigned long pgsize;
4821
4822	/*
4823	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4824	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4825	 * have occurred when we stopped dirty tracking. This ensures that we
4826	 * never inherit dirtied bits from a previous cycle.
4827	 */
4828	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4829		return -EINVAL;
4830
4831	do {
4832		struct dma_pte *pte;
4833		int lvl = 0;
4834
4835		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4836				     GFP_ATOMIC);
4837		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4838		if (!pte || !dma_pte_present(pte)) {
4839			iova += pgsize;
4840			continue;
4841		}
4842
4843		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4844			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4845		iova += pgsize;
4846	} while (iova < end);
4847
4848	return 0;
4849}
4850
4851static const struct iommu_dirty_ops intel_dirty_ops = {
4852	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4853	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4854};
4855
4856const struct iommu_ops intel_iommu_ops = {
4857	.blocked_domain		= &blocking_domain,
4858	.capable		= intel_iommu_capable,
4859	.hw_info		= intel_iommu_hw_info,
4860	.domain_alloc		= intel_iommu_domain_alloc,
4861	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4862	.probe_device		= intel_iommu_probe_device,
4863	.probe_finalize		= intel_iommu_probe_finalize,
4864	.release_device		= intel_iommu_release_device,
4865	.get_resv_regions	= intel_iommu_get_resv_regions,
4866	.device_group		= intel_iommu_device_group,
4867	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4868	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4869	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4870	.def_domain_type	= device_def_domain_type,
4871	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4872	.pgsize_bitmap		= SZ_4K,
4873#ifdef CONFIG_INTEL_IOMMU_SVM
4874	.page_response		= intel_svm_page_response,
4875#endif
4876	.default_domain_ops = &(const struct iommu_domain_ops) {
4877		.attach_dev		= intel_iommu_attach_device,
4878		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4879		.map_pages		= intel_iommu_map_pages,
4880		.unmap_pages		= intel_iommu_unmap_pages,
4881		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4882		.flush_iotlb_all        = intel_flush_iotlb_all,
4883		.iotlb_sync		= intel_iommu_tlb_sync,
4884		.iova_to_phys		= intel_iommu_iova_to_phys,
4885		.free			= intel_iommu_domain_free,
4886		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4887	}
4888};
4889
4890static void quirk_iommu_igfx(struct pci_dev *dev)
4891{
4892	if (risky_device(dev))
4893		return;
4894
4895	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4896	dmar_map_gfx = 0;
4897}
4898
4899/* G4x/GM45 integrated gfx dmar support is totally busted. */
4900DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4901DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4902DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4903DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4904DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4905DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4906DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4907
4908/* Broadwell igfx malfunctions with dmar */
4909DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4910DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4912DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4913DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4914DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4915DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4916DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4917DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4918DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4919DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4920DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4921DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4922DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4923DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4924DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4925DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4926DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4927DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4928DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4929DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4930DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4931DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4932DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4933
4934static void quirk_iommu_rwbf(struct pci_dev *dev)
4935{
4936	if (risky_device(dev))
4937		return;
4938
4939	/*
4940	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4941	 * but needs it. Same seems to hold for the desktop versions.
4942	 */
4943	pci_info(dev, "Forcing write-buffer flush capability\n");
4944	rwbf_quirk = 1;
4945}
4946
4947DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4948DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4949DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4950DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4951DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4952DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4953DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4954
4955#define GGC 0x52
4956#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4957#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4958#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4959#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4960#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4961#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4962#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4963#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4964
4965static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4966{
4967	unsigned short ggc;
4968
4969	if (risky_device(dev))
4970		return;
4971
4972	if (pci_read_config_word(dev, GGC, &ggc))
4973		return;
4974
4975	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4976		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4977		dmar_map_gfx = 0;
4978	} else if (dmar_map_gfx) {
4979		/* we have to ensure the gfx device is idle before we flush */
4980		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4981		iommu_set_dma_strict();
4982	}
4983}
4984DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4985DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4986DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4987DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4988
4989static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4990{
4991	unsigned short ver;
4992
4993	if (!IS_GFX_DEVICE(dev))
4994		return;
4995
4996	ver = (dev->device >> 8) & 0xff;
4997	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4998	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4999	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
5000		return;
5001
5002	if (risky_device(dev))
5003		return;
5004
5005	pci_info(dev, "Skip IOMMU disabling for graphics\n");
5006	iommu_skip_te_disable = 1;
5007}
5008DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5009
5010/* On Tylersburg chipsets, some BIOSes have been known to enable the
5011   ISOCH DMAR unit for the Azalia sound device, but not give it any
5012   TLB entries, which causes it to deadlock. Check for that.  We do
5013   this in a function called from init_dmars(), instead of in a PCI
5014   quirk, because we don't want to print the obnoxious "BIOS broken"
5015   message if VT-d is actually disabled.
5016*/
5017static void __init check_tylersburg_isoch(void)
5018{
5019	struct pci_dev *pdev;
5020	uint32_t vtisochctrl;
5021
5022	/* If there's no Azalia in the system anyway, forget it. */
5023	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5024	if (!pdev)
5025		return;
5026
5027	if (risky_device(pdev)) {
5028		pci_dev_put(pdev);
5029		return;
5030	}
5031
5032	pci_dev_put(pdev);
5033
5034	/* System Management Registers. Might be hidden, in which case
5035	   we can't do the sanity check. But that's OK, because the
5036	   known-broken BIOSes _don't_ actually hide it, so far. */
5037	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5038	if (!pdev)
5039		return;
5040
5041	if (risky_device(pdev)) {
5042		pci_dev_put(pdev);
5043		return;
5044	}
5045
5046	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5047		pci_dev_put(pdev);
5048		return;
5049	}
5050
5051	pci_dev_put(pdev);
5052
5053	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5054	if (vtisochctrl & 1)
5055		return;
5056
5057	/* Drop all bits other than the number of TLB entries */
5058	vtisochctrl &= 0x1c;
5059
5060	/* If we have the recommended number of TLB entries (16), fine. */
5061	if (vtisochctrl == 0x10)
5062		return;
5063
5064	/* Zero TLB entries? You get to ride the short bus to school. */
5065	if (!vtisochctrl) {
5066		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5067		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5068		     dmi_get_system_info(DMI_BIOS_VENDOR),
5069		     dmi_get_system_info(DMI_BIOS_VERSION),
5070		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5071		iommu_identity_mapping |= IDENTMAP_AZALIA;
5072		return;
5073	}
5074
5075	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5076	       vtisochctrl);
5077}
5078
5079/*
5080 * Here we deal with a device TLB defect: the device may inadvertently issue an
5081 * ATS invalidation completion before posted writes that were initiated with a
5082 * translated address and used translations matching the invalidation address
5083 * range, violating the invalidation completion ordering.
5084 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
5085 * vulnerable to this defect. In other words, any dTLB invalidation that is not
5086 * initiated under the control of the trusted/privileged host device driver must
5087 * use this quirk.
5088 * Device TLBs are invalidated under the following six conditions:
5089 * 1. Device driver does DMA API unmap IOVA
5090 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5091 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5092 *    exit_mmap() due to crash
5093 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5094 *    VM has to free pages that were unmapped
5095 * 5. Userspace driver unmaps a DMA buffer
5096 * 6. Cache invalidation in vSVA usage (upcoming)
5097 *
5098 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5099 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5100 * invalidate TLB the same way as normal user unmap which will use this quirk.
5101 * The dTLB invalidation after PASID cache flush does not need this quirk.
5102 *
5103 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5104 */
5105void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5106			       unsigned long address, unsigned long mask,
5107			       u32 pasid, u16 qdep)
5108{
5109	u16 sid;
5110
5111	if (likely(!info->dtlb_extra_inval))
5112		return;
5113
5114	sid = PCI_DEVID(info->bus, info->devfn);
5115	if (pasid == IOMMU_NO_PASID) {
5116		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5117				   qdep, address, mask);
5118	} else {
5119		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5120					 pasid, qdep, address, mask);
5121	}
5122}
5123
5124#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5125
5126/*
5127 * Function to submit a command to the enhanced command interface. The
5128 * valid enhanced command descriptions are defined in Table 47 of the
5129 * VT-d spec. The VT-d hardware implementation may support some but not
5130 * all commands, which can be determined by checking the Enhanced
5131 * Command Capability Register.
5132 *
5133 * Return values:
5134 *  - 0: Command successful without any error;
5135 *  - Negative: software error value;
5136 *  - Nonzero positive: failure status code defined in Table 48.
5137 */
5138int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5139{
5140	unsigned long flags;
5141	u64 res;
5142	int ret;
5143
5144	if (!cap_ecmds(iommu->cap))
5145		return -ENODEV;
5146
5147	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5148
5149	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5150	if (res & DMA_ECMD_ECRSP_IP) {
5151		ret = -EBUSY;
5152		goto err;
5153	}
5154
5155	/*
5156	 * Unconditionally write the operand B, because
5157	 * - There is no side effect if an ecmd doesn't require an
5158	 *   operand B, but we set the register to some value.
5159	 * - It's not invoked in any critical path. The extra MMIO
5160	 *   write doesn't bring any performance concerns.
5161	 */
5162	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5163	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5164
5165	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5166		      !(res & DMA_ECMD_ECRSP_IP), res);
5167
5168	if (res & DMA_ECMD_ECRSP_IP) {
5169		ret = -ETIMEDOUT;
5170		goto err;
5171	}
5172
5173	ret = ecmd_get_status_code(res);
5174err:
5175	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5176
5177	return ret;
5178}
v6.2
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/crash_dump.h>
  17#include <linux/dma-direct.h>
  18#include <linux/dmi.h>
  19#include <linux/intel-svm.h>
  20#include <linux/memory.h>
  21#include <linux/pci.h>
  22#include <linux/pci-ats.h>
  23#include <linux/spinlock.h>
  24#include <linux/syscore_ops.h>
  25#include <linux/tboot.h>
 
  26
  27#include "iommu.h"
  28#include "../dma-iommu.h"
  29#include "../irq_remapping.h"
  30#include "../iommu-sva.h"
  31#include "pasid.h"
  32#include "cap_audit.h"
 
  33
  34#define ROOT_SIZE		VTD_PAGE_SIZE
  35#define CONTEXT_SIZE		VTD_PAGE_SIZE
  36
  37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  41
  42#define IOAPIC_RANGE_START	(0xfee00000)
  43#define IOAPIC_RANGE_END	(0xfeefffff)
  44#define IOVA_START_ADDR		(0x1000)
  45
  46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  47
  48#define MAX_AGAW_WIDTH 64
  49#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  50
  51#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
  52#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
  53
  54/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  55   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  56#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
  57				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  58#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  59
  60/* IO virtual address start page frame number */
  61#define IOVA_START_PFN		(1)
  62
  63#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
  64
  65/* page table handling */
  66#define LEVEL_STRIDE		(9)
  67#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
  68
  69static inline int agaw_to_level(int agaw)
  70{
  71	return agaw + 2;
  72}
  73
  74static inline int agaw_to_width(int agaw)
  75{
  76	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
  77}
  78
  79static inline int width_to_agaw(int width)
  80{
  81	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
  82}
  83
  84static inline unsigned int level_to_offset_bits(int level)
  85{
  86	return (level - 1) * LEVEL_STRIDE;
  87}
  88
  89static inline int pfn_level_offset(u64 pfn, int level)
  90{
  91	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
  92}
  93
  94static inline u64 level_mask(int level)
  95{
  96	return -1ULL << level_to_offset_bits(level);
  97}
  98
  99static inline u64 level_size(int level)
 100{
 101	return 1ULL << level_to_offset_bits(level);
 102}
 103
 104static inline u64 align_to_level(u64 pfn, int level)
 105{
 106	return (pfn + level_size(level) - 1) & level_mask(level);
 107}
 108
 109static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 110{
 111	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 112}
 113
 114/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 115   are never going to work. */
 116static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 117{
 118	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 119}
 120static inline unsigned long page_to_dma_pfn(struct page *pg)
 121{
 122	return mm_to_dma_pfn(page_to_pfn(pg));
 123}
 124static inline unsigned long virt_to_dma_pfn(void *p)
 125{
 126	return page_to_dma_pfn(virt_to_page(p));
 127}
 128
 129static void __init check_tylersburg_isoch(void);
 130static int rwbf_quirk;
 131
 132/*
 133 * set to 1 to panic kernel if can't successfully enable VT-d
 134 * (used when kernel is launched w/ TXT)
 135 */
 136static int force_on = 0;
 137static int intel_iommu_tboot_noforce;
 138static int no_platform_optin;
 139
 140#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 141
 142/*
 143 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 144 * if marked present.
 145 */
 146static phys_addr_t root_entry_lctp(struct root_entry *re)
 147{
 148	if (!(re->lo & 1))
 149		return 0;
 150
 151	return re->lo & VTD_PAGE_MASK;
 152}
 153
 154/*
 155 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 156 * if marked present.
 157 */
 158static phys_addr_t root_entry_uctp(struct root_entry *re)
 159{
 160	if (!(re->hi & 1))
 161		return 0;
 162
 163	return re->hi & VTD_PAGE_MASK;
 164}
 165
 166static inline void context_set_present(struct context_entry *context)
 167{
 168	context->lo |= 1;
 169}
 170
 171static inline void context_set_fault_enable(struct context_entry *context)
 172{
 173	context->lo &= (((u64)-1) << 2) | 1;
 174}
 175
 176static inline void context_set_translation_type(struct context_entry *context,
 177						unsigned long value)
 178{
 179	context->lo &= (((u64)-1) << 4) | 3;
 180	context->lo |= (value & 3) << 2;
 181}
 182
 183static inline void context_set_address_root(struct context_entry *context,
 184					    unsigned long value)
 185{
 186	context->lo &= ~VTD_PAGE_MASK;
 187	context->lo |= value & VTD_PAGE_MASK;
 188}
 189
 190static inline void context_set_address_width(struct context_entry *context,
 191					     unsigned long value)
 192{
 193	context->hi |= value & 7;
 194}
 195
 196static inline void context_set_domain_id(struct context_entry *context,
 197					 unsigned long value)
 198{
 199	context->hi |= (value & ((1 << 16) - 1)) << 8;
 200}
 201
 202static inline void context_set_pasid(struct context_entry *context)
 203{
 204	context->lo |= CONTEXT_PASIDE;
 205}
 206
 207static inline int context_domain_id(struct context_entry *c)
 208{
 209	return((c->hi >> 8) & 0xffff);
 210}
 211
 212static inline void context_clear_entry(struct context_entry *context)
 213{
 214	context->lo = 0;
 215	context->hi = 0;
 216}
 217
 218static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
 219{
 220	if (!iommu->copied_tables)
 221		return false;
 222
 223	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
 224}
 225
 226static inline void
 227set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
 228{
 229	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
 230}
 231
 232static inline void
 233clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
 234{
 235	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
 236}
 237
 238/*
 239 * This domain is a statically identity mapping domain.
 240 *	1. This domain creats a static 1:1 mapping to all usable memory.
 241 * 	2. It maps to each iommu if successful.
 242 *	3. Each iommu mapps to this domain if successful.
 243 */
 244static struct dmar_domain *si_domain;
 245static int hw_pass_through = 1;
 246
 247struct dmar_rmrr_unit {
 248	struct list_head list;		/* list of rmrr units	*/
 249	struct acpi_dmar_header *hdr;	/* ACPI header		*/
 250	u64	base_address;		/* reserved base address*/
 251	u64	end_address;		/* reserved end address */
 252	struct dmar_dev_scope *devices;	/* target devices */
 253	int	devices_cnt;		/* target device count */
 254};
 255
 256struct dmar_atsr_unit {
 257	struct list_head list;		/* list of ATSR units */
 258	struct acpi_dmar_header *hdr;	/* ACPI header */
 259	struct dmar_dev_scope *devices;	/* target devices */
 260	int devices_cnt;		/* target device count */
 261	u8 include_all:1;		/* include all ports */
 262};
 263
 264struct dmar_satc_unit {
 265	struct list_head list;		/* list of SATC units */
 266	struct acpi_dmar_header *hdr;	/* ACPI header */
 267	struct dmar_dev_scope *devices;	/* target devices */
 268	struct intel_iommu *iommu;	/* the corresponding iommu */
 269	int devices_cnt;		/* target device count */
 270	u8 atc_required:1;		/* ATS is required */
 271};
 272
 273static LIST_HEAD(dmar_atsr_units);
 274static LIST_HEAD(dmar_rmrr_units);
 275static LIST_HEAD(dmar_satc_units);
 276
 277#define for_each_rmrr_units(rmrr) \
 278	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 279
 280static void device_block_translation(struct device *dev);
 281static void intel_iommu_domain_free(struct iommu_domain *domain);
 282
 283int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
 284int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
 285
 286int intel_iommu_enabled = 0;
 287EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 288
 289static int dmar_map_gfx = 1;
 290static int intel_iommu_superpage = 1;
 291static int iommu_identity_mapping;
 292static int iommu_skip_te_disable;
 293
 294#define IDENTMAP_GFX		2
 295#define IDENTMAP_AZALIA		4
 296
 297const struct iommu_ops intel_iommu_ops;
 
 298
 299static bool translation_pre_enabled(struct intel_iommu *iommu)
 300{
 301	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 302}
 303
 304static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 305{
 306	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 307}
 308
 309static void init_translation_status(struct intel_iommu *iommu)
 310{
 311	u32 gsts;
 312
 313	gsts = readl(iommu->reg + DMAR_GSTS_REG);
 314	if (gsts & DMA_GSTS_TES)
 315		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 316}
 317
 318static int __init intel_iommu_setup(char *str)
 319{
 320	if (!str)
 321		return -EINVAL;
 322
 323	while (*str) {
 324		if (!strncmp(str, "on", 2)) {
 325			dmar_disabled = 0;
 326			pr_info("IOMMU enabled\n");
 327		} else if (!strncmp(str, "off", 3)) {
 328			dmar_disabled = 1;
 329			no_platform_optin = 1;
 330			pr_info("IOMMU disabled\n");
 331		} else if (!strncmp(str, "igfx_off", 8)) {
 332			dmar_map_gfx = 0;
 333			pr_info("Disable GFX device mapping\n");
 334		} else if (!strncmp(str, "forcedac", 8)) {
 335			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
 336			iommu_dma_forcedac = true;
 337		} else if (!strncmp(str, "strict", 6)) {
 338			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
 339			iommu_set_dma_strict();
 340		} else if (!strncmp(str, "sp_off", 6)) {
 341			pr_info("Disable supported super page\n");
 342			intel_iommu_superpage = 0;
 343		} else if (!strncmp(str, "sm_on", 5)) {
 344			pr_info("Enable scalable mode if hardware supports\n");
 345			intel_iommu_sm = 1;
 346		} else if (!strncmp(str, "sm_off", 6)) {
 347			pr_info("Scalable mode is disallowed\n");
 348			intel_iommu_sm = 0;
 349		} else if (!strncmp(str, "tboot_noforce", 13)) {
 350			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 351			intel_iommu_tboot_noforce = 1;
 352		} else {
 353			pr_notice("Unknown option - '%s'\n", str);
 354		}
 355
 356		str += strcspn(str, ",");
 357		while (*str == ',')
 358			str++;
 359	}
 360
 361	return 1;
 362}
 363__setup("intel_iommu=", intel_iommu_setup);
 364
 365void *alloc_pgtable_page(int node)
 366{
 367	struct page *page;
 368	void *vaddr = NULL;
 369
 370	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 371	if (page)
 372		vaddr = page_address(page);
 373	return vaddr;
 374}
 375
 376void free_pgtable_page(void *vaddr)
 377{
 378	free_page((unsigned long)vaddr);
 379}
 380
 381static inline int domain_type_is_si(struct dmar_domain *domain)
 382{
 383	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
 384}
 385
 386static inline int domain_pfn_supported(struct dmar_domain *domain,
 387				       unsigned long pfn)
 388{
 389	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 390
 391	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 392}
 393
 394/*
 395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 397 * the returned SAGAW.
 398 */
 399static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
 400{
 401	unsigned long fl_sagaw, sl_sagaw;
 402
 403	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
 404	sl_sagaw = cap_sagaw(iommu->cap);
 405
 406	/* Second level only. */
 407	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
 408		return sl_sagaw;
 409
 410	/* First level only. */
 411	if (!ecap_slts(iommu->ecap))
 412		return fl_sagaw;
 413
 414	return fl_sagaw & sl_sagaw;
 415}
 416
 417static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 418{
 419	unsigned long sagaw;
 420	int agaw;
 421
 422	sagaw = __iommu_calculate_sagaw(iommu);
 423	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
 424		if (test_bit(agaw, &sagaw))
 425			break;
 426	}
 427
 428	return agaw;
 429}
 430
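/*
 * Worked example (illustrative only): assume a scalable-mode IOMMU with
 * both translation levels, where cap_sagaw() reports 0x4 (4-level/48-bit
 * second-level tables only) and 5-level paging is not supported:
 *
 *	fl_sagaw = BIT(2)              = 0x4
 *	sl_sagaw = 0x4
 *	sagaw    = fl_sagaw & sl_sagaw = 0x4
 *
 * With max_gaw = DEFAULT_DOMAIN_ADDRESS_WIDTH (57), width_to_agaw(57)
 * is 3; bit 3 is clear but bit 2 is set, so __iommu_calculate_agaw()
 * returns 2, i.e. a 48-bit, 4-level page table.
 */
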
 431/*
 432 * Calculate max SAGAW for each iommu.
 433 */
 434int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 435{
 436	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 437}
 438
 439/*
 440 * Calculate the AGAW for each IOMMU.
 441 * "SAGAW" may differ across IOMMUs, so use a default AGAW and fall
 442 * back to a smaller supported AGAW for IOMMUs that don't support the default.
 443 */
 444int iommu_calculate_agaw(struct intel_iommu *iommu)
 445{
 446	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 447}
 448
 449static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 450{
 451	return sm_supported(iommu) ?
 452			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 453}
 454
 455static void domain_update_iommu_coherency(struct dmar_domain *domain)
 456{
 457	struct iommu_domain_info *info;
 458	struct dmar_drhd_unit *drhd;
 459	struct intel_iommu *iommu;
 460	bool found = false;
 461	unsigned long i;
 462
 463	domain->iommu_coherency = true;
 464	xa_for_each(&domain->iommu_array, i, info) {
 465		found = true;
 466		if (!iommu_paging_structure_coherency(info->iommu)) {
 467			domain->iommu_coherency = false;
 468			break;
 469		}
 470	}
 471	if (found)
 472		return;
 473
 474	/* No hardware attached; use lowest common denominator */
 475	rcu_read_lock();
 476	for_each_active_iommu(iommu, drhd) {
 477		if (!iommu_paging_structure_coherency(iommu)) {
 478			domain->iommu_coherency = false;
 479			break;
 480		}
 481	}
 482	rcu_read_unlock();
 483}
 484
 485static int domain_update_iommu_superpage(struct dmar_domain *domain,
 486					 struct intel_iommu *skip)
 487{
 488	struct dmar_drhd_unit *drhd;
 489	struct intel_iommu *iommu;
 490	int mask = 0x3;
 491
 492	if (!intel_iommu_superpage)
 493		return 0;
 494
 495	/* set iommu_superpage to the smallest common denominator */
 496	rcu_read_lock();
 497	for_each_active_iommu(iommu, drhd) {
 498		if (iommu != skip) {
 499			if (domain && domain->use_first_level) {
 500				if (!cap_fl1gp_support(iommu->cap))
 501					mask = 0x1;
 502			} else {
 503				mask &= cap_super_page_val(iommu->cap);
 504			}
 505
 506			if (!mask)
 507				break;
 508		}
 509	}
 510	rcu_read_unlock();
 511
 512	return fls(mask);
 513}
 514
 515static int domain_update_device_node(struct dmar_domain *domain)
 516{
 517	struct device_domain_info *info;
 518	int nid = NUMA_NO_NODE;
 519	unsigned long flags;
 520
 521	spin_lock_irqsave(&domain->lock, flags);
 522	list_for_each_entry(info, &domain->devices, link) {
 523		/*
 524		 * There could be multiple device NUMA nodes, as devices within
 525		 * the same domain may sit behind different IOMMUs. There is no
 526		 * perfect answer in such a situation, so we go with a
 527		 * first-come, first-served policy.
 528		 */
 529		nid = dev_to_node(info->dev);
 530		if (nid != NUMA_NO_NODE)
 531			break;
 532	}
 533	spin_unlock_irqrestore(&domain->lock, flags);
 534
 535	return nid;
 536}
 537
 538static void domain_update_iotlb(struct dmar_domain *domain);
 539
 540/* Return the super pagesize bitmap if supported. */
 541static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
 542{
 543	unsigned long bitmap = 0;
 544
 545	/*
 546	 * 1-level super page supports page size of 2MiB, 2-level super page
 547	 * supports page size of both 2MiB and 1GiB.
 548	 */
 549	if (domain->iommu_superpage == 1)
 550		bitmap |= SZ_2M;
 551	else if (domain->iommu_superpage == 2)
 552		bitmap |= SZ_2M | SZ_1G;
 553
 554	return bitmap;
 555}
 556
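/*
 * Illustrative example (not part of the driver): if every IOMMU in the
 * domain advertises both 2MiB and 1GiB superpages, the mask computed by
 * domain_update_iommu_superpage() stays 0x3 and fls(0x3) == 2, so the
 * helper above returns SZ_2M | SZ_1G and domain_update_iommu_cap() ORs
 * that into domain->domain.pgsize_bitmap for the core IOMMU layer.
 */
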
 557/* Some capabilities may be different across iommus */
 558static void domain_update_iommu_cap(struct dmar_domain *domain)
 559{
 560	domain_update_iommu_coherency(domain);
 561	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 562
 563	/*
 564	 * If RHSA is missing, we should default to the device numa domain
 565	 * as fall back.
 566	 */
 567	if (domain->nid == NUMA_NO_NODE)
 568		domain->nid = domain_update_device_node(domain);
 569
 570	/*
 571	 * First-level translation restricts the input-address to a
 572	 * canonical address (i.e., address bits 63:N have the same
 573	 * value as address bit [N-1], where N is 48-bits with 4-level
 574	 * paging and 57-bits with 5-level paging). Hence, skip bit
 575	 * [N-1].
 576	 */
 577	if (domain->use_first_level)
 578		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
 579	else
 580		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
 581
 582	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
 583	domain_update_iotlb(domain);
 584}
 585
 586struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 587					 u8 devfn, int alloc)
 588{
 589	struct root_entry *root = &iommu->root_entry[bus];
 590	struct context_entry *context;
 591	u64 *entry;
 592
 593	/*
 594	 * Unless the caller requested to allocate a new entry,
 595	 * returning a copied context entry makes no sense.
 596	 */
 597	if (!alloc && context_copied(iommu, bus, devfn))
 598		return NULL;
 599
 600	entry = &root->lo;
 601	if (sm_supported(iommu)) {
 602		if (devfn >= 0x80) {
 603			devfn -= 0x80;
 604			entry = &root->hi;
 605		}
 606		devfn *= 2;
 607	}
 608	if (*entry & 1)
 609		context = phys_to_virt(*entry & VTD_PAGE_MASK);
 610	else {
 611		unsigned long phy_addr;
 612		if (!alloc)
 613			return NULL;
 614
 615		context = alloc_pgtable_page(iommu->node);
 616		if (!context)
 617			return NULL;
 618
 619		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 620		phy_addr = virt_to_phys((void *)context);
 621		*entry = phy_addr | 1;
 622		__iommu_flush_cache(iommu, entry, sizeof(*entry));
 623	}
 624	return &context[devfn];
 625}
 626
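/*
 * Worked example (illustrative only): in scalable mode a context entry
 * occupies two 16-byte slots and devfns 0x80-0xff live behind the upper
 * context-table pointer.  For devfn 0x85 the lookup above does:
 *
 *	devfn >= 0x80  ->  entry = &root->hi, devfn -= 0x80  (devfn = 0x05)
 *	devfn *= 2                                           (devfn = 0x0a)
 *	return &context[0x0a];
 *
 * i.e. the entry starts at index 10 of the upper context table.
 */
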
 627/**
 628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 629 *				 sub-hierarchy of a candidate PCI-PCI bridge
 630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 631 * @bridge: the candidate PCI-PCI bridge
 632 *
 633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 634 */
 635static bool
 636is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 637{
 638	struct pci_dev *pdev, *pbridge;
 639
 640	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 641		return false;
 642
 643	pdev = to_pci_dev(dev);
 644	pbridge = to_pci_dev(bridge);
 645
 646	if (pbridge->subordinate &&
 647	    pbridge->subordinate->number <= pdev->bus->number &&
 648	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
 649		return true;
 650
 651	return false;
 652}
 653
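/*
 * Usage sketch with hypothetical topology (not part of the driver):
 * for a bridge whose subordinate bus range is [2, 4] and an endpoint
 * on bus 3, the checks above reduce to 2 <= 3 && 4 >= 3, so the helper
 * reports that the endpoint sits in the bridge's sub-hierarchy.
 */
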
 654static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 655{
 656	struct dmar_drhd_unit *drhd;
 657	u32 vtbar;
 658	int rc;
 659
 660	/* We know that this device on this chipset has its own IOMMU.
 661	 * If we find it under a different IOMMU, then the BIOS is lying
 662	 * to us. Hope that the IOMMU for this device is actually
 663	 * disabled, and it needs no translation...
 664	 */
 665	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 666	if (rc) {
 667		/* "can't" happen */
 668		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 669		return false;
 670	}
 671	vtbar &= 0xffff0000;
 672
 673	/* we know that this IOMMU should be at offset 0xa000 from vtbar */
 674	drhd = dmar_find_matched_drhd_unit(pdev);
 675	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 676		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 677		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 678		return true;
 679	}
 680
 681	return false;
 682}
 683
 684static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 685{
 686	if (!iommu || iommu->drhd->ignored)
 687		return true;
 688
 689	if (dev_is_pci(dev)) {
 690		struct pci_dev *pdev = to_pci_dev(dev);
 691
 692		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 693		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 694		    quirk_ioat_snb_local_iommu(pdev))
 695			return true;
 696	}
 697
 698	return false;
 699}
 700
 701struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 702{
 703	struct dmar_drhd_unit *drhd = NULL;
 704	struct pci_dev *pdev = NULL;
 705	struct intel_iommu *iommu;
 706	struct device *tmp;
 707	u16 segment = 0;
 708	int i;
 709
 710	if (!dev)
 711		return NULL;
 712
 713	if (dev_is_pci(dev)) {
 714		struct pci_dev *pf_pdev;
 715
 716		pdev = pci_real_dma_dev(to_pci_dev(dev));
 717
 718		/* VFs aren't listed in scope tables; we need to look up
 719		 * the PF instead to find the IOMMU. */
 720		pf_pdev = pci_physfn(pdev);
 721		dev = &pf_pdev->dev;
 722		segment = pci_domain_nr(pdev->bus);
 723	} else if (has_acpi_companion(dev))
 724		dev = &ACPI_COMPANION(dev)->dev;
 725
 726	rcu_read_lock();
 727	for_each_iommu(iommu, drhd) {
 728		if (pdev && segment != drhd->segment)
 729			continue;
 730
 731		for_each_active_dev_scope(drhd->devices,
 732					  drhd->devices_cnt, i, tmp) {
 733			if (tmp == dev) {
 734				/* For a VF use its original BDF# not that of the PF
 735				 * which we used for the IOMMU lookup. Strictly speaking
 736				 * we could do this for all PCI devices; we only need to
 737				 * get the BDF# from the scope table for ACPI matches. */
 738				if (pdev && pdev->is_virtfn)
 739					goto got_pdev;
 740
 741				if (bus && devfn) {
 742					*bus = drhd->devices[i].bus;
 743					*devfn = drhd->devices[i].devfn;
 744				}
 745				goto out;
 746			}
 747
 748			if (is_downstream_to_pci_bridge(dev, tmp))
 749				goto got_pdev;
 750		}
 751
 752		if (pdev && drhd->include_all) {
 753got_pdev:
 754			if (bus && devfn) {
 755				*bus = pdev->bus->number;
 756				*devfn = pdev->devfn;
 757			}
 758			goto out;
 759		}
 760	}
 761	iommu = NULL;
 762out:
 763	if (iommu_is_dummy(iommu, dev))
 764		iommu = NULL;
 765
 766	rcu_read_unlock();
 767
 768	return iommu;
 769}
 770
 771static void domain_flush_cache(struct dmar_domain *domain,
 772			       void *addr, int size)
 773{
 774	if (!domain->iommu_coherency)
 775		clflush_cache_range(addr, size);
 776}
 777
 778static void free_context_table(struct intel_iommu *iommu)
 779{
 780	struct context_entry *context;
 781	int i;
 782
 783	if (!iommu->root_entry)
 784		return;
 785
 786	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 787		context = iommu_context_addr(iommu, i, 0, 0);
 788		if (context)
 789			free_pgtable_page(context);
 790
 791		if (!sm_supported(iommu))
 792			continue;
 793
 794		context = iommu_context_addr(iommu, i, 0x80, 0);
 795		if (context)
 796			free_pgtable_page(context);
 797	}
 798
 799	free_pgtable_page(iommu->root_entry);
 800	iommu->root_entry = NULL;
 801}
 802
 803#ifdef CONFIG_DMAR_DEBUG
 804static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
 805			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
 806{
 807	struct dma_pte *pte;
 808	int offset;
 809
 810	while (1) {
 811		offset = pfn_level_offset(pfn, level);
 812		pte = &parent[offset];
 813		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
 814			pr_info("PTE not present at level %d\n", level);
 815			break;
 816		}
 817
 818		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
 819
 820		if (level == 1)
 821			break;
 822
 823		parent = phys_to_virt(dma_pte_addr(pte));
 824		level--;
 825	}
 826}
 827
 828void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
 829			  unsigned long long addr, u32 pasid)
 830{
 831	struct pasid_dir_entry *dir, *pde;
 832	struct pasid_entry *entries, *pte;
 833	struct context_entry *ctx_entry;
 834	struct root_entry *rt_entry;
 835	int i, dir_index, index, level;
 836	u8 devfn = source_id & 0xff;
 837	u8 bus = source_id >> 8;
 838	struct dma_pte *pgtable;
 839
 840	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
 841
 842	/* root entry dump */
 843	rt_entry = &iommu->root_entry[bus];
 844	if (!rt_entry) {
 845		pr_info("root table entry is not present\n");
 846		return;
 847	}
 848
 849	if (sm_supported(iommu))
 850		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
 851			rt_entry->hi, rt_entry->lo);
 852	else
 853		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
 854
 855	/* context entry dump */
 856	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
 857	if (!ctx_entry) {
 858		pr_info("context table entry is not present\n");
 859		return;
 860	}
 861
 862	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
 863		ctx_entry->hi, ctx_entry->lo);
 864
 865	/* legacy mode does not require PASID entries */
 866	if (!sm_supported(iommu)) {
 867		level = agaw_to_level(ctx_entry->hi & 7);
 868		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 869		goto pgtable_walk;
 870	}
 871
 872	/* get the pointer to pasid directory entry */
 873	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 874	if (!dir) {
 875		pr_info("pasid directory entry is not present\n");
 876		return;
 877	}
 878	/* For request-without-pasid, get the pasid from context entry */
 879	if (intel_iommu_sm && pasid == INVALID_IOASID)
 880		pasid = PASID_RID2PASID;
 881
 882	dir_index = pasid >> PASID_PDE_SHIFT;
 883	pde = &dir[dir_index];
 884	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
 885
 886	/* get the pointer to the pasid table entry */
 887	entries = get_pasid_table_from_pde(pde);
 888	if (!entries) {
 889		pr_info("pasid table entry is not present\n");
 890		return;
 891	}
 892	index = pasid & PASID_PTE_MASK;
 893	pte = &entries[index];
 894	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
 895		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
 896
 897	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
 898		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
 899		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
 900	} else {
 901		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
 902		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
 903	}
 904
 905pgtable_walk:
 906	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
 907}
 908#endif
 909
 910static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 911				      unsigned long pfn, int *target_level)
 912{
 913	struct dma_pte *parent, *pte;
 914	int level = agaw_to_level(domain->agaw);
 915	int offset;
 916
 917	BUG_ON(!domain->pgd);
 918
 919	if (!domain_pfn_supported(domain, pfn))
 920		/* Address beyond IOMMU's addressing capabilities. */
 921		return NULL;
 922
 923	parent = domain->pgd;
 924
 925	while (1) {
 926		void *tmp_page;
 927
 928		offset = pfn_level_offset(pfn, level);
 929		pte = &parent[offset];
 930		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 931			break;
 932		if (level == *target_level)
 933			break;
 934
 935		if (!dma_pte_present(pte)) {
 936			uint64_t pteval;
 937
 938			tmp_page = alloc_pgtable_page(domain->nid);
 939
 940			if (!tmp_page)
 941				return NULL;
 942
 943			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 944			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 945			if (domain->use_first_level)
 946				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
 947
 948			if (cmpxchg64(&pte->val, 0ULL, pteval))
 949				/* Someone else set it while we were thinking; use theirs. */
 950				free_pgtable_page(tmp_page);
 951			else
 952				domain_flush_cache(domain, pte, sizeof(*pte));
 953		}
 954		if (level == 1)
 955			break;
 956
 957		parent = phys_to_virt(dma_pte_addr(pte));
 958		level--;
 959	}
 960
 961	if (!*target_level)
 962		*target_level = level;
 963
 964	return pte;
 965}
 966
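/*
 * Illustrative example (not part of the driver): with a 4-level table
 * (agaw 2, so agaw_to_level() == 4), a caller preparing a 2MiB mapping
 * passes *target_level = 2 and the walk above stops at level 2,
 * allocating intermediate tables on the way down if needed.  A caller
 * passing *target_level = 0 instead stops at the first superpage or
 * non-present entry it meets and gets that level reported back through
 * *target_level.
 */
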
 967/* return address's pte at specific level */
 968static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 969					 unsigned long pfn,
 970					 int level, int *large_page)
 971{
 972	struct dma_pte *parent, *pte;
 973	int total = agaw_to_level(domain->agaw);
 974	int offset;
 975
 976	parent = domain->pgd;
 977	while (level <= total) {
 978		offset = pfn_level_offset(pfn, total);
 979		pte = &parent[offset];
 980		if (level == total)
 981			return pte;
 982
 983		if (!dma_pte_present(pte)) {
 984			*large_page = total;
 985			break;
 986		}
 987
 988		if (dma_pte_superpage(pte)) {
 989			*large_page = total;
 990			return pte;
 991		}
 992
 993		parent = phys_to_virt(dma_pte_addr(pte));
 994		total--;
 995	}
 996	return NULL;
 997}
 998
 999/* clear last level pte, a tlb flush should be followed */
1000static void dma_pte_clear_range(struct dmar_domain *domain,
1001				unsigned long start_pfn,
1002				unsigned long last_pfn)
1003{
1004	unsigned int large_page;
1005	struct dma_pte *first_pte, *pte;
1006
1007	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009	BUG_ON(start_pfn > last_pfn);
1010
1011	/* we don't need lock here; nobody else touches the iova range */
1012	do {
1013		large_page = 1;
1014		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015		if (!pte) {
1016			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1017			continue;
1018		}
1019		do {
1020			dma_clear_pte(pte);
1021			start_pfn += lvl_to_nr_pages(large_page);
1022			pte++;
1023		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024
1025		domain_flush_cache(domain, first_pte,
1026				   (void *)pte - (void *)first_pte);
1027
1028	} while (start_pfn && start_pfn <= last_pfn);
1029}
1030
1031static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032			       int retain_level, struct dma_pte *pte,
1033			       unsigned long pfn, unsigned long start_pfn,
1034			       unsigned long last_pfn)
1035{
1036	pfn = max(start_pfn, pfn);
1037	pte = &pte[pfn_level_offset(pfn, level)];
1038
1039	do {
1040		unsigned long level_pfn;
1041		struct dma_pte *level_pte;
1042
1043		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044			goto next;
1045
1046		level_pfn = pfn & level_mask(level);
1047		level_pte = phys_to_virt(dma_pte_addr(pte));
1048
1049		if (level > 2) {
1050			dma_pte_free_level(domain, level - 1, retain_level,
1051					   level_pte, level_pfn, start_pfn,
1052					   last_pfn);
1053		}
1054
1055		/*
1056		 * Free the page table if we're below the level we want to
1057		 * retain and the range covers the entire table.
1058		 */
1059		if (level < retain_level && !(start_pfn > level_pfn ||
1060		      last_pfn < level_pfn + level_size(level) - 1)) {
1061			dma_clear_pte(pte);
1062			domain_flush_cache(domain, pte, sizeof(*pte));
1063			free_pgtable_page(level_pte);
1064		}
1065next:
1066		pfn += level_size(level);
1067	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068}
1069
1070/*
1071 * clear last level (leaf) ptes and free page table pages below the
1072 * level we wish to keep intact.
1073 */
1074static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075				   unsigned long start_pfn,
1076				   unsigned long last_pfn,
1077				   int retain_level)
1078{
1079	dma_pte_clear_range(domain, start_pfn, last_pfn);
1080
1081	/* We don't need lock here; nobody else touches the iova range */
1082	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1083			   domain->pgd, 0, start_pfn, last_pfn);
1084
1085	/* free pgd */
1086	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087		free_pgtable_page(domain->pgd);
1088		domain->pgd = NULL;
1089	}
1090}
1091
1092/* When a page at a given level is being unlinked from its parent, we don't
1093   need to *modify* it at all. All we need to do is make a list of all the
1094   pages which can be freed just as soon as we've flushed the IOTLB and we
1095   know the hardware page-walk will no longer touch them.
1096   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1097   be freed. */
1098static void dma_pte_list_pagetables(struct dmar_domain *domain,
1099				    int level, struct dma_pte *pte,
1100				    struct list_head *freelist)
1101{
1102	struct page *pg;
1103
1104	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1105	list_add_tail(&pg->lru, freelist);
1106
1107	if (level == 1)
1108		return;
1109
1110	pte = page_address(pg);
1111	do {
1112		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1113			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114		pte++;
1115	} while (!first_pte_in_page(pte));
1116}
1117
1118static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1119				struct dma_pte *pte, unsigned long pfn,
1120				unsigned long start_pfn, unsigned long last_pfn,
1121				struct list_head *freelist)
1122{
1123	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1124
1125	pfn = max(start_pfn, pfn);
1126	pte = &pte[pfn_level_offset(pfn, level)];
1127
1128	do {
1129		unsigned long level_pfn = pfn & level_mask(level);
1130
1131		if (!dma_pte_present(pte))
1132			goto next;
1133
1134		/* If range covers entire pagetable, free it */
1135		if (start_pfn <= level_pfn &&
1136		    last_pfn >= level_pfn + level_size(level) - 1) {
1137			/* These subordinate page tables are going away entirely. Don't
1138			   bother to clear them; we're just going to *free* them. */
1139			if (level > 1 && !dma_pte_superpage(pte))
1140				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1141
1142			dma_clear_pte(pte);
1143			if (!first_pte)
1144				first_pte = pte;
1145			last_pte = pte;
1146		} else if (level > 1) {
1147			/* Recurse down into a level that isn't *entirely* obsolete */
1148			dma_pte_clear_level(domain, level - 1,
1149					    phys_to_virt(dma_pte_addr(pte)),
1150					    level_pfn, start_pfn, last_pfn,
1151					    freelist);
1152		}
1153next:
1154		pfn = level_pfn + level_size(level);
1155	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1156
1157	if (first_pte)
1158		domain_flush_cache(domain, first_pte,
1159				   (void *)++last_pte - (void *)first_pte);
1160}
1161
1162/* We can't just free the pages because the IOMMU may still be walking
1163   the page tables, and may have cached the intermediate levels. The
1164   pages can only be freed after the IOTLB flush has been done. */
1165static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1166			 unsigned long last_pfn, struct list_head *freelist)
1167{
1168	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1169	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1170	BUG_ON(start_pfn > last_pfn);
1171
1172	/* we don't need lock here; nobody else touches the iova range */
1173	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1174			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1175
1176	/* free pgd */
1177	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178		struct page *pgd_page = virt_to_page(domain->pgd);
1179		list_add_tail(&pgd_page->lru, freelist);
1180		domain->pgd = NULL;
1181	}
1182}
1183
1184/* iommu handling */
1185static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186{
1187	struct root_entry *root;
1188
1189	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190	if (!root) {
1191		pr_err("Allocating root entry for %s failed\n",
1192			iommu->name);
1193		return -ENOMEM;
1194	}
1195
1196	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1197	iommu->root_entry = root;
1198
1199	return 0;
1200}
1201
1202static void iommu_set_root_entry(struct intel_iommu *iommu)
1203{
1204	u64 addr;
1205	u32 sts;
1206	unsigned long flag;
1207
1208	addr = virt_to_phys(iommu->root_entry);
1209	if (sm_supported(iommu))
1210		addr |= DMA_RTADDR_SMT;
1211
1212	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1214
1215	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1216
1217	/* Make sure hardware complete it */
1218	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1219		      readl, (sts & DMA_GSTS_RTPS), sts);
1220
1221	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222
1223	/*
1224	 * Hardware invalidates all DMA remapping hardware translation
1225	 * caches as part of SRTP flow.
1226	 */
1227	if (cap_esrtps(iommu->cap))
1228		return;
1229
1230	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1231	if (sm_supported(iommu))
1232		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1233	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1234}
1235
1236void iommu_flush_write_buffer(struct intel_iommu *iommu)
1237{
1238	u32 val;
1239	unsigned long flag;
1240
1241	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1242		return;
1243
1244	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1245	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1246
1247	/* Make sure hardware complete it */
1248	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1249		      readl, (!(val & DMA_GSTS_WBFS)), val);
1250
1251	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1252}
1253
1254/* The return value determines whether we need a write buffer flush */
1255static void __iommu_flush_context(struct intel_iommu *iommu,
1256				  u16 did, u16 source_id, u8 function_mask,
1257				  u64 type)
1258{
1259	u64 val = 0;
1260	unsigned long flag;
1261
1262	switch (type) {
1263	case DMA_CCMD_GLOBAL_INVL:
1264		val = DMA_CCMD_GLOBAL_INVL;
1265		break;
1266	case DMA_CCMD_DOMAIN_INVL:
1267		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1268		break;
1269	case DMA_CCMD_DEVICE_INVL:
1270		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1271			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1272		break;
1273	default:
1274		BUG();
1275	}
1276	val |= DMA_CCMD_ICC;
1277
1278	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1279	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1280
1281	/* Make sure hardware complete it */
1282	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1283		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1284
1285	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1286}
1287
1288/* The return value determines whether we need a write buffer flush */
1289static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1290				u64 addr, unsigned int size_order, u64 type)
1291{
1292	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1293	u64 val = 0, val_iva = 0;
1294	unsigned long flag;
1295
1296	switch (type) {
1297	case DMA_TLB_GLOBAL_FLUSH:
1298		/* a global flush doesn't need to set IVA_REG */
1299		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1300		break;
1301	case DMA_TLB_DSI_FLUSH:
1302		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303		break;
1304	case DMA_TLB_PSI_FLUSH:
1305		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306		/* IH bit is passed in as part of address */
1307		val_iva = size_order | addr;
1308		break;
1309	default:
1310		BUG();
1311	}
1312	/* Note: set drain read/write */
1313#if 0
1314	/*
1315	 * This is probably meant to be extra safe. It looks like we can
1316	 * ignore it without any impact.
1317	 */
1318	if (cap_read_drain(iommu->cap))
1319		val |= DMA_TLB_READ_DRAIN;
1320#endif
1321	if (cap_write_drain(iommu->cap))
1322		val |= DMA_TLB_WRITE_DRAIN;
1323
1324	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325	/* Note: Only uses first TLB reg currently */
1326	if (val_iva)
1327		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330	/* Make sure hardware complete it */
1331	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336	/* check IOTLB invalidation granularity */
1337	if (DMA_TLB_IAIG(val) == 0)
1338		pr_err("Flush IOTLB failed\n");
1339	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340		pr_debug("TLB flush request %Lx, actual %Lx\n",
1341			(unsigned long long)DMA_TLB_IIRG(type),
1342			(unsigned long long)DMA_TLB_IAIG(val));
1343}
1344
1345static struct device_domain_info *
1346domain_lookup_dev_info(struct dmar_domain *domain,
1347		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1348{
1349	struct device_domain_info *info;
1350	unsigned long flags;
1351
1352	spin_lock_irqsave(&domain->lock, flags);
1353	list_for_each_entry(info, &domain->devices, link) {
1354		if (info->iommu == iommu && info->bus == bus &&
1355		    info->devfn == devfn) {
1356			spin_unlock_irqrestore(&domain->lock, flags);
1357			return info;
1358		}
1359	}
1360	spin_unlock_irqrestore(&domain->lock, flags);
1361
1362	return NULL;
1363}
1364
1365static void domain_update_iotlb(struct dmar_domain *domain)
1366{
1367	struct device_domain_info *info;
1368	bool has_iotlb_device = false;
1369	unsigned long flags;
1370
1371	spin_lock_irqsave(&domain->lock, flags);
1372	list_for_each_entry(info, &domain->devices, link) {
1373		if (info->ats_enabled) {
1374			has_iotlb_device = true;
1375			break;
1376		}
1377	}
1378	domain->has_iotlb_device = has_iotlb_device;
1379	spin_unlock_irqrestore(&domain->lock, flags);
1380}
1381
1382/*
1383 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1384 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1385 * check because it applies only to the built-in QAT devices and it doesn't
1386 * grant additional privileges.
1387 */
1388#define BUGGY_QAT_DEVID_MASK 0x4940
1389static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1390{
1391	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1392		return false;
1393
1394	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1395		return false;
1396
1397	return true;
1398}
1399
1400static void iommu_enable_pci_caps(struct device_domain_info *info)
1401{
1402	struct pci_dev *pdev;
1403
1404	if (!dev_is_pci(info->dev))
1405		return;
1406
1407	pdev = to_pci_dev(info->dev);
1408	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1409	 * the PFSID to the invalidation descriptor of a VF so that the IOMMU HW
1410	 * can gauge queue depth at the PF level. If DIT is not set, PFSID is
1411	 * treated as reserved and should be set to 0.
1412	 */
1413	if (!ecap_dit(info->iommu->ecap))
1414		info->pfsid = 0;
1415	else {
1416		struct pci_dev *pf_pdev;
1417
1418		/* pdev will be returned if the device is not a VF */
1419		pf_pdev = pci_physfn(pdev);
1420		info->pfsid = pci_dev_id(pf_pdev);
1421	}
1422
1423	/* The PCIe spec, in its wisdom, declares that the behaviour of
1424	   the device if you enable PASID support after ATS support is
1425	   undefined. So always enable PASID support on devices which
1426	   have it, even if we can't yet know if we're ever going to
1427	   use it. */
1428	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1429		info->pasid_enabled = 1;
1430
1431	if (info->pri_supported &&
1432	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1433	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1434		info->pri_enabled = 1;
1435
1436	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1437	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1438		info->ats_enabled = 1;
1439		domain_update_iotlb(info->domain);
1440		info->ats_qdep = pci_ats_queue_depth(pdev);
1441	}
1442}
1443
1444static void iommu_disable_pci_caps(struct device_domain_info *info)
1445{
1446	struct pci_dev *pdev;
1447
1448	if (!dev_is_pci(info->dev))
1449		return;
1450
1451	pdev = to_pci_dev(info->dev);
1452
1453	if (info->ats_enabled) {
1454		pci_disable_ats(pdev);
1455		info->ats_enabled = 0;
1456		domain_update_iotlb(info->domain);
1457	}
1458
1459	if (info->pri_enabled) {
1460		pci_disable_pri(pdev);
1461		info->pri_enabled = 0;
1462	}
1463
1464	if (info->pasid_enabled) {
1465		pci_disable_pasid(pdev);
1466		info->pasid_enabled = 0;
1467	}
1468}
1469
1470static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1471				    u64 addr, unsigned int mask)
1472{
1473	u16 sid, qdep;
1474
1475	if (!info || !info->ats_enabled)
1476		return;
1477
1478	sid = info->bus << 8 | info->devfn;
1479	qdep = info->ats_qdep;
1480	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481			   qdep, addr, mask);
1482	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1483}
1484
1485static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486				  u64 addr, unsigned mask)
1487{
1488	struct device_domain_info *info;
1489	unsigned long flags;
1490
1491	if (!domain->has_iotlb_device)
1492		return;
1493
1494	spin_lock_irqsave(&domain->lock, flags);
1495	list_for_each_entry(info, &domain->devices, link)
1496		__iommu_flush_dev_iotlb(info, addr, mask);
1497	spin_unlock_irqrestore(&domain->lock, flags);
1498}
1499
1500static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501				  struct dmar_domain *domain,
1502				  unsigned long pfn, unsigned int pages,
1503				  int ih, int map)
1504{
1505	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506	unsigned int mask = ilog2(aligned_pages);
1507	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508	u16 did = domain_id_iommu(domain, iommu);
1509
1510	BUG_ON(pages == 0);
1511
1512	if (ih)
1513		ih = 1 << 6;
1514
1515	if (domain->use_first_level) {
1516		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517	} else {
1518		unsigned long bitmask = aligned_pages - 1;
1519
1520		/*
1521		 * PSI masks the low order bits of the base address. If the
1522		 * address isn't aligned to the mask, then compute a mask value
1523		 * needed to ensure the target range is flushed.
1524		 */
1525		if (unlikely(bitmask & pfn)) {
1526			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527
1528			/*
1529			 * Since end_pfn <= pfn + bitmask, the only way bits
1530			 * higher than bitmask can differ in pfn and end_pfn is
1531			 * by carrying. This means after masking out bitmask,
1532			 * high bits starting with the first set bit in
1533			 * shared_bits are all equal in both pfn and end_pfn.
1534			 */
1535			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537		}
1538
1539		/*
1540		 * Fallback to domain selective flush if no PSI support or
1541		 * the size is too big.
1542		 */
1543		if (!cap_pgsel_inv(iommu->cap) ||
1544		    mask > cap_max_amask_val(iommu->cap))
1545			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546							DMA_TLB_DSI_FLUSH);
1547		else
1548			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549							DMA_TLB_PSI_FLUSH);
1550	}
1551
1552	/*
1553	 * In caching mode, changing pages from non-present to present requires
1554	 * a flush. However, the device IOTLB doesn't need to be flushed here.
1555	 */
1556	if (!cap_caching_mode(iommu->cap) || !map)
1557		iommu_flush_dev_iotlb(domain, addr, mask);
1558}
1559
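/*
 * Worked example for the unaligned-PSI path above (illustrative only):
 * pfn = 0x11 and pages = 2 give aligned_pages = 2 and bitmask = 0x1,
 * and since pfn has bit 0 set the simple mask would miss the range:
 *
 *	end_pfn       = 0x12
 *	pfn ^ end_pfn = 0x03
 *	shared_bits   = ~0x03 & ~0x01   (bits 0 and 1 cleared)
 *	mask          = __ffs(shared_bits) = 2
 *
 * A mask of 2 flushes the four pages starting at pfn 0x10, which
 * covers both 0x11 and 0x12.
 */
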
1560/* Notification for newly created mappings */
1561static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562					struct dmar_domain *domain,
1563					unsigned long pfn, unsigned int pages)
1564{
1565	/*
1566	 * It's a non-present to present mapping. Only flush if caching mode
1567	 * is enabled and second-level translation is in use.
1568	 */
1569	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1570		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571	else
1572		iommu_flush_write_buffer(iommu);
1573}
1574
1575static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576{
1577	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578	struct iommu_domain_info *info;
1579	unsigned long idx;
1580
1581	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582		struct intel_iommu *iommu = info->iommu;
1583		u16 did = domain_id_iommu(dmar_domain, iommu);
1584
1585		if (dmar_domain->use_first_level)
1586			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587		else
1588			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589						 DMA_TLB_DSI_FLUSH);
1590
1591		if (!cap_caching_mode(iommu->cap))
1592			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1593	}
1594}
1595
1596static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597{
1598	u32 pmen;
1599	unsigned long flags;
1600
1601	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602		return;
1603
1604	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606	pmen &= ~DMA_PMEN_EPM;
1607	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608
1609	/* wait for the protected region status bit to clear */
1610	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611		readl, !(pmen & DMA_PMEN_PRS), pmen);
1612
1613	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614}
1615
1616static void iommu_enable_translation(struct intel_iommu *iommu)
1617{
1618	u32 sts;
1619	unsigned long flags;
1620
1621	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622	iommu->gcmd |= DMA_GCMD_TE;
1623	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624
1625	/* Make sure hardware complete it */
1626	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627		      readl, (sts & DMA_GSTS_TES), sts);
1628
1629	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630}
1631
1632static void iommu_disable_translation(struct intel_iommu *iommu)
1633{
1634	u32 sts;
1635	unsigned long flag;
1636
1637	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639		return;
1640
1641	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642	iommu->gcmd &= ~DMA_GCMD_TE;
1643	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644
1645	/* Make sure hardware complete it */
1646	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647		      readl, (!(sts & DMA_GSTS_TES)), sts);
1648
1649	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650}
1651
1652static int iommu_init_domains(struct intel_iommu *iommu)
1653{
1654	u32 ndomains;
1655
1656	ndomains = cap_ndoms(iommu->cap);
1657	pr_debug("%s: Number of Domains supported <%d>\n",
1658		 iommu->name, ndomains);
1659
1660	spin_lock_init(&iommu->lock);
1661
1662	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663	if (!iommu->domain_ids)
1664		return -ENOMEM;
1665
1666	/*
1667	 * If Caching mode is set, then invalid translations are tagged
1668	 * with domain-id 0, hence we need to pre-allocate it. We also
1669	 * use domain-id 0 as a marker for non-allocated domain-id, so
1670	 * make sure it is not used for a real domain.
1671	 */
1672	set_bit(0, iommu->domain_ids);
1673
1674	/*
1675	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1676	 * entry for first-level or pass-through translation modes should
1677	 * be programmed with a domain id different from those used for
1678	 * second-level or nested translation. We reserve a domain id for
1679	 * this purpose.
1680	 */
1681	if (sm_supported(iommu))
1682		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683
1684	return 0;
1685}
1686
1687static void disable_dmar_iommu(struct intel_iommu *iommu)
1688{
1689	if (!iommu->domain_ids)
1690		return;
1691
1692	/*
1693	 * All iommu domains must have been detached from the devices,
1694	 * hence there should be no domain IDs in use.
1695	 */
1696	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697		    > NUM_RESERVED_DID))
1698		return;
1699
1700	if (iommu->gcmd & DMA_GCMD_TE)
1701		iommu_disable_translation(iommu);
1702}
1703
1704static void free_dmar_iommu(struct intel_iommu *iommu)
1705{
1706	if (iommu->domain_ids) {
1707		bitmap_free(iommu->domain_ids);
1708		iommu->domain_ids = NULL;
1709	}
1710
1711	if (iommu->copied_tables) {
1712		bitmap_free(iommu->copied_tables);
1713		iommu->copied_tables = NULL;
1714	}
1715
1716	/* free context mapping */
1717	free_context_table(iommu);
1718
1719#ifdef CONFIG_INTEL_IOMMU_SVM
1720	if (pasid_supported(iommu)) {
1721		if (ecap_prs(iommu->ecap))
1722			intel_svm_finish_prq(iommu);
1723	}
1724	if (vccap_pasid(iommu->vccap))
1725		ioasid_unregister_allocator(&iommu->pasid_allocator);
1726
1727#endif
1728}
1729
1730/*
1731 * Check and return whether first level is used by default for
1732 * DMA translation.
1733 */
1734static bool first_level_by_default(unsigned int type)
1735{
1736	/* Only SL is available in legacy mode */
1737	if (!scalable_mode_support())
1738		return false;
1739
1740	/* Only one level (either FL or SL) is available, just use it */
1741	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742		return intel_cap_flts_sanity();
1743
1744	/* Both levels are available, decide it based on domain type */
1745	return type != IOMMU_DOMAIN_UNMANAGED;
1746}
1747
1748static struct dmar_domain *alloc_domain(unsigned int type)
1749{
1750	struct dmar_domain *domain;
1751
1752	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753	if (!domain)
1754		return NULL;
1755
1756	domain->nid = NUMA_NO_NODE;
1757	if (first_level_by_default(type))
1758		domain->use_first_level = true;
1759	domain->has_iotlb_device = false;
1760	INIT_LIST_HEAD(&domain->devices);
1761	spin_lock_init(&domain->lock);
1762	xa_init(&domain->iommu_array);
1763
1764	return domain;
1765}
1766
1767static int domain_attach_iommu(struct dmar_domain *domain,
1768			       struct intel_iommu *iommu)
1769{
1770	struct iommu_domain_info *info, *curr;
1771	unsigned long ndomains;
1772	int num, ret = -ENOSPC;
1773
1774	info = kzalloc(sizeof(*info), GFP_KERNEL);
1775	if (!info)
1776		return -ENOMEM;
1777
1778	spin_lock(&iommu->lock);
1779	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780	if (curr) {
1781		curr->refcnt++;
1782		spin_unlock(&iommu->lock);
1783		kfree(info);
1784		return 0;
1785	}
1786
1787	ndomains = cap_ndoms(iommu->cap);
1788	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789	if (num >= ndomains) {
1790		pr_err("%s: No free domain ids\n", iommu->name);
1791		goto err_unlock;
1792	}
1793
1794	set_bit(num, iommu->domain_ids);
1795	info->refcnt	= 1;
1796	info->did	= num;
1797	info->iommu	= iommu;
1798	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799			  NULL, info, GFP_ATOMIC);
1800	if (curr) {
1801		ret = xa_err(curr) ? : -EBUSY;
1802		goto err_clear;
1803	}
1804	domain_update_iommu_cap(domain);
1805
1806	spin_unlock(&iommu->lock);
1807	return 0;
1808
1809err_clear:
1810	clear_bit(info->did, iommu->domain_ids);
1811err_unlock:
1812	spin_unlock(&iommu->lock);
1813	kfree(info);
1814	return ret;
1815}
1816
1817static void domain_detach_iommu(struct dmar_domain *domain,
1818				struct intel_iommu *iommu)
1819{
1820	struct iommu_domain_info *info;
1821
1822	spin_lock(&iommu->lock);
1823	info = xa_load(&domain->iommu_array, iommu->seq_id);
1824	if (--info->refcnt == 0) {
1825		clear_bit(info->did, iommu->domain_ids);
1826		xa_erase(&domain->iommu_array, iommu->seq_id);
1827		domain->nid = NUMA_NO_NODE;
1828		domain_update_iommu_cap(domain);
1829		kfree(info);
1830	}
1831	spin_unlock(&iommu->lock);
1832}
1833
1834static inline int guestwidth_to_adjustwidth(int gaw)
1835{
1836	int agaw;
1837	int r = (gaw - 12) % 9;
1838
1839	if (r == 0)
1840		agaw = gaw;
1841	else
1842		agaw = gaw + 9 - r;
1843	if (agaw > 64)
1844		agaw = 64;
1845	return agaw;
1846}
1847
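/*
 * Worked examples (illustrative only) for the adjustment above:
 *
 *	gaw = 48:  r = (48 - 12) % 9 = 0  ->  agaw = 48
 *	gaw = 50:  r = (50 - 12) % 9 = 2  ->  agaw = 50 + 9 - 2 = 57
 *
 * i.e. the guest width is rounded up to the next width the page-table
 * levels can express (12 + 9 * n bits), capped at 64.
 */
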
1848static void domain_exit(struct dmar_domain *domain)
1849{
1850	if (domain->pgd) {
1851		LIST_HEAD(freelist);
1852
1853		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854		put_pages_list(&freelist);
1855	}
1856
1857	if (WARN_ON(!list_empty(&domain->devices)))
1858		return;
1859
1860	kfree(domain);
1861}
1862
1863/*
1864 * Get the PASID directory size for scalable mode context entry.
1865 * Value of X in the PDTS field of a scalable mode context entry
1866 * indicates a PASID directory with 2^(X + 7) entries.
1867 */
1868static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869{
1870	unsigned long pds, max_pde;
1871
1872	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874	if (pds < 7)
1875		return 0;
1876
1877	return pds - 7;
1878}
1879
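/*
 * Worked example (illustrative only), with PASID_PDE_SHIFT == 6: a
 * pasid table covering 2^20 PASIDs gives max_pde = 1 << 14, so
 * find_first_bit() returns 14 and pds = 14 - 7 = 7.  Programming 7
 * into the PDTS field advertises 2^(7 + 7) = 16384 directory entries,
 * exactly the 2^14 PDEs needed for 2^20 PASIDs.
 */
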
1880/*
1881 * Set the RID_PASID field of a scalable mode context entry. The
1882 * IOMMU hardware will use the PASID value set in this field for
1883 * DMA translations of DMA requests without PASID.
1884 */
1885static inline void
1886context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887{
1888	context->hi |= pasid & ((1 << 20) - 1);
1889}
1890
1891/*
1892 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893 * entry.
1894 */
1895static inline void context_set_sm_dte(struct context_entry *context)
1896{
1897	context->lo |= (1 << 2);
1898}
1899
1900/*
1901 * Set the PRE(Page Request Enable) field of a scalable mode context
1902 * entry.
1903 */
1904static inline void context_set_sm_pre(struct context_entry *context)
1905{
1906	context->lo |= (1 << 4);
1907}
1908
1909/* Convert value to context PASID directory size field coding. */
1910#define context_pdts(pds)	(((pds) & 0x7) << 9)
1911
1912static int domain_context_mapping_one(struct dmar_domain *domain,
1913				      struct intel_iommu *iommu,
1914				      struct pasid_table *table,
1915				      u8 bus, u8 devfn)
1916{
1917	struct device_domain_info *info =
1918			domain_lookup_dev_info(domain, iommu, bus, devfn);
1919	u16 did = domain_id_iommu(domain, iommu);
1920	int translation = CONTEXT_TT_MULTI_LEVEL;
1921	struct context_entry *context;
1922	int ret;
1923
1924	WARN_ON(did == 0);
1925
1926	if (hw_pass_through && domain_type_is_si(domain))
1927		translation = CONTEXT_TT_PASS_THROUGH;
1928
1929	pr_debug("Set context mapping for %02x:%02x.%d\n",
1930		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931
1932	BUG_ON(!domain->pgd);
1933
1934	spin_lock(&iommu->lock);
1935	ret = -ENOMEM;
1936	context = iommu_context_addr(iommu, bus, devfn, 1);
1937	if (!context)
1938		goto out_unlock;
1939
1940	ret = 0;
1941	if (context_present(context) && !context_copied(iommu, bus, devfn))
1942		goto out_unlock;
1943
1944	/*
1945	 * For kdump cases, old valid entries may be cached due to the
1946	 * in-flight DMA and copied pgtable, but there is no unmapping
1947	 * behaviour for them, thus we need an explicit cache flush for
1948	 * the newly-mapped device. For kdump, at this point, the device
1949	 * is supposed to have finished reset at its driver probe stage, so
1950	 * no in-flight DMA will exist, and we don't need to worry about it
1951	 * hereafter.
1952	 */
1953	if (context_copied(iommu, bus, devfn)) {
1954		u16 did_old = context_domain_id(context);
1955
1956		if (did_old < cap_ndoms(iommu->cap)) {
1957			iommu->flush.flush_context(iommu, did_old,
1958						   (((u16)bus) << 8) | devfn,
1959						   DMA_CCMD_MASK_NOBIT,
1960						   DMA_CCMD_DEVICE_INVL);
1961			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962						 DMA_TLB_DSI_FLUSH);
1963		}
1964
1965		clear_context_copied(iommu, bus, devfn);
1966	}
1967
1968	context_clear_entry(context);
1969
1970	if (sm_supported(iommu)) {
1971		unsigned long pds;
1972
1973		WARN_ON(!table);
1974
1975		/* Setup the PASID DIR pointer: */
1976		pds = context_get_sm_pds(table);
1977		context->lo = (u64)virt_to_phys(table->table) |
1978				context_pdts(pds);
1979
1980		/* Setup the RID_PASID field: */
1981		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982
1983		/*
1984		 * Setup the Device-TLB enable bit and Page request
1985		 * Enable bit:
1986		 */
1987		if (info && info->ats_supported)
1988			context_set_sm_dte(context);
1989		if (info && info->pri_supported)
1990			context_set_sm_pre(context);
1991		if (info && info->pasid_supported)
1992			context_set_pasid(context);
1993	} else {
1994		struct dma_pte *pgd = domain->pgd;
1995		int agaw;
1996
1997		context_set_domain_id(context, did);
1998
1999		if (translation != CONTEXT_TT_PASS_THROUGH) {
2000			/*
2001			 * Skip top levels of page tables for iommu which has
2002			 * less agaw than default. Unnecessary for PT mode.
2003			 */
2004			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005				ret = -ENOMEM;
2006				pgd = phys_to_virt(dma_pte_addr(pgd));
2007				if (!dma_pte_present(pgd))
2008					goto out_unlock;
2009			}
2010
2011			if (info && info->ats_supported)
2012				translation = CONTEXT_TT_DEV_IOTLB;
2013			else
2014				translation = CONTEXT_TT_MULTI_LEVEL;
2015
2016			context_set_address_root(context, virt_to_phys(pgd));
2017			context_set_address_width(context, agaw);
2018		} else {
2019			/*
2020			 * In pass through mode, AW must be programmed to
2021			 * indicate the largest AGAW value supported by
2022			 * hardware. And ASR is ignored by hardware.
2023			 */
2024			context_set_address_width(context, iommu->msagaw);
2025		}
2026
2027		context_set_translation_type(context, translation);
2028	}
2029
2030	context_set_fault_enable(context);
2031	context_set_present(context);
2032	if (!ecap_coherent(iommu->ecap))
2033		clflush_cache_range(context, sizeof(*context));
2034
2035	/*
2036	 * It's a non-present to present mapping. If the hardware doesn't cache
2037	 * non-present entries we only need to flush the write-buffer. If it
2038	 * _does_ cache non-present entries, then it does so in the special
2039	 * domain #0, which we have to flush:
2040	 */
2041	if (cap_caching_mode(iommu->cap)) {
2042		iommu->flush.flush_context(iommu, 0,
2043					   (((u16)bus) << 8) | devfn,
2044					   DMA_CCMD_MASK_NOBIT,
2045					   DMA_CCMD_DEVICE_INVL);
2046		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047	} else {
2048		iommu_flush_write_buffer(iommu);
2049	}
2050
2051	ret = 0;
2052
2053out_unlock:
2054	spin_unlock(&iommu->lock);
2055
2056	return ret;
2057}
2058
2059struct domain_context_mapping_data {
2060	struct dmar_domain *domain;
2061	struct intel_iommu *iommu;
2062	struct pasid_table *table;
2063};
2064
2065static int domain_context_mapping_cb(struct pci_dev *pdev,
2066				     u16 alias, void *opaque)
2067{
2068	struct domain_context_mapping_data *data = opaque;
2069
2070	return domain_context_mapping_one(data->domain, data->iommu,
2071					  data->table, PCI_BUS_NUM(alias),
2072					  alias & 0xff);
2073}
2074
2075static int
2076domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2077{
2078	struct domain_context_mapping_data data;
2079	struct pasid_table *table;
2080	struct intel_iommu *iommu;
2081	u8 bus, devfn;
2082
2083	iommu = device_to_iommu(dev, &bus, &devfn);
2084	if (!iommu)
2085		return -ENODEV;
2086
2087	table = intel_pasid_get_table(dev);
2088
2089	if (!dev_is_pci(dev))
2090		return domain_context_mapping_one(domain, iommu, table,
2091						  bus, devfn);
2092
2093	data.domain = domain;
2094	data.iommu = iommu;
2095	data.table = table;
2096
2097	return pci_for_each_dma_alias(to_pci_dev(dev),
2098				      &domain_context_mapping_cb, &data);
2099}
2100
2101/* Returns a number of VTD pages, but aligned to MM page size */
2102static inline unsigned long aligned_nrpages(unsigned long host_addr,
2103					    size_t size)
2104{
2105	host_addr &= ~PAGE_MASK;
2106	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2107}
2108
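/*
 * Worked example (illustrative only), assuming 4KiB MM and VT-d pages:
 * host_addr = 0x1234 and size = 0x2000 keep the sub-page offset 0x234,
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000, and 0x3000 >> VTD_PAGE_SHIFT is
 * 3 -- the buffer straddles three pages even though it is only two
 * pages long.
 */
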
2109/* Return largest possible superpage level for a given mapping */
2110static inline int hardware_largepage_caps(struct dmar_domain *domain,
2111					  unsigned long iov_pfn,
2112					  unsigned long phy_pfn,
2113					  unsigned long pages)
2114{
2115	int support, level = 1;
2116	unsigned long pfnmerge;
2117
2118	support = domain->iommu_superpage;
2119
2120	/* To use a large page, the virtual *and* physical addresses
2121	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2122	   of them will mean we have to use smaller pages. So just
2123	   merge them and check both at once. */
2124	pfnmerge = iov_pfn | phy_pfn;
2125
2126	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2127		pages >>= VTD_STRIDE_SHIFT;
2128		if (!pages)
2129			break;
2130		pfnmerge >>= VTD_STRIDE_SHIFT;
2131		level++;
2132		support--;
2133	}
2134	return level;
2135}
2136
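/*
 * Worked example (illustrative only): iov_pfn = 0x80200, phy_pfn =
 * 0x40400 and pages = 1024 on hardware with 2MiB and 1GiB support
 * (support = 2).  The merged pfn has its low 9 bits clear, so the
 * first pass shifts pages down to 2 and raises level to 2; after the
 * shift the merged pfn is no longer stride-aligned, so the loop stops
 * and the helper returns 2, i.e. 2MiB superpages can be used.
 */
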
2137/*
2138 * Ensure that old small page tables are removed to make room for superpage(s).
2139 * We're going to add new large pages, so make sure we don't remove their parent
2140 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2141 */
2142static void switch_to_super_page(struct dmar_domain *domain,
2143				 unsigned long start_pfn,
2144				 unsigned long end_pfn, int level)
2145{
2146	unsigned long lvl_pages = lvl_to_nr_pages(level);
2147	struct iommu_domain_info *info;
2148	struct dma_pte *pte = NULL;
2149	unsigned long i;
2150
2151	while (start_pfn <= end_pfn) {
2152		if (!pte)
2153			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2154
2155		if (dma_pte_present(pte)) {
2156			dma_pte_free_pagetable(domain, start_pfn,
2157					       start_pfn + lvl_pages - 1,
2158					       level + 1);
2159
2160			xa_for_each(&domain->iommu_array, i, info)
2161				iommu_flush_iotlb_psi(info->iommu, domain,
2162						      start_pfn, lvl_pages,
2163						      0, 0);
2164		}
2165
2166		pte++;
2167		start_pfn += lvl_pages;
2168		if (first_pte_in_page(pte))
2169			pte = NULL;
2170	}
2171}
2172
2173static int
2174__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2175		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2176{
2177	struct dma_pte *first_pte = NULL, *pte = NULL;
2178	unsigned int largepage_lvl = 0;
2179	unsigned long lvl_pages = 0;
2180	phys_addr_t pteval;
2181	u64 attr;
2182
2183	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2184
2185	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2186		return -EINVAL;
2187
2188	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2189	attr |= DMA_FL_PTE_PRESENT;
2190	if (domain->use_first_level) {
2191		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2192		if (prot & DMA_PTE_WRITE)
2193			attr |= DMA_FL_PTE_DIRTY;
2194	}
2195
2196	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2197
2198	while (nr_pages > 0) {
2199		uint64_t tmp;
2200
2201		if (!pte) {
2202			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2203					phys_pfn, nr_pages);
2204
2205			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2206			if (!pte)
2207				return -ENOMEM;
2208			first_pte = pte;
2209
2210			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2211
2212			/* It is a large page */
2213			if (largepage_lvl > 1) {
2214				unsigned long end_pfn;
2215				unsigned long pages_to_remove;
2216
2217				pteval |= DMA_PTE_LARGE_PAGE;
2218				pages_to_remove = min_t(unsigned long, nr_pages,
2219							nr_pte_to_next_page(pte) * lvl_pages);
2220				end_pfn = iov_pfn + pages_to_remove - 1;
2221				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2222			} else {
2223				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2224			}
2225
2226		}
2227		/* We don't need lock here, nobody else
2228		 * touches the iova range
2229		 */
2230		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2231		if (tmp) {
2232			static int dumps = 5;
2233			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2234				iov_pfn, tmp, (unsigned long long)pteval);
2235			if (dumps) {
2236				dumps--;
2237				debug_dma_dump_mappings(NULL);
2238			}
2239			WARN_ON(1);
2240		}
2241
2242		nr_pages -= lvl_pages;
2243		iov_pfn += lvl_pages;
2244		phys_pfn += lvl_pages;
2245		pteval += lvl_pages * VTD_PAGE_SIZE;
2246
2247		/* If the next PTE would be the first in a new page, then we
2248		 * need to flush the cache on the entries we've just written.
2249		 * And then we'll need to recalculate 'pte', so clear it and
2250		 * let it get set again in the if (!pte) block above.
2251		 *
2252		 * If we're done (!nr_pages) we need to flush the cache too.
2253		 *
2254		 * Also if we've been setting superpages, we may need to
2255		 * recalculate 'pte' and switch back to smaller pages for the
2256		 * end of the mapping, if the trailing size is not enough to
2257		 * use another superpage (i.e. nr_pages < lvl_pages).
2258		 */
2259		pte++;
2260		if (!nr_pages || first_pte_in_page(pte) ||
2261		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2262			domain_flush_cache(domain, first_pte,
2263					   (void *)pte - (void *)first_pte);
2264			pte = NULL;
2265		}
2266	}
2267
2268	return 0;
2269}
2270
2271static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2272{
2273	struct intel_iommu *iommu = info->iommu;
2274	struct context_entry *context;
2275	u16 did_old;
2276
2277	if (!iommu)
2278		return;
2279
2280	spin_lock(&iommu->lock);
2281	context = iommu_context_addr(iommu, bus, devfn, 0);
2282	if (!context) {
2283		spin_unlock(&iommu->lock);
2284		return;
2285	}
2286
2287	if (sm_supported(iommu)) {
2288		if (hw_pass_through && domain_type_is_si(info->domain))
2289			did_old = FLPT_DEFAULT_DID;
2290		else
2291			did_old = domain_id_iommu(info->domain, iommu);
2292	} else {
2293		did_old = context_domain_id(context);
2294	}
2295
2296	context_clear_entry(context);
2297	__iommu_flush_cache(iommu, context, sizeof(*context));
2298	spin_unlock(&iommu->lock);
2299	iommu->flush.flush_context(iommu,
2300				   did_old,
2301				   (((u16)bus) << 8) | devfn,
2302				   DMA_CCMD_MASK_NOBIT,
2303				   DMA_CCMD_DEVICE_INVL);
2304
2305	if (sm_supported(iommu))
2306		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2307
2308	iommu->flush.flush_iotlb(iommu,
2309				 did_old,
2310				 0,
2311				 0,
2312				 DMA_TLB_DSI_FLUSH);
2313
2314	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2315}
2316
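/*
 * Set up a scalable-mode PASID table entry for @pasid on @dev that uses
 * first-level translation with the domain's page table. Only 4- and
 * 5-level page tables are supported.
 */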
2317static int domain_setup_first_level(struct intel_iommu *iommu,
2318				    struct dmar_domain *domain,
2319				    struct device *dev,
2320				    u32 pasid)
2321{
2322	struct dma_pte *pgd = domain->pgd;
2323	int agaw, level;
2324	int flags = 0;
2325
2326	/*
2327	 * Skip top levels of the page tables for an IOMMU which has
2328	 * a smaller agaw than the default. Unnecessary for PT mode.
2329	 */
2330	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2331		pgd = phys_to_virt(dma_pte_addr(pgd));
2332		if (!dma_pte_present(pgd))
2333			return -ENOMEM;
2334	}
2335
2336	level = agaw_to_level(agaw);
2337	if (level != 4 && level != 5)
2338		return -EINVAL;
2339
2340	if (pasid != PASID_RID2PASID)
2341		flags |= PASID_FLAG_SUPERVISOR_MODE;
2342	if (level == 5)
2343		flags |= PASID_FLAG_FL5LP;
2344
2345	if (domain->force_snooping)
2346		flags |= PASID_FLAG_PAGE_SNOOP;
2347
2348	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2349					     domain_id_iommu(domain, iommu),
2350					     flags);
2351}
2352
2353static bool dev_is_real_dma_subdevice(struct device *dev)
2354{
2355	return dev && dev_is_pci(dev) &&
2356	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2357}
2358
2359static int iommu_domain_identity_map(struct dmar_domain *domain,
2360				     unsigned long first_vpfn,
2361				     unsigned long last_vpfn)
2362{
2363	/*
2364	 * The RMRR range might overlap with the physical memory range;
2365	 * clear it first.
2366	 */
2367	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2368
2369	return __domain_mapping(domain, first_vpfn,
2370				first_vpfn, last_vpfn - first_vpfn + 1,
2371				DMA_PTE_READ|DMA_PTE_WRITE);
2372}
2373
2374static int md_domain_init(struct dmar_domain *domain, int guest_width);
2375
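/*
 * Allocate and initialize the static identity (si) domain. Unless
 * hardware passthrough is used, identity-map all usable physical
 * memory ranges and all RMRR regions into it.
 */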
2376static int __init si_domain_init(int hw)
2377{
2378	struct dmar_rmrr_unit *rmrr;
2379	struct device *dev;
2380	int i, nid, ret;
2381
2382	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2383	if (!si_domain)
2384		return -EFAULT;
2385
2386	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2387		domain_exit(si_domain);
2388		si_domain = NULL;
2389		return -EFAULT;
2390	}
2391
2392	if (hw)
2393		return 0;
2394
2395	for_each_online_node(nid) {
2396		unsigned long start_pfn, end_pfn;
2397		int i;
2398
2399		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2400			ret = iommu_domain_identity_map(si_domain,
2401					mm_to_dma_pfn(start_pfn),
2402					mm_to_dma_pfn(end_pfn));
2403			if (ret)
2404				return ret;
2405		}
2406	}
2407
2408	/*
2409	 * Identity map the RMRRs so that devices with RMRRs can also use
2410	 * the si_domain.
2411	 */
2412	for_each_rmrr_units(rmrr) {
2413		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2414					  i, dev) {
2415			unsigned long long start = rmrr->base_address;
2416			unsigned long long end = rmrr->end_address;
2417
2418			if (WARN_ON(end < start ||
2419				    end >> agaw_to_width(si_domain->agaw)))
2420				continue;
2421
2422			ret = iommu_domain_identity_map(si_domain,
2423					mm_to_dma_pfn(start >> PAGE_SHIFT),
2424					mm_to_dma_pfn(end >> PAGE_SHIFT));
2425			if (ret)
2426				return ret;
2427		}
2428	}
2429
2430	return 0;
2431}
2432
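/*
 * Attach @dev to @domain: bind the domain to the device's IOMMU, set up
 * the RID2PASID entry in scalable mode, install the context entry and
 * enable the device's PCI capabilities. On failure the device is moved
 * to the blocked state.
 */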
2433static int dmar_domain_attach_device(struct dmar_domain *domain,
2434				     struct device *dev)
2435{
2436	struct device_domain_info *info = dev_iommu_priv_get(dev);
2437	struct intel_iommu *iommu;
2438	unsigned long flags;
2439	u8 bus, devfn;
2440	int ret;
2441
2442	iommu = device_to_iommu(dev, &bus, &devfn);
2443	if (!iommu)
2444		return -ENODEV;
2445
2446	ret = domain_attach_iommu(domain, iommu);
2447	if (ret)
2448		return ret;
2449	info->domain = domain;
2450	spin_lock_irqsave(&domain->lock, flags);
2451	list_add(&info->link, &domain->devices);
2452	spin_unlock_irqrestore(&domain->lock, flags);
2453
2454	/* PASID table is mandatory for a PCI device in scalable mode. */
2455	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2456		/* Setup the PASID entry for requests without PASID: */
2457		if (hw_pass_through && domain_type_is_si(domain))
2458			ret = intel_pasid_setup_pass_through(iommu, domain,
2459					dev, PASID_RID2PASID);
2460		else if (domain->use_first_level)
2461			ret = domain_setup_first_level(iommu, domain, dev,
2462					PASID_RID2PASID);
2463		else
2464			ret = intel_pasid_setup_second_level(iommu, domain,
2465					dev, PASID_RID2PASID);
2466		if (ret) {
2467			dev_err(dev, "Setup RID2PASID failed\n");
2468			device_block_translation(dev);
2469			return ret;
2470		}
2471	}
2472
2473	ret = domain_context_mapping(domain, dev);
2474	if (ret) {
2475		dev_err(dev, "Domain context map failed\n");
2476		device_block_translation(dev);
2477		return ret;
2478	}
2479
2480	iommu_enable_pci_caps(info);
2481
2482	return 0;
2483}
2484
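/*
 * Return true if @dev is listed in the device scope of any RMRR unit,
 * either directly or as a device downstream of a listed PCI bridge.
 */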
2485static bool device_has_rmrr(struct device *dev)
2486{
2487	struct dmar_rmrr_unit *rmrr;
2488	struct device *tmp;
2489	int i;
2490
2491	rcu_read_lock();
2492	for_each_rmrr_units(rmrr) {
2493		/*
2494		 * Return TRUE if this RMRR contains the device that
2495		 * is passed in.
2496		 */
2497		for_each_active_dev_scope(rmrr->devices,
2498					  rmrr->devices_cnt, i, tmp)
2499			if (tmp == dev ||
2500			    is_downstream_to_pci_bridge(dev, tmp)) {
2501				rcu_read_unlock();
2502				return true;
2503			}
2504	}
2505	rcu_read_unlock();
2506	return false;
2507}
2508
2509/**
2510 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2511 * is relaxable (i.e. is allowed to be not enforced under some conditions)
2512 * @dev: device handle
2513 *
2514 * We assume that PCI USB devices with RMRRs have them largely
2515 * for historical reasons and that the RMRR space is not actively used post
2516 * boot.  This exclusion may change if vendors begin to abuse it.
2517 *
2518 * The same exception is made for graphics devices, with the requirement that
2519 * any use of the RMRR regions will be torn down before assigning the device
2520 * to a guest.
2521 *
2522 * Return: true if the RMRR is relaxable, false otherwise
2523 */
2524static bool device_rmrr_is_relaxable(struct device *dev)
2525{
2526	struct pci_dev *pdev;
2527
2528	if (!dev_is_pci(dev))
2529		return false;
2530
2531	pdev = to_pci_dev(dev);
2532	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2533		return true;
2534	else
2535		return false;
2536}
2537
2538/*
2539 * There are a couple cases where we need to restrict the functionality of
2540 * devices associated with RMRRs.  The first is when evaluating a device for
2541 * identity mapping because problems exist when devices are moved in and out
2542 * of domains and their respective RMRR information is lost.  This means that
2543 * a device with associated RMRRs will never be in a "passthrough" domain.
2544 * The second is use of the device through the IOMMU API.  This interface
2545 * expects to have full control of the IOVA space for the device.  We cannot
2546 * satisfy both the requirement that RMRR access is maintained and have an
2547 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2548 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2549 * We therefore prevent devices associated with an RMRR from participating in
2550 * the IOMMU API, which eliminates them from device assignment.
2551 *
2552 * In both cases, devices which have relaxable RMRRs are not concerned by this
2553 * restriction. See device_rmrr_is_relaxable comment.
2554 */
2555static bool device_is_rmrr_locked(struct device *dev)
2556{
2557	if (!device_has_rmrr(dev))
2558		return false;
2559
2560	if (device_rmrr_is_relaxable(dev))
2561		return false;
2562
2563	return true;
2564}
2565
2566/*
2567 * Return the required default domain type for a specific device.
2568 *
2569 * @dev: the device in query
2571 *
2572 * Returns:
2573 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2574 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2575 *  - 0: both identity and dynamic domains work for this device
2576 */
2577static int device_def_domain_type(struct device *dev)
2578{
2579	if (dev_is_pci(dev)) {
2580		struct pci_dev *pdev = to_pci_dev(dev);
2581
2582		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2583			return IOMMU_DOMAIN_IDENTITY;
2584
2585		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2586			return IOMMU_DOMAIN_IDENTITY;
2587	}
2588
2589	return 0;
2590}
2591
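/*
 * Select the invalidation interface for @iommu: prefer queued
 * invalidation, falling back to register-based invalidation if it
 * cannot be enabled.
 */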
2592static void intel_iommu_init_qi(struct intel_iommu *iommu)
2593{
2594	/*
2595	 * Start from a sane IOMMU hardware state.
2596	 * If queued invalidation was already initialized by us
2597	 * (for example, while enabling interrupt remapping) then
2598	 * things are already rolling from a sane state.
2599	 */
2600	if (!iommu->qi) {
2601		/*
2602		 * Clear any previous faults.
2603		 */
2604		dmar_fault(-1, iommu);
2605		/*
2606		 * Disable queued invalidation if supported and already enabled
2607		 * before OS handover.
2608		 */
2609		dmar_disable_qi(iommu);
2610	}
2611
2612	if (dmar_enable_qi(iommu)) {
2613		/*
2614		 * Queued invalidation is not enabled; use register-based invalidation.
2615		 */
2616		iommu->flush.flush_context = __iommu_flush_context;
2617		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2618		pr_info("%s: Using Register based invalidation\n",
2619			iommu->name);
2620	} else {
2621		iommu->flush.flush_context = qi_flush_context;
2622		iommu->flush.flush_iotlb = qi_flush_iotlb;
2623		pr_info("%s: Using Queued invalidation\n", iommu->name);
2624	}
2625}
2626
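/*
 * Copy the context table(s) referenced by one root entry of the previous
 * kernel (kdump case) into newly allocated pages, marking the domain IDs
 * found there as in use and the corresponding entries as copied.
 */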
2627static int copy_context_table(struct intel_iommu *iommu,
2628			      struct root_entry *old_re,
2629			      struct context_entry **tbl,
2630			      int bus, bool ext)
2631{
2632	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2633	struct context_entry *new_ce = NULL, ce;
2634	struct context_entry *old_ce = NULL;
2635	struct root_entry re;
2636	phys_addr_t old_ce_phys;
2637
2638	tbl_idx = ext ? bus * 2 : bus;
2639	memcpy(&re, old_re, sizeof(re));
2640
2641	for (devfn = 0; devfn < 256; devfn++) {
2642		/* First calculate the correct index */
2643		idx = (ext ? devfn * 2 : devfn) % 256;
2644
2645		if (idx == 0) {
2646			/* First save what we may have and clean up */
2647			if (new_ce) {
2648				tbl[tbl_idx] = new_ce;
2649				__iommu_flush_cache(iommu, new_ce,
2650						    VTD_PAGE_SIZE);
2651				pos = 1;
2652			}
2653
2654			if (old_ce)
2655				memunmap(old_ce);
2656
2657			ret = 0;
2658			if (devfn < 0x80)
2659				old_ce_phys = root_entry_lctp(&re);
2660			else
2661				old_ce_phys = root_entry_uctp(&re);
2662
2663			if (!old_ce_phys) {
2664				if (ext && devfn == 0) {
2665					/* No LCTP, try UCTP */
2666					devfn = 0x7f;
2667					continue;
2668				} else {
2669					goto out;
2670				}
2671			}
2672
2673			ret = -ENOMEM;
2674			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2675					MEMREMAP_WB);
2676			if (!old_ce)
2677				goto out;
2678
2679			new_ce = alloc_pgtable_page(iommu->node);
2680			if (!new_ce)
2681				goto out_unmap;
2682
2683			ret = 0;
2684		}
2685
2686		/* Now copy the context entry */
2687		memcpy(&ce, old_ce + idx, sizeof(ce));
2688
2689		if (!context_present(&ce))
2690			continue;
2691
2692		did = context_domain_id(&ce);
2693		if (did >= 0 && did < cap_ndoms(iommu->cap))
2694			set_bit(did, iommu->domain_ids);
2695
2696		set_context_copied(iommu, bus, devfn);
2697		new_ce[idx] = ce;
2698	}
2699
2700	tbl[tbl_idx + pos] = new_ce;
2701
2702	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2703
2704out_unmap:
2705	memunmap(old_ce);
2706
2707out:
2708	return ret;
2709}
2710
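/*
 * Copy the root and context tables that the previous kernel left
 * programmed in hardware (kdump case), so that DMA set up by that
 * kernel keeps working while this one takes over.
 */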
2711static int copy_translation_tables(struct intel_iommu *iommu)
2712{
2713	struct context_entry **ctxt_tbls;
2714	struct root_entry *old_rt;
2715	phys_addr_t old_rt_phys;
2716	int ctxt_table_entries;
2717	u64 rtaddr_reg;
2718	int bus, ret;
2719	bool new_ext, ext;
2720
2721	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2722	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2723	new_ext    = !!sm_supported(iommu);
2724
2725	/*
2726	 * The RTT bit can only be changed when translation is disabled,
2727	 * but disabling translation opens a window for data
2728	 * corruption. So bail out and don't copy anything if we would
2729	 * have to change the bit.
2730	 */
2731	if (new_ext != ext)
2732		return -EINVAL;
2733
2734	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2735	if (!iommu->copied_tables)
2736		return -ENOMEM;
2737
2738	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2739	if (!old_rt_phys)
2740		return -EINVAL;
2741
2742	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2743	if (!old_rt)
2744		return -ENOMEM;
2745
2746	/* This is too big for the stack - allocate it from slab */
2747	ctxt_table_entries = ext ? 512 : 256;
2748	ret = -ENOMEM;
2749	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2750	if (!ctxt_tbls)
2751		goto out_unmap;
2752
2753	for (bus = 0; bus < 256; bus++) {
2754		ret = copy_context_table(iommu, &old_rt[bus],
2755					 ctxt_tbls, bus, ext);
2756		if (ret) {
2757			pr_err("%s: Failed to copy context table for bus %d\n",
2758				iommu->name, bus);
2759			continue;
2760		}
2761	}
2762
2763	spin_lock(&iommu->lock);
2764
2765	/* Context tables are copied, now write them to the root_entry table */
2766	for (bus = 0; bus < 256; bus++) {
2767		int idx = ext ? bus * 2 : bus;
2768		u64 val;
2769
2770		if (ctxt_tbls[idx]) {
2771			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2772			iommu->root_entry[bus].lo = val;
2773		}
2774
2775		if (!ext || !ctxt_tbls[idx + 1])
2776			continue;
2777
2778		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2779		iommu->root_entry[bus].hi = val;
2780	}
2781
2782	spin_unlock(&iommu->lock);
2783
2784	kfree(ctxt_tbls);
2785
2786	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2787
2788	ret = 0;
2789
2790out_unmap:
2791	memunmap(old_rt);
2792
2793	return ret;
2794}
2795
2796#ifdef CONFIG_INTEL_IOMMU_SVM
2797static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2798{
2799	struct intel_iommu *iommu = data;
2800	ioasid_t ioasid;
2801
2802	if (!iommu)
2803		return INVALID_IOASID;
2804	/*
2805	 * The VT-d virtual command interface always uses the full 20-bit
2806	 * PASID range. The host can partition the guest PASID range based
2807	 * on policies, but that is out of the guest's control.
2808	 */
2809	if (min < PASID_MIN || max > intel_pasid_max_id)
2810		return INVALID_IOASID;
2811
2812	if (vcmd_alloc_pasid(iommu, &ioasid))
2813		return INVALID_IOASID;
2814
2815	return ioasid;
2816}
2817
2818static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2819{
2820	struct intel_iommu *iommu = data;
2821
2822	if (!iommu)
2823		return;
2824	/*
2825	 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
2826	 * We can only free the PASID when all the devices are unbound.
2827	 */
2828	if (ioasid_find(NULL, ioasid, NULL)) {
2829		pr_alert("Cannot free active IOASID %d\n", ioasid);
2830		return;
2831	}
2832	vcmd_free_pasid(iommu, ioasid);
2833}
2834
2835static void register_pasid_allocator(struct intel_iommu *iommu)
2836{
2837	/*
2838	 * If we are running in the host, there is no need for a custom
2839	 * allocator because PASIDs are allocated host system-wide.
2840	 */
2841	if (!cap_caching_mode(iommu->cap))
2842		return;
2843
2844	if (!sm_supported(iommu)) {
2845		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2846		return;
2847	}
2848
2849	/*
2850	 * Register a custom PASID allocator if we are running in a guest,
2851	 * where guest PASIDs must be obtained via the virtual command interface.
2852	 * There can be multiple vIOMMUs in each guest but only one allocator
2853	 * is active. All vIOMMU allocators will eventually call the same
2854	 * host allocator.
2855	 */
2856	if (!vccap_pasid(iommu->vccap))
2857		return;
2858
2859	pr_info("Register custom PASID allocator\n");
2860	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2861	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2862	iommu->pasid_allocator.pdata = (void *)iommu;
2863	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2864		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2865		/*
2866		 * Disable scalable mode on this IOMMU if there
2867		 * is no custom allocator. Mixing SM-capable vIOMMUs
2868		 * and non-SM vIOMMUs is not supported.
2869		 */
2870		intel_iommu_sm = 0;
2871	}
2872}
2873#endif
2874
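/*
 * Boot-time initialization of all DMAR units: set up the invalidation
 * interface, domain IDs and root entries, optionally copy translation
 * tables from the previous kernel, create the si_domain and enable
 * fault handling and (where supported) the page request queue.
 */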
2875static int __init init_dmars(void)
2876{
2877	struct dmar_drhd_unit *drhd;
2878	struct intel_iommu *iommu;
2879	int ret;
2880
2881	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2882	if (ret)
2883		goto free_iommu;
2884
2885	for_each_iommu(iommu, drhd) {
2886		if (drhd->ignored) {
2887			iommu_disable_translation(iommu);
2888			continue;
2889		}
2890
2891		/*
2892		 * Find the max PASID size of all IOMMUs in the system.
2893		 * We need to ensure the system PASID table is no bigger
2894		 * than the smallest supported size.
2895		 */
2896		if (pasid_supported(iommu)) {
2897			u32 temp = 2 << ecap_pss(iommu->ecap);
2898
2899			intel_pasid_max_id = min_t(u32, temp,
2900						   intel_pasid_max_id);
2901		}
2902
2903		intel_iommu_init_qi(iommu);
2904
2905		ret = iommu_init_domains(iommu);
2906		if (ret)
2907			goto free_iommu;
2908
2909		init_translation_status(iommu);
2910
2911		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2912			iommu_disable_translation(iommu);
2913			clear_translation_pre_enabled(iommu);
2914			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2915				iommu->name);
2916		}
2917
2918		/*
2919		 * TBD:
2920		 * we could share the same root & context tables
2921		 * among all IOMMUs. Needs to be split out later.
2922		 */
2923		ret = iommu_alloc_root_entry(iommu);
2924		if (ret)
2925			goto free_iommu;
2926
2927		if (translation_pre_enabled(iommu)) {
2928			pr_info("Translation already enabled - trying to copy translation structures\n");
2929
2930			ret = copy_translation_tables(iommu);
2931			if (ret) {
2932				/*
2933				 * We found the IOMMU with translation
2934				 * enabled - but failed to copy over the
2935				 * old root-entry table. Try to proceed
2936				 * by disabling translation now and
2937				 * allocating a clean root-entry table.
2938				 * This might cause DMAR faults, but
2939				 * probably the dump will still succeed.
2940				 */
2941				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2942				       iommu->name);
2943				iommu_disable_translation(iommu);
2944				clear_translation_pre_enabled(iommu);
2945			} else {
2946				pr_info("Copied translation tables from previous kernel for %s\n",
2947					iommu->name);
2948			}
2949		}
2950
2951		if (!ecap_pass_through(iommu->ecap))
2952			hw_pass_through = 0;
2953		intel_svm_check(iommu);
2954	}
2955
2956	/*
2957	 * Now that qi is enabled on all iommus, set the root entry and flush
2958	 * caches. This is required on some Intel X58 chipsets, otherwise the
2959	 * flush_context function will loop forever and the boot hangs.
2960	 */
2961	for_each_active_iommu(iommu, drhd) {
2962		iommu_flush_write_buffer(iommu);
2963#ifdef CONFIG_INTEL_IOMMU_SVM
2964		register_pasid_allocator(iommu);
2965#endif
2966		iommu_set_root_entry(iommu);
2967	}
2968
2969#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2970	dmar_map_gfx = 0;
2971#endif
2972
2973	if (!dmar_map_gfx)
2974		iommu_identity_mapping |= IDENTMAP_GFX;
2975
2976	check_tylersburg_isoch();
2977
2978	ret = si_domain_init(hw_pass_through);
2979	if (ret)
2980		goto free_iommu;
2981
2982	/*
2983	 * for each drhd
2984	 *   enable fault log
2985	 *   global invalidate context cache
2986	 *   global invalidate iotlb
2987	 *   enable translation
2988	 */
2989	for_each_iommu(iommu, drhd) {
2990		if (drhd->ignored) {
2991			/*
2992			 * we always have to disable PMRs or DMA may fail on
2993			 * this device
2994			 */
2995			if (force_on)
2996				iommu_disable_protect_mem_regions(iommu);
2997			continue;
2998		}
2999
3000		iommu_flush_write_buffer(iommu);
3001
3002#ifdef CONFIG_INTEL_IOMMU_SVM
3003		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3004			/*
3005			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3006			 * could cause a lock race condition.
3007			 */
3008			up_write(&dmar_global_lock);
3009			ret = intel_svm_enable_prq(iommu);
3010			down_write(&dmar_global_lock);
3011			if (ret)
3012				goto free_iommu;
3013		}
3014#endif
3015		ret = dmar_set_interrupt(iommu);
3016		if (ret)
3017			goto free_iommu;
3018	}
3019
3020	return 0;
3021
3022free_iommu:
3023	for_each_active_iommu(iommu, drhd) {
3024		disable_dmar_iommu(iommu);
3025		free_dmar_iommu(iommu);
3026	}
3027	if (si_domain) {
3028		domain_exit(si_domain);
3029		si_domain = NULL;
3030	}
3031
3032	return ret;
3033}
3034
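/*
 * Mark DRHD units with an empty device scope as ignored, and flag units
 * that cover only graphics devices as gfx-dedicated (ignoring them too
 * when dmar_map_gfx is disabled).
 */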
3035static void __init init_no_remapping_devices(void)
3036{
3037	struct dmar_drhd_unit *drhd;
3038	struct device *dev;
3039	int i;
3040
3041	for_each_drhd_unit(drhd) {
3042		if (!drhd->include_all) {
3043			for_each_active_dev_scope(drhd->devices,
3044						  drhd->devices_cnt, i, dev)
3045				break;
3046			/* ignore DMAR unit if no devices exist */
3047			if (i == drhd->devices_cnt)
3048				drhd->ignored = 1;
3049		}
3050	}
3051
3052	for_each_active_drhd_unit(drhd) {
3053		if (drhd->include_all)
3054			continue;
3055
3056		for_each_active_dev_scope(drhd->devices,
3057					  drhd->devices_cnt, i, dev)
3058			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3059				break;
3060		if (i < drhd->devices_cnt)
3061			continue;
3062
3063		/* This IOMMU has *only* gfx devices. Either bypass it or
3064		   set the gfx_mapped flag, as appropriate */
3065		drhd->gfx_dedicated = 1;
3066		if (!dmar_map_gfx)
3067			drhd->ignored = 1;
3068	}
3069}
3070
3071#ifdef CONFIG_SUSPEND
3072static int init_iommu_hw(void)
3073{
3074	struct dmar_drhd_unit *drhd;
3075	struct intel_iommu *iommu = NULL;
3076
3077	for_each_active_iommu(iommu, drhd)
3078		if (iommu->qi)
3079			dmar_reenable_qi(iommu);
3080
3081	for_each_iommu(iommu, drhd) {
3082		if (drhd->ignored) {
3083			/*
3084			 * we always have to disable PMRs or DMA may fail on
3085			 * this device
3086			 */
3087			if (force_on)
3088				iommu_disable_protect_mem_regions(iommu);
3089			continue;
3090		}
3091
3092		iommu_flush_write_buffer(iommu);
3093		iommu_set_root_entry(iommu);
3094		iommu_enable_translation(iommu);
3095		iommu_disable_protect_mem_regions(iommu);
3096	}
3097
3098	return 0;
3099}
3100
3101static void iommu_flush_all(void)
3102{
3103	struct dmar_drhd_unit *drhd;
3104	struct intel_iommu *iommu;
3105
3106	for_each_active_iommu(iommu, drhd) {
3107		iommu->flush.flush_context(iommu, 0, 0, 0,
3108					   DMA_CCMD_GLOBAL_INVL);
3109		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3110					 DMA_TLB_GLOBAL_FLUSH);
3111	}
3112}
3113
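/*
 * Flush all caches, disable translation and save the fault-event
 * registers of each IOMMU before the system suspends.
 */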
3114static int iommu_suspend(void)
3115{
3116	struct dmar_drhd_unit *drhd;
3117	struct intel_iommu *iommu = NULL;
3118	unsigned long flag;
3119
3120	for_each_active_iommu(iommu, drhd) {
3121		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3122					     GFP_KERNEL);
3123		if (!iommu->iommu_state)
3124			goto nomem;
3125	}
3126
3127	iommu_flush_all();
3128
3129	for_each_active_iommu(iommu, drhd) {
3130		iommu_disable_translation(iommu);
3131
3132		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3133
3134		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3135			readl(iommu->reg + DMAR_FECTL_REG);
3136		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3137			readl(iommu->reg + DMAR_FEDATA_REG);
3138		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3139			readl(iommu->reg + DMAR_FEADDR_REG);
3140		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3141			readl(iommu->reg + DMAR_FEUADDR_REG);
3142
3143		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3144	}
3145	return 0;
3146
3147nomem:
3148	for_each_active_iommu(iommu, drhd)
3149		kfree(iommu->iommu_state);
3150
3151	return -ENOMEM;
3152}
3153
3154static void iommu_resume(void)
3155{
3156	struct dmar_drhd_unit *drhd;
3157	struct intel_iommu *iommu = NULL;
3158	unsigned long flag;
3159
3160	if (init_iommu_hw()) {
3161		if (force_on)
3162			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3163		else
3164			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3165		return;
3166	}
3167
3168	for_each_active_iommu(iommu, drhd) {
3169
3170		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3171
3172		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3173			iommu->reg + DMAR_FECTL_REG);
3174		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3175			iommu->reg + DMAR_FEDATA_REG);
3176		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3177			iommu->reg + DMAR_FEADDR_REG);
3178		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3179			iommu->reg + DMAR_FEUADDR_REG);
3180
3181		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3182	}
3183
3184	for_each_active_iommu(iommu, drhd)
3185		kfree(iommu->iommu_state);
3186}
3187
3188static struct syscore_ops iommu_syscore_ops = {
3189	.resume		= iommu_resume,
3190	.suspend	= iommu_suspend,
3191};
3192
3193static void __init init_iommu_pm_ops(void)
3194{
3195	register_syscore_ops(&iommu_syscore_ops);
3196}
3197
3198#else
3199static inline void init_iommu_pm_ops(void) {}
3200	#endif	/* CONFIG_SUSPEND */
3201
3202static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3203{
3204	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3205	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3206	    rmrr->end_address <= rmrr->base_address ||
3207	    arch_rmrr_sanity_check(rmrr))
3208		return -EINVAL;
3209
3210	return 0;
3211}
3212
3213int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3214{
3215	struct acpi_dmar_reserved_memory *rmrr;
3216	struct dmar_rmrr_unit *rmrru;
3217
3218	rmrr = (struct acpi_dmar_reserved_memory *)header;
3219	if (rmrr_sanity_check(rmrr)) {
3220		pr_warn(FW_BUG
3221			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3222			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3223			   rmrr->base_address, rmrr->end_address,
3224			   dmi_get_system_info(DMI_BIOS_VENDOR),
3225			   dmi_get_system_info(DMI_BIOS_VERSION),
3226			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3227		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3228	}
3229
3230	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3231	if (!rmrru)
3232		goto out;
3233
3234	rmrru->hdr = header;
3235
3236	rmrru->base_address = rmrr->base_address;
3237	rmrru->end_address = rmrr->end_address;
3238
3239	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3240				((void *)rmrr) + rmrr->header.length,
3241				&rmrru->devices_cnt);
3242	if (rmrru->devices_cnt && rmrru->devices == NULL)
3243		goto free_rmrru;
3244
3245	list_add(&rmrru->list, &dmar_rmrr_units);
3246
3247	return 0;
3248free_rmrru:
3249	kfree(rmrru);
3250out:
3251	return -ENOMEM;
3252}
3253
3254static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3255{
3256	struct dmar_atsr_unit *atsru;
3257	struct acpi_dmar_atsr *tmp;
3258
3259	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3260				dmar_rcu_check()) {
3261		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3262		if (atsr->segment != tmp->segment)
3263			continue;
3264		if (atsr->header.length != tmp->header.length)
3265			continue;
3266		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3267			return atsru;
3268	}
3269
3270	return NULL;
3271}
3272
3273int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3274{
3275	struct acpi_dmar_atsr *atsr;
3276	struct dmar_atsr_unit *atsru;
3277
3278	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3279		return 0;
3280
3281	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3282	atsru = dmar_find_atsr(atsr);
3283	if (atsru)
3284		return 0;
3285
3286	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3287	if (!atsru)
3288		return -ENOMEM;
3289
3290	/*
3291	 * If memory is allocated from slab by ACPI _DSM method, we need to
3292	 * copy the memory content because the memory buffer will be freed
3293	 * on return.
3294	 */
3295	atsru->hdr = (void *)(atsru + 1);
3296	memcpy(atsru->hdr, hdr, hdr->length);
3297	atsru->include_all = atsr->flags & 0x1;
3298	if (!atsru->include_all) {
3299		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3300				(void *)atsr + atsr->header.length,
3301				&atsru->devices_cnt);
3302		if (atsru->devices_cnt && atsru->devices == NULL) {
3303			kfree(atsru);
3304			return -ENOMEM;
3305		}
3306	}
3307
3308	list_add_rcu(&atsru->list, &dmar_atsr_units);
3309
3310	return 0;
3311}
3312
3313static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3314{
3315	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3316	kfree(atsru);
3317}
3318
3319int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3320{
3321	struct acpi_dmar_atsr *atsr;
3322	struct dmar_atsr_unit *atsru;
3323
3324	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3325	atsru = dmar_find_atsr(atsr);
3326	if (atsru) {
3327		list_del_rcu(&atsru->list);
3328		synchronize_rcu();
3329		intel_iommu_free_atsr(atsru);
3330	}
3331
3332	return 0;
3333}
3334
3335int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3336{
3337	int i;
3338	struct device *dev;
3339	struct acpi_dmar_atsr *atsr;
3340	struct dmar_atsr_unit *atsru;
3341
3342	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3343	atsru = dmar_find_atsr(atsr);
3344	if (!atsru)
3345		return 0;
3346
3347	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3348		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3349					  i, dev)
3350			return -EBUSY;
3351	}
3352
3353	return 0;
3354}
3355
3356static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3357{
3358	struct dmar_satc_unit *satcu;
3359	struct acpi_dmar_satc *tmp;
3360
3361	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3362				dmar_rcu_check()) {
3363		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3364		if (satc->segment != tmp->segment)
3365			continue;
3366		if (satc->header.length != tmp->header.length)
3367			continue;
3368		if (memcmp(satc, tmp, satc->header.length) == 0)
3369			return satcu;
3370	}
3371
3372	return NULL;
3373}
3374
3375int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3376{
3377	struct acpi_dmar_satc *satc;
3378	struct dmar_satc_unit *satcu;
3379
3380	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3381		return 0;
3382
3383	satc = container_of(hdr, struct acpi_dmar_satc, header);
3384	satcu = dmar_find_satc(satc);
3385	if (satcu)
3386		return 0;
3387
3388	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3389	if (!satcu)
3390		return -ENOMEM;
3391
3392	satcu->hdr = (void *)(satcu + 1);
3393	memcpy(satcu->hdr, hdr, hdr->length);
3394	satcu->atc_required = satc->flags & 0x1;
3395	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3396					      (void *)satc + satc->header.length,
3397					      &satcu->devices_cnt);
3398	if (satcu->devices_cnt && !satcu->devices) {
3399		kfree(satcu);
3400		return -ENOMEM;
3401	}
3402	list_add_rcu(&satcu->list, &dmar_satc_units);
3403
3404	return 0;
3405}
3406
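/*
 * Bring up a hot-added DMAR unit: audit its capabilities, allocate
 * domain IDs and a root entry, then enable invalidation, fault
 * reporting and translation unless the unit is ignored.
 */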
3407static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3408{
3409	int sp, ret;
3410	struct intel_iommu *iommu = dmaru->iommu;
3411
3412	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3413	if (ret)
3414		goto out;
3415
3416	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3417		pr_warn("%s: Doesn't support hardware pass through.\n",
3418			iommu->name);
3419		return -ENXIO;
3420	}
3421
3422	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3423	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3424		pr_warn("%s: Doesn't support large page.\n",
3425			iommu->name);
3426		return -ENXIO;
3427	}
3428
3429	/*
3430	 * Disable translation if already enabled prior to OS handover.
3431	 */
3432	if (iommu->gcmd & DMA_GCMD_TE)
3433		iommu_disable_translation(iommu);
3434
3435	ret = iommu_init_domains(iommu);
3436	if (ret == 0)
3437		ret = iommu_alloc_root_entry(iommu);
3438	if (ret)
3439		goto out;
3440
3441	intel_svm_check(iommu);
3442
3443	if (dmaru->ignored) {
3444		/*
3445		 * we always have to disable PMRs or DMA may fail on this device
3446		 */
3447		if (force_on)
3448			iommu_disable_protect_mem_regions(iommu);
3449		return 0;
3450	}
3451
3452	intel_iommu_init_qi(iommu);
3453	iommu_flush_write_buffer(iommu);
3454
3455#ifdef CONFIG_INTEL_IOMMU_SVM
3456	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457		ret = intel_svm_enable_prq(iommu);
3458		if (ret)
3459			goto disable_iommu;
3460	}
3461#endif
3462	ret = dmar_set_interrupt(iommu);
3463	if (ret)
3464		goto disable_iommu;
3465
3466	iommu_set_root_entry(iommu);
3467	iommu_enable_translation(iommu);
3468
3469	iommu_disable_protect_mem_regions(iommu);
3470	return 0;
3471
3472disable_iommu:
3473	disable_dmar_iommu(iommu);
3474out:
3475	free_dmar_iommu(iommu);
3476	return ret;
3477}
3478
3479int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3480{
3481	int ret = 0;
3482	struct intel_iommu *iommu = dmaru->iommu;
3483
3484	if (!intel_iommu_enabled)
3485		return 0;
3486	if (iommu == NULL)
3487		return -EINVAL;
3488
3489	if (insert) {
3490		ret = intel_iommu_add(dmaru);
3491	} else {
3492		disable_dmar_iommu(iommu);
3493		free_dmar_iommu(iommu);
3494	}
3495
3496	return ret;
3497}
3498
3499static void intel_iommu_free_dmars(void)
3500{
3501	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3502	struct dmar_atsr_unit *atsru, *atsr_n;
3503	struct dmar_satc_unit *satcu, *satc_n;
3504
3505	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3506		list_del(&rmrru->list);
3507		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3508		kfree(rmrru);
3509	}
3510
3511	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3512		list_del(&atsru->list);
3513		intel_iommu_free_atsr(atsru);
3514	}
3515	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3516		list_del(&satcu->list);
3517		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3518		kfree(satcu);
3519	}
3520}
3521
3522static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3523{
3524	struct dmar_satc_unit *satcu;
3525	struct acpi_dmar_satc *satc;
3526	struct device *tmp;
3527	int i;
3528
3529	dev = pci_physfn(dev);
3530	rcu_read_lock();
3531
3532	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3533		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3534		if (satc->segment != pci_domain_nr(dev->bus))
3535			continue;
3536		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3537			if (to_pci_dev(tmp) == dev)
3538				goto out;
3539	}
3540	satcu = NULL;
3541out:
3542	rcu_read_unlock();
3543	return satcu;
3544}
3545
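/*
 * Decide whether ATS may be enabled for @dev: use the SATC table if the
 * device is listed there, otherwise check whether the root port above
 * the device is covered by an ATSR unit.
 */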
3546static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3547{
3548	int i, ret = 1;
3549	struct pci_bus *bus;
3550	struct pci_dev *bridge = NULL;
3551	struct device *tmp;
3552	struct acpi_dmar_atsr *atsr;
3553	struct dmar_atsr_unit *atsru;
3554	struct dmar_satc_unit *satcu;
3555
3556	dev = pci_physfn(dev);
3557	satcu = dmar_find_matched_satc_unit(dev);
3558	if (satcu)
3559		/*
3560		 * This device supports ATS as it is in the SATC table.
3561		 * When the IOMMU is in legacy mode, the HW automatically
3562		 * enables ATS for devices that require it, hence the OS
3563		 * should not enable ATS for this device, to avoid
3564		 * duplicated TLB invalidation.
3565		 */
3566		return !(satcu->atc_required && !sm_supported(iommu));
3567
3568	for (bus = dev->bus; bus; bus = bus->parent) {
3569		bridge = bus->self;
3570		/* If it's an integrated device, allow ATS */
3571		if (!bridge)
3572			return 1;
3573		/* Connected via non-PCIe: no ATS */
3574		if (!pci_is_pcie(bridge) ||
3575		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3576			return 0;
3577		/* If we found the root port, look it up in the ATSR */
3578		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3579			break;
3580	}
3581
3582	rcu_read_lock();
3583	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3584		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3585		if (atsr->segment != pci_domain_nr(dev->bus))
3586			continue;
3587
3588		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3589			if (tmp == &bridge->dev)
3590				goto out;
3591
3592		if (atsru->include_all)
3593			goto out;
3594	}
3595	ret = 0;
3596out:
3597	rcu_read_unlock();
3598
3599	return ret;
3600}
3601
3602int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3603{
3604	int ret;
3605	struct dmar_rmrr_unit *rmrru;
3606	struct dmar_atsr_unit *atsru;
3607	struct dmar_satc_unit *satcu;
3608	struct acpi_dmar_atsr *atsr;
3609	struct acpi_dmar_reserved_memory *rmrr;
3610	struct acpi_dmar_satc *satc;
3611
3612	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3613		return 0;
3614
3615	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3616		rmrr = container_of(rmrru->hdr,
3617				    struct acpi_dmar_reserved_memory, header);
3618		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3619			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3620				((void *)rmrr) + rmrr->header.length,
3621				rmrr->segment, rmrru->devices,
3622				rmrru->devices_cnt);
3623			if (ret < 0)
3624				return ret;
3625		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3626			dmar_remove_dev_scope(info, rmrr->segment,
3627				rmrru->devices, rmrru->devices_cnt);
3628		}
3629	}
3630
3631	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3632		if (atsru->include_all)
3633			continue;
3634
3635		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3636		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3637			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3638					(void *)atsr + atsr->header.length,
3639					atsr->segment, atsru->devices,
3640					atsru->devices_cnt);
3641			if (ret > 0)
3642				break;
3643			else if (ret < 0)
3644				return ret;
3645		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3646			if (dmar_remove_dev_scope(info, atsr->segment,
3647					atsru->devices, atsru->devices_cnt))
3648				break;
3649		}
3650	}
3651	list_for_each_entry(satcu, &dmar_satc_units, list) {
3652		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3653		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3654			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3655					(void *)satc + satc->header.length,
3656					satc->segment, satcu->devices,
3657					satcu->devices_cnt);
3658			if (ret > 0)
3659				break;
3660			else if (ret < 0)
3661				return ret;
3662		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3663			if (dmar_remove_dev_scope(info, satc->segment,
3664					satcu->devices, satcu->devices_cnt))
3665				break;
3666		}
3667	}
3668
3669	return 0;
3670}
3671
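/*
 * Keep the si_domain identity map in sync with memory hotplug: map
 * ranges that are going online and unmap (and flush) ranges that go
 * offline or fail to come online.
 */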
3672static int intel_iommu_memory_notifier(struct notifier_block *nb,
3673				       unsigned long val, void *v)
3674{
3675	struct memory_notify *mhp = v;
3676	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3677	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3678			mhp->nr_pages - 1);
3679
3680	switch (val) {
3681	case MEM_GOING_ONLINE:
3682		if (iommu_domain_identity_map(si_domain,
3683					      start_vpfn, last_vpfn)) {
3684			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3685				start_vpfn, last_vpfn);
3686			return NOTIFY_BAD;
3687		}
3688		break;
3689
3690	case MEM_OFFLINE:
3691	case MEM_CANCEL_ONLINE:
3692		{
3693			struct dmar_drhd_unit *drhd;
3694			struct intel_iommu *iommu;
3695			LIST_HEAD(freelist);
3696
3697			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3698
3699			rcu_read_lock();
3700			for_each_active_iommu(iommu, drhd)
3701				iommu_flush_iotlb_psi(iommu, si_domain,
3702					start_vpfn, mhp->nr_pages,
3703					list_empty(&freelist), 0);
3704			rcu_read_unlock();
3705			put_pages_list(&freelist);
3706		}
3707		break;
3708	}
3709
3710	return NOTIFY_OK;
3711}
3712
3713static struct notifier_block intel_iommu_memory_nb = {
3714	.notifier_call = intel_iommu_memory_notifier,
3715	.priority = 0
3716};
3717
3718static void intel_disable_iommus(void)
3719{
3720	struct intel_iommu *iommu = NULL;
3721	struct dmar_drhd_unit *drhd;
3722
3723	for_each_iommu(iommu, drhd)
3724		iommu_disable_translation(iommu);
3725}
3726
3727void intel_iommu_shutdown(void)
3728{
3729	struct dmar_drhd_unit *drhd;
3730	struct intel_iommu *iommu = NULL;
3731
3732	if (no_iommu || dmar_disabled)
3733		return;
3734
3735	down_write(&dmar_global_lock);
3736
3737	/* Disable PMRs explicitly here. */
3738	for_each_iommu(iommu, drhd)
3739		iommu_disable_protect_mem_regions(iommu);
3740
3741	/* Make sure the IOMMUs are switched off */
3742	intel_disable_iommus();
3743
3744	up_write(&dmar_global_lock);
3745}
3746
3747static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3748{
3749	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3750
3751	return container_of(iommu_dev, struct intel_iommu, iommu);
3752}
3753
3754static ssize_t version_show(struct device *dev,
3755			    struct device_attribute *attr, char *buf)
3756{
3757	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3758	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3759	return sprintf(buf, "%d:%d\n",
3760		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3761}
3762static DEVICE_ATTR_RO(version);
3763
3764static ssize_t address_show(struct device *dev,
3765			    struct device_attribute *attr, char *buf)
3766{
3767	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3768	return sprintf(buf, "%llx\n", iommu->reg_phys);
3769}
3770static DEVICE_ATTR_RO(address);
3771
3772static ssize_t cap_show(struct device *dev,
3773			struct device_attribute *attr, char *buf)
3774{
3775	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3776	return sprintf(buf, "%llx\n", iommu->cap);
3777}
3778static DEVICE_ATTR_RO(cap);
3779
3780static ssize_t ecap_show(struct device *dev,
3781			 struct device_attribute *attr, char *buf)
3782{
3783	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3784	return sprintf(buf, "%llx\n", iommu->ecap);
3785}
3786static DEVICE_ATTR_RO(ecap);
3787
3788static ssize_t domains_supported_show(struct device *dev,
3789				      struct device_attribute *attr, char *buf)
3790{
3791	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3793}
3794static DEVICE_ATTR_RO(domains_supported);
3795
3796static ssize_t domains_used_show(struct device *dev,
3797				 struct device_attribute *attr, char *buf)
3798{
3799	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3801						  cap_ndoms(iommu->cap)));
3802}
3803static DEVICE_ATTR_RO(domains_used);
3804
3805static struct attribute *intel_iommu_attrs[] = {
3806	&dev_attr_version.attr,
3807	&dev_attr_address.attr,
3808	&dev_attr_cap.attr,
3809	&dev_attr_ecap.attr,
3810	&dev_attr_domains_supported.attr,
3811	&dev_attr_domains_used.attr,
3812	NULL,
3813};
3814
3815static struct attribute_group intel_iommu_group = {
3816	.name = "intel-iommu",
3817	.attrs = intel_iommu_attrs,
3818};
3819
3820const struct attribute_group *intel_iommu_groups[] = {
3821	&intel_iommu_group,
3822	NULL,
3823};
3824
3825static inline bool has_external_pci(void)
3826{
3827	struct pci_dev *pdev = NULL;
3828
3829	for_each_pci_dev(pdev)
3830		if (pdev->external_facing) {
3831			pci_dev_put(pdev);
3832			return true;
3833		}
3834
3835	return false;
3836}
3837
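/*
 * If the platform opts in to DMA protection and an external-facing PCI
 * device is present, force the IOMMU on even if it was disabled on the
 * command line, switching the default domain type to passthrough when
 * it was disabled.
 */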
3838static int __init platform_optin_force_iommu(void)
3839{
3840	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3841		return 0;
3842
3843	if (no_iommu || dmar_disabled)
3844		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3845
3846	/*
3847	 * If Intel-IOMMU is disabled by default, we will apply identity
3848	 * map for all devices except those marked as being untrusted.
3849	 */
3850	if (dmar_disabled)
3851		iommu_set_default_passthrough(false);
3852
3853	dmar_disabled = 0;
3854	no_iommu = 0;
3855
3856	return 1;
3857}
3858
3859static int __init probe_acpi_namespace_devices(void)
3860{
3861	struct dmar_drhd_unit *drhd;
3862	/* To avoid a -Wunused-but-set-variable warning. */
3863	struct intel_iommu *iommu __maybe_unused;
3864	struct device *dev;
3865	int i, ret = 0;
3866
3867	for_each_active_iommu(iommu, drhd) {
3868		for_each_active_dev_scope(drhd->devices,
3869					  drhd->devices_cnt, i, dev) {
3870			struct acpi_device_physical_node *pn;
3871			struct iommu_group *group;
3872			struct acpi_device *adev;
3873
3874			if (dev->bus != &acpi_bus_type)
3875				continue;
3876
3877			adev = to_acpi_device(dev);
3878			mutex_lock(&adev->physical_node_lock);
3879			list_for_each_entry(pn,
3880					    &adev->physical_node_list, node) {
3881				group = iommu_group_get(pn->dev);
3882				if (group) {
3883					iommu_group_put(group);
3884					continue;
3885				}
3886
3887				ret = iommu_probe_device(pn->dev);
3888				if (ret)
3889					break;
3890			}
3891			mutex_unlock(&adev->physical_node_lock);
3892
3893			if (ret)
3894				return ret;
3895		}
3896	}
3897
3898	return 0;
3899}
3900
3901static __init int tboot_force_iommu(void)
3902{
3903	if (!tboot_enabled())
3904		return 0;
3905
3906	if (no_iommu || dmar_disabled)
3907		pr_warn("Forcing Intel-IOMMU to enabled\n");
3908
3909	dmar_disabled = 0;
3910	no_iommu = 0;
3911
3912	return 1;
3913}
3914
3915int __init intel_iommu_init(void)
3916{
3917	int ret = -ENODEV;
3918	struct dmar_drhd_unit *drhd;
3919	struct intel_iommu *iommu;
3920
3921	/*
3922	 * Intel IOMMU is required for a TXT/tboot launch or platform
3923	 * opt in, so enforce that.
3924	 */
3925	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3926		    platform_optin_force_iommu();
3927
3928	down_write(&dmar_global_lock);
3929	if (dmar_table_init()) {
3930		if (force_on)
3931			panic("tboot: Failed to initialize DMAR table\n");
3932		goto out_free_dmar;
3933	}
3934
3935	if (dmar_dev_scope_init() < 0) {
3936		if (force_on)
3937			panic("tboot: Failed to initialize DMAR device scope\n");
3938		goto out_free_dmar;
3939	}
3940
3941	up_write(&dmar_global_lock);
3942
3943	/*
3944	 * The bus notifier takes the dmar_global_lock, so lockdep will
3945	 * complain later when we register it under the lock.
3946	 */
3947	dmar_register_bus_notifier();
3948
3949	down_write(&dmar_global_lock);
3950
3951	if (!no_iommu)
3952		intel_iommu_debugfs_init();
3953
3954	if (no_iommu || dmar_disabled) {
3955		/*
3956		 * We exit the function here to ensure the IOMMU's remapping and
3957		 * mempool aren't set up, which means that the IOMMU's PMRs
3958		 * won't be disabled via the call to init_dmars(). So disable
3959		 * them explicitly here. The PMRs were set up by tboot prior to
3960		 * calling SENTER, but the kernel is expected to reset/tear
3961		 * down the PMRs.
3962		 */
3963		if (intel_iommu_tboot_noforce) {
3964			for_each_iommu(iommu, drhd)
3965				iommu_disable_protect_mem_regions(iommu);
3966		}
3967
3968		/*
3969		 * Make sure the IOMMUs are switched off, even when we
3970		 * boot into a kexec kernel and the previous kernel left
3971		 * them enabled
3972		 */
3973		intel_disable_iommus();
3974		goto out_free_dmar;
3975	}
3976
3977	if (list_empty(&dmar_rmrr_units))
3978		pr_info("No RMRR found\n");
3979
3980	if (list_empty(&dmar_atsr_units))
3981		pr_info("No ATSR found\n");
3982
3983	if (list_empty(&dmar_satc_units))
3984		pr_info("No SATC found\n");
3985
3986	init_no_remapping_devices();
3987
3988	ret = init_dmars();
3989	if (ret) {
3990		if (force_on)
3991			panic("tboot: Failed to initialize DMARs\n");
3992		pr_err("Initialization failed\n");
3993		goto out_free_dmar;
3994	}
3995	up_write(&dmar_global_lock);
3996
3997	init_iommu_pm_ops();
3998
3999	down_read(&dmar_global_lock);
4000	for_each_active_iommu(iommu, drhd) {
4001		/*
4002		 * The flush queue implementation does not perform
4003		 * page-selective invalidations that are required for efficient
4004		 * TLB flushes in virtual environments.  The benefit of batching
4005		 * is likely to be much lower than the overhead of synchronizing
4006		 * the virtual and physical IOMMU page-tables.
4007		 */
4008		if (cap_caching_mode(iommu->cap)) {
4009			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4010			iommu_set_dma_strict();
4011		}
4012		iommu_device_sysfs_add(&iommu->iommu, NULL,
4013				       intel_iommu_groups,
4014				       "%s", iommu->name);
4015		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4016	}
4017	up_read(&dmar_global_lock);
4018
4019	if (si_domain && !hw_pass_through)
4020		register_memory_notifier(&intel_iommu_memory_nb);
4021
4022	down_read(&dmar_global_lock);
4023	if (probe_acpi_namespace_devices())
4024		pr_warn("ACPI name space devices didn't probe correctly\n");
4025
4026	/* Finally, we enable the DMA remapping hardware. */
4027	for_each_iommu(iommu, drhd) {
4028		if (!drhd->ignored && !translation_pre_enabled(iommu))
4029			iommu_enable_translation(iommu);
4030
4031		iommu_disable_protect_mem_regions(iommu);
4032	}
4033	up_read(&dmar_global_lock);
4034
4035	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4036
4037	intel_iommu_enabled = 1;
4038
4039	return 0;
4040
4041out_free_dmar:
4042	intel_iommu_free_dmars();
4043	up_write(&dmar_global_lock);
4044	return ret;
4045}
4046
4047static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4048{
4049	struct device_domain_info *info = opaque;
4050
4051	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4052	return 0;
4053}
4054
4055/*
4056 * NB - intel-iommu lacks any sort of reference counting for the users of
4057 * dependent devices.  If multiple endpoints have intersecting dependent
4058 * devices, unbinding the driver from any one of them will possibly leave
4059 * the others unable to operate.
4060 */
4061static void domain_context_clear(struct device_domain_info *info)
4062{
4063	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4064		return;
4065
4066	pci_for_each_dma_alias(to_pci_dev(info->dev),
4067			       &domain_context_clear_one_cb, info);
4068}
4069
4070static void dmar_remove_one_dev_info(struct device *dev)
4071{
4072	struct device_domain_info *info = dev_iommu_priv_get(dev);
4073	struct dmar_domain *domain = info->domain;
4074	struct intel_iommu *iommu = info->iommu;
4075	unsigned long flags;
4076
4077	if (!dev_is_real_dma_subdevice(info->dev)) {
4078		if (dev_is_pci(info->dev) && sm_supported(iommu))
4079			intel_pasid_tear_down_entry(iommu, info->dev,
4080					PASID_RID2PASID, false);
4081
4082		iommu_disable_pci_caps(info);
4083		domain_context_clear(info);
4084	}
4085
4086	spin_lock_irqsave(&domain->lock, flags);
4087	list_del(&info->link);
4088	spin_unlock_irqrestore(&domain->lock, flags);
4089
4090	domain_detach_iommu(domain, iommu);
4091	info->domain = NULL;
4092}
4093
4094/*
4095 * Clear the page table pointer in context or pasid table entries so that
4096 * all DMA requests without PASID from the device are blocked. If the page
4097 * table has been set, clean up the data structures.
4098 */
4099static void device_block_translation(struct device *dev)
4100{
4101	struct device_domain_info *info = dev_iommu_priv_get(dev);
4102	struct intel_iommu *iommu = info->iommu;
4103	unsigned long flags;
4104
4105	iommu_disable_pci_caps(info);
4106	if (!dev_is_real_dma_subdevice(dev)) {
4107		if (sm_supported(iommu))
4108			intel_pasid_tear_down_entry(iommu, dev,
4109						    PASID_RID2PASID, false);
4110		else
4111			domain_context_clear(info);
4112	}
4113
4114	if (!info->domain)
4115		return;
4116
4117	spin_lock_irqsave(&info->domain->lock, flags);
4118	list_del(&info->link);
4119	spin_unlock_irqrestore(&info->domain->lock, flags);
4120
4121	domain_detach_iommu(info->domain, iommu);
4122	info->domain = NULL;
4123}
4124
4125static int md_domain_init(struct dmar_domain *domain, int guest_width)
4126{
4127	int adjust_width;
4128
4129	/* calculate AGAW */
4130	domain->gaw = guest_width;
4131	adjust_width = guestwidth_to_adjustwidth(guest_width);
4132	domain->agaw = width_to_agaw(adjust_width);
4133
4134	domain->iommu_coherency = false;
4135	domain->iommu_superpage = 0;
4136	domain->max_addr = 0;
4137
4138	/* always allocate the top pgd */
4139	domain->pgd = alloc_pgtable_page(domain->nid);
4140	if (!domain->pgd)
4141		return -ENOMEM;
4142	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4143	return 0;
4144}
4145
4146static int blocking_domain_attach_dev(struct iommu_domain *domain,
4147				      struct device *dev)
4148{
4149	device_block_translation(dev);
4150	return 0;
4151}
4152
4153static struct iommu_domain blocking_domain = {
4154	.ops = &(const struct iommu_domain_ops) {
4155		.attach_dev	= blocking_domain_attach_dev,
4156		.free		= intel_iommu_domain_free
4157	}
4158};
4159
4160static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4161{
4162	struct dmar_domain *dmar_domain;
4163	struct iommu_domain *domain;
4164
4165	switch (type) {
4166	case IOMMU_DOMAIN_BLOCKED:
4167		return &blocking_domain;
4168	case IOMMU_DOMAIN_DMA:
4169	case IOMMU_DOMAIN_DMA_FQ:
4170	case IOMMU_DOMAIN_UNMANAGED:
4171		dmar_domain = alloc_domain(type);
4172		if (!dmar_domain) {
4173			pr_err("Can't allocate dmar_domain\n");
4174			return NULL;
4175		}
4176		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4177			pr_err("Domain initialization failed\n");
4178			domain_exit(dmar_domain);
4179			return NULL;
4180		}
4181
4182		domain = &dmar_domain->domain;
4183		domain->geometry.aperture_start = 0;
4184		domain->geometry.aperture_end   =
4185				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4186		domain->geometry.force_aperture = true;
4187
4188		return domain;
4189	case IOMMU_DOMAIN_IDENTITY:
4190		return &si_domain->domain;
4191	case IOMMU_DOMAIN_SVA:
4192		return intel_svm_domain_alloc();
4193	default:
4194		return NULL;
4195	}
4196
4197	return NULL;
4198}
4199
4200static void intel_iommu_domain_free(struct iommu_domain *domain)
4201{
4202	if (domain != &si_domain->domain && domain != &blocking_domain)
4203		domain_exit(to_dmar_domain(domain));
4204}
4205
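/*
 * Check that the device's IOMMU is compatible with the domain: verify
 * the force-snooping and address-width constraints, then drop extra
 * page-table levels if the IOMMU supports a smaller agaw.
 */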
4206static int prepare_domain_attach_device(struct iommu_domain *domain,
4207					struct device *dev)
4208{
4209	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4210	struct intel_iommu *iommu;
4211	int addr_width;
4212
4213	iommu = device_to_iommu(dev, NULL, NULL);
4214	if (!iommu)
4215		return -ENODEV;
4216
4217	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4218		return -EINVAL;
4219
4220	/* check if this iommu agaw is sufficient for max mapped address */
4221	addr_width = agaw_to_width(iommu->agaw);
4222	if (addr_width > cap_mgaw(iommu->cap))
4223		addr_width = cap_mgaw(iommu->cap);
4224
4225	if (dmar_domain->max_addr > (1LL << addr_width))
4226		return -EINVAL;
4227	dmar_domain->gaw = addr_width;
4228
4229	/*
4230	 * Knock out extra levels of page tables if necessary
4231	 */
4232	while (iommu->agaw < dmar_domain->agaw) {
4233		struct dma_pte *pte;
4234
4235		pte = dmar_domain->pgd;
4236		if (dma_pte_present(pte)) {
4237			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4238			free_pgtable_page(pte);
4239		}
4240		dmar_domain->agaw--;
4241	}
4242
4243	return 0;
4244}
4245
4246static int intel_iommu_attach_device(struct iommu_domain *domain,
4247				     struct device *dev)
4248{
4249	struct device_domain_info *info = dev_iommu_priv_get(dev);
4250	int ret;
4251
4252	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4253	    device_is_rmrr_locked(dev)) {
4254		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4255		return -EPERM;
4256	}
4257
4258	if (info->domain)
4259		device_block_translation(dev);
4260
4261	ret = prepare_domain_attach_device(domain, dev);
4262	if (ret)
4263		return ret;
4264
4265	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4266}
4267
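/*
 * Map the range [iova, iova + size) to @hpa with the requested protection,
 * first checking that the domain's address width covers the end of the
 * mapping and updating the domain's max_addr bookkeeping.
 */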
4268static int intel_iommu_map(struct iommu_domain *domain,
4269			   unsigned long iova, phys_addr_t hpa,
4270			   size_t size, int iommu_prot, gfp_t gfp)
4271{
4272	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4273	u64 max_addr;
4274	int prot = 0;
4275
4276	if (iommu_prot & IOMMU_READ)
4277		prot |= DMA_PTE_READ;
4278	if (iommu_prot & IOMMU_WRITE)
4279		prot |= DMA_PTE_WRITE;
4280	if (dmar_domain->set_pte_snp)
4281		prot |= DMA_PTE_SNP;
4282
4283	max_addr = iova + size;
4284	if (dmar_domain->max_addr < max_addr) {
4285		u64 end;
4286
4287		/* check if minimum agaw is sufficient for mapped address */
4288		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4289		if (end < max_addr) {
4290			pr_err("%s: iommu width (%d) is not "
4291			       "sufficient for the mapped address (%llx)\n",
4292			       __func__, dmar_domain->gaw, max_addr);
4293			return -EFAULT;
4294		}
4295		dmar_domain->max_addr = max_addr;
4296	}
4297	/* Round up size to next multiple of PAGE_SIZE, if it and
4298	   the low bits of hpa would take us onto the next page */
4299	size = aligned_nrpages(hpa, size);
4300	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4301				hpa >> VTD_PAGE_SHIFT, size, prot);
4302}
4303
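/*
 * map_pages callback: only 4K, 2M and 1G page sizes are accepted, and both
 * iova and paddr must be aligned to the chosen page size.
 */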
4304static int intel_iommu_map_pages(struct iommu_domain *domain,
4305				 unsigned long iova, phys_addr_t paddr,
4306				 size_t pgsize, size_t pgcount,
4307				 int prot, gfp_t gfp, size_t *mapped)
4308{
4309	unsigned long pgshift = __ffs(pgsize);
4310	size_t size = pgcount << pgshift;
4311	int ret;
4312
4313	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4314		return -EINVAL;
4315
4316	if (!IS_ALIGNED(iova | paddr, pgsize))
4317		return -EINVAL;
4318
4319	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4320	if (!ret && mapped)
4321		*mapped = size;
4322
4323	return ret;
4324}
4325
4326static size_t intel_iommu_unmap(struct iommu_domain *domain,
4327				unsigned long iova, size_t size,
4328				struct iommu_iotlb_gather *gather)
4329{
4330	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4331	unsigned long start_pfn, last_pfn;
4332	int level = 0;
4333
4334	/* Cope with horrid API which requires us to unmap more than the
4335	   size argument if it happens to be a large-page mapping. */
4336	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4337
4338	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4339		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4340
4341	start_pfn = iova >> VTD_PAGE_SHIFT;
4342	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4343
4344	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4345
4346	if (dmar_domain->max_addr == iova + size)
4347		dmar_domain->max_addr = iova;
4348
4349	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4350
4351	return size;
4352}
4353
4354static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4355				      unsigned long iova,
4356				      size_t pgsize, size_t pgcount,
4357				      struct iommu_iotlb_gather *gather)
4358{
4359	unsigned long pgshift = __ffs(pgsize);
4360	size_t size = pgcount << pgshift;
4361
4362	return intel_iommu_unmap(domain, iova, size, gather);
4363}
4364
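/*
 * Flush the accumulated unmap range from the IOTLB of every IOMMU this
 * domain is attached to, then free the page-table pages collected in the
 * gather's freelist.
 */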
4365static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4366				 struct iommu_iotlb_gather *gather)
4367{
4368	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4369	unsigned long iova_pfn = IOVA_PFN(gather->start);
4370	size_t size = gather->end - gather->start;
4371	struct iommu_domain_info *info;
4372	unsigned long start_pfn;
4373	unsigned long nrpages;
4374	unsigned long i;
4375
4376	nrpages = aligned_nrpages(gather->start, size);
4377	start_pfn = mm_to_dma_pfn(iova_pfn);
4378
4379	xa_for_each(&dmar_domain->iommu_array, i, info)
4380		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4381				      start_pfn, nrpages,
4382				      list_empty(&gather->freelist), 0);
4383
4384	put_pages_list(&gather->freelist);
4385}
4386
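/*
 * Walk the domain's page table to translate @iova to a physical address;
 * returns 0 if no present PTE covers the address.
 */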
4387static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4388					    dma_addr_t iova)
4389{
4390	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4391	struct dma_pte *pte;
4392	int level = 0;
4393	u64 phys = 0;
4394
4395	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4396	if (pte && dma_pte_present(pte))
4397		phys = dma_pte_addr(pte) +
4398			(iova & (BIT_MASK(level_to_offset_bits(level) +
4399						VTD_PAGE_SHIFT) - 1));
4400
4401	return phys;
4402}
4403
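/*
 * Force snooping can only be enforced when every IOMMU with a device in this
 * domain advertises snoop control (ecap SC). Caller must hold domain->lock.
 */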
4404static bool domain_support_force_snooping(struct dmar_domain *domain)
4405{
4406	struct device_domain_info *info;
4407	bool support = true;
4408
4409	assert_spin_locked(&domain->lock);
4410	list_for_each_entry(info, &domain->devices, link) {
4411		if (!ecap_sc_support(info->iommu->ecap)) {
4412			support = false;
4413			break;
4414		}
4415	}
4416
4417	return support;
4418}
4419
4420static void domain_set_force_snooping(struct dmar_domain *domain)
4421{
4422	struct device_domain_info *info;
4423
4424	assert_spin_locked(&domain->lock);
4425	/*
4426	 * The second-level page table supports per-PTE snoop control. The
4427	 * iommu_map() interface handles this by setting the SNP bit.
4428	 */
4429	if (!domain->use_first_level) {
4430		domain->set_pte_snp = true;
4431		return;
4432	}
4433
4434	list_for_each_entry(info, &domain->devices, link)
4435		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4436						     PASID_RID2PASID);
4437}
4438
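/*
 * Enforce cache coherency for the domain's current and future mappings;
 * fails if any attached IOMMU lacks snoop control support.
 */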
4439static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4440{
4441	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4442	unsigned long flags;
4443
4444	if (dmar_domain->force_snooping)
4445		return true;
4446
4447	spin_lock_irqsave(&dmar_domain->lock, flags);
4448	if (!domain_support_force_snooping(dmar_domain)) {
4449		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4450		return false;
4451	}
4452
4453	domain_set_force_snooping(dmar_domain);
4454	dmar_domain->force_snooping = true;
4455	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4456
4457	return true;
4458}
4459
4460static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4461{
4462	struct device_domain_info *info = dev_iommu_priv_get(dev);
4463
4464	switch (cap) {
4465	case IOMMU_CAP_CACHE_COHERENCY:
4466		return true;
4467	case IOMMU_CAP_INTR_REMAP:
4468		return irq_remapping_enabled == 1;
4469	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4470		return dmar_platform_optin();
4471	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4472		return ecap_sc_support(info->iommu->ecap);
4473	default:
4474		return false;
4475	}
4476}
4477
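/*
 * Called when the IOMMU core discovers a device: find the DMAR unit that
 * covers it, allocate per-device info, probe ATS/PASID/PRI capabilities and,
 * in scalable mode, allocate the device's PASID table.
 */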
4478static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4479{
4480	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4481	struct device_domain_info *info;
4482	struct intel_iommu *iommu;
4483	u8 bus, devfn;
4484	int ret;
4485
4486	iommu = device_to_iommu(dev, &bus, &devfn);
4487	if (!iommu || !iommu->iommu.ops)
4488		return ERR_PTR(-ENODEV);
4489
4490	info = kzalloc(sizeof(*info), GFP_KERNEL);
4491	if (!info)
4492		return ERR_PTR(-ENOMEM);
4493
4494	if (dev_is_real_dma_subdevice(dev)) {
4495		info->bus = pdev->bus->number;
4496		info->devfn = pdev->devfn;
4497		info->segment = pci_domain_nr(pdev->bus);
4498	} else {
4499		info->bus = bus;
4500		info->devfn = devfn;
4501		info->segment = iommu->segment;
4502	}
4503
4504	info->dev = dev;
4505	info->iommu = iommu;
4506	if (dev_is_pci(dev)) {
4507		if (ecap_dev_iotlb_support(iommu->ecap) &&
4508		    pci_ats_supported(pdev) &&
4509		    dmar_ats_supported(pdev, iommu)) {
4510			info->ats_supported = 1;
4511			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4512		}
4513		if (sm_supported(iommu)) {
4514			if (pasid_supported(iommu)) {
4515				int features = pci_pasid_features(pdev);
4516
4517				if (features >= 0)
4518					info->pasid_supported = features | 1;
4519			}
4520
4521			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4522			    pci_pri_supported(pdev))
4523				info->pri_supported = 1;
4524		}
4525	}
4526
4527	dev_iommu_priv_set(dev, info);
4528
4529	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4530		ret = intel_pasid_alloc_table(dev);
4531		if (ret) {
4532			dev_err(dev, "PASID table allocation failed\n");
4533			dev_iommu_priv_set(dev, NULL);
4534			kfree(info);
4535			return ERR_PTR(ret);
4536		}
4537	}
4538
4539	return &iommu->iommu;
4540}
4541
4542static void intel_iommu_release_device(struct device *dev)
4543{
4544	struct device_domain_info *info = dev_iommu_priv_get(dev);
4545
4546	dmar_remove_one_dev_info(dev);
4547	intel_pasid_free_table(dev);
4548	dev_iommu_priv_set(dev, NULL);
4549	kfree(info);
4550	set_dma_ops(dev, NULL);
4551}
4552
4553static void intel_iommu_probe_finalize(struct device *dev)
4554{
4555	set_dma_ops(dev, NULL);
4556	iommu_setup_dma_ops(dev, 0, U64_MAX);
4557}
4558
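/*
 * Report the reserved regions for @device: RMRRs that target the device
 * itself or a PCI bridge above it, the optional ISA/floppy direct-mapped
 * window, and the IOAPIC MSI range.
 */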
4559static void intel_iommu_get_resv_regions(struct device *device,
4560					 struct list_head *head)
4561{
4562	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4563	struct iommu_resv_region *reg;
4564	struct dmar_rmrr_unit *rmrr;
4565	struct device *i_dev;
4566	int i;
4567
4568	rcu_read_lock();
4569	for_each_rmrr_units(rmrr) {
4570		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4571					  i, i_dev) {
4572			struct iommu_resv_region *resv;
4573			enum iommu_resv_type type;
4574			size_t length;
4575
4576			if (i_dev != device &&
4577			    !is_downstream_to_pci_bridge(device, i_dev))
4578				continue;
4579
4580			length = rmrr->end_address - rmrr->base_address + 1;
4581
4582			type = device_rmrr_is_relaxable(device) ?
4583				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4584
4585			resv = iommu_alloc_resv_region(rmrr->base_address,
4586						       length, prot, type,
4587						       GFP_ATOMIC);
4588			if (!resv)
4589				break;
4590
4591			list_add_tail(&resv->list, head);
4592		}
4593	}
4594	rcu_read_unlock();
4595
4596#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4597	if (dev_is_pci(device)) {
4598		struct pci_dev *pdev = to_pci_dev(device);
4599
4600		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4601			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4602					IOMMU_RESV_DIRECT_RELAXABLE,
4603					GFP_KERNEL);
4604			if (reg)
4605				list_add_tail(&reg->list, head);
4606		}
4607	}
4608#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4609
4610	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4611				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4612				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4613	if (!reg)
4614		return;
4615	list_add_tail(&reg->list, head);
4616}
4617
4618static struct iommu_group *intel_iommu_device_group(struct device *dev)
4619{
4620	if (dev_is_pci(dev))
4621		return pci_device_group(dev);
4622	return generic_device_group(dev);
4623}
4624
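/*
 * SVA needs an SVM-capable IOMMU and a device that already has PASID, PRI
 * and ATS enabled; on success the device is added to the IOMMU's I/O page
 * fault queue and a fault handler is registered for it.
 */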
4625static int intel_iommu_enable_sva(struct device *dev)
4626{
4627	struct device_domain_info *info = dev_iommu_priv_get(dev);
4628	struct intel_iommu *iommu;
4629	int ret;
4630
4631	if (!info || dmar_disabled)
4632		return -EINVAL;
4633
4634	iommu = info->iommu;
4635	if (!iommu)
4636		return -EINVAL;
4637
4638	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4639		return -ENODEV;
4640
4641	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4642		return -EINVAL;
4643
4644	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4645	if (!ret)
4646		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4647
4648	return ret;
4649}
4650
4651static int intel_iommu_disable_sva(struct device *dev)
4652{
4653	struct device_domain_info *info = dev_iommu_priv_get(dev);
4654	struct intel_iommu *iommu = info->iommu;
4655	int ret;
4656
4657	ret = iommu_unregister_device_fault_handler(dev);
4658	if (!ret)
4659		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4660
4661	return ret;
4662}
4663
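/* I/O page faults can only be enabled for devices with PRI support. */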
4664static int intel_iommu_enable_iopf(struct device *dev)
4665{
4666	struct device_domain_info *info = dev_iommu_priv_get(dev);
4667
4668	if (info && info->pri_supported)
4669		return 0;
4670
4671	return -ENODEV;
4672}
4673
4674static int
4675intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4676{
4677	switch (feat) {
4678	case IOMMU_DEV_FEAT_IOPF:
4679		return intel_iommu_enable_iopf(dev);
4680
4681	case IOMMU_DEV_FEAT_SVA:
4682		return intel_iommu_enable_sva(dev);
4683
4684	default:
4685		return -ENODEV;
4686	}
4687}
4688
4689static int
4690intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4691{
4692	switch (feat) {
4693	case IOMMU_DEV_FEAT_IOPF:
4694		return 0;
4695
4696	case IOMMU_DEV_FEAT_SVA:
4697		return intel_iommu_disable_sva(dev);
4698
4699	default:
4700		return -ENODEV;
4701	}
4702}
4703
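/*
 * Attach is deferred for a device whose IOMMU already had translation
 * enabled when this driver took over and that has no domain assigned yet.
 */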
4704static bool intel_iommu_is_attach_deferred(struct device *dev)
4705{
4706	struct device_domain_info *info = dev_iommu_priv_get(dev);
4707
4708	return translation_pre_enabled(info->iommu) && !info->domain;
4709}
4710
4711/*
4712 * Check that the device does not live on an external facing PCI port that is
4713 * marked as untrusted. Such devices should not be able to apply quirks and
4714 * thus not be able to bypass the IOMMU restrictions.
4715 */
4716static bool risky_device(struct pci_dev *pdev)
4717{
4718	if (pdev->untrusted) {
4719		pci_info(pdev,
4720			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4721			 pdev->vendor, pdev->device);
4722		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4723		return true;
4724	}
4725	return false;
4726}
4727
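/*
 * After a map operation, notify every IOMMU this domain is attached to about
 * the newly mapped page range so any required IOTLB maintenance is done.
 */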
4728static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4729				       unsigned long iova, size_t size)
4730{
4731	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4732	unsigned long pages = aligned_nrpages(iova, size);
4733	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4734	struct iommu_domain_info *info;
4735	unsigned long i;
4736
4737	xa_for_each(&dmar_domain->iommu_array, i, info)
4738		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4739}
4740
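/*
 * Detach @pasid from @dev: run the domain-type specific teardown (only SVA
 * domains are expected here) and then tear down the PASID table entry.
 */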
4741static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4742{
4743	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4744	struct iommu_domain *domain;
4745
4746	/* Domain type specific cleanup: */
4747	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4748	if (domain) {
4749		switch (domain->type) {
4750		case IOMMU_DOMAIN_SVA:
4751			intel_svm_remove_dev_pasid(dev, pasid);
4752			break;
4753		default:
4754			/* should never reach here */
4755			WARN_ON(1);
4756			break;
4757		}
4758	}
4759
4760	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4761}
4762
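/* The iommu_ops implemented by the Intel VT-d driver. */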
4763const struct iommu_ops intel_iommu_ops = {
4764	.capable		= intel_iommu_capable,
4765	.domain_alloc		= intel_iommu_domain_alloc,
4766	.probe_device		= intel_iommu_probe_device,
4767	.probe_finalize		= intel_iommu_probe_finalize,
4768	.release_device		= intel_iommu_release_device,
4769	.get_resv_regions	= intel_iommu_get_resv_regions,
4770	.device_group		= intel_iommu_device_group,
4771	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4772	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4773	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4774	.def_domain_type	= device_def_domain_type,
4775	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4776	.pgsize_bitmap		= SZ_4K,
4777#ifdef CONFIG_INTEL_IOMMU_SVM
4778	.page_response		= intel_svm_page_response,
4779#endif
4780	.default_domain_ops = &(const struct iommu_domain_ops) {
4781		.attach_dev		= intel_iommu_attach_device,
4782		.map_pages		= intel_iommu_map_pages,
4783		.unmap_pages		= intel_iommu_unmap_pages,
4784		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4785		.flush_iotlb_all        = intel_flush_iotlb_all,
4786		.iotlb_sync		= intel_iommu_tlb_sync,
4787		.iova_to_phys		= intel_iommu_iova_to_phys,
4788		.free			= intel_iommu_domain_free,
4789		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4790	}
4791};
4792
4793static void quirk_iommu_igfx(struct pci_dev *dev)
4794{
4795	if (risky_device(dev))
4796		return;
4797
4798	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4799	dmar_map_gfx = 0;
4800}
4801
4802/* G4x/GM45 integrated gfx dmar support is totally busted. */
4803DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4804DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4805DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4806DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4807DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4808DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4809DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4810
4811/* Broadwell igfx malfunctions with dmar */
4812DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4813DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4814DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4815DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4816DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4817DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4818DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4819DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4820DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4821DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4822DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4823DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4824DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4825DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4826DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4827DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4828DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4829DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4830DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4831DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4832DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4833DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4834DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4835DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4836
4837static void quirk_iommu_rwbf(struct pci_dev *dev)
4838{
4839	if (risky_device(dev))
4840		return;
4841
4842	/*
4843	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4844	 * but needs it. Same seems to hold for the desktop versions.
4845	 */
4846	pci_info(dev, "Forcing write-buffer flush capability\n");
4847	rwbf_quirk = 1;
4848}
4849
4850DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4851DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4852DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4853DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4854DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4855DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4856DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4857
4858#define GGC 0x52
4859#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4860#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4861#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4862#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4863#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4864#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4865#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4866#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4867
4868static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4869{
4870	unsigned short ggc;
4871
4872	if (risky_device(dev))
4873		return;
4874
4875	if (pci_read_config_word(dev, GGC, &ggc))
4876		return;
4877
4878	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4879		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4880		dmar_map_gfx = 0;
4881	} else if (dmar_map_gfx) {
4882		/* we have to ensure the gfx device is idle before we flush */
4883		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4884		iommu_set_dma_strict();
4885	}
4886}
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4889DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4890DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4891
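/*
 * Match integrated graphics devices by device ID and flag them via
 * iommu_skip_te_disable so that IOMMU translation is not disabled for them.
 */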
4892static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4893{
4894	unsigned short ver;
4895
4896	if (!IS_GFX_DEVICE(dev))
4897		return;
4898
4899	ver = (dev->device >> 8) & 0xff;
4900	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4901	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4902	    ver != 0x9a && ver != 0xa7)
4903		return;
4904
4905	if (risky_device(dev))
4906		return;
4907
4908	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4909	iommu_skip_te_disable = 1;
4910}
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4912
4913/* On Tylersburg chipsets, some BIOSes have been known to enable the
4914   ISOCH DMAR unit for the Azalia sound device, but not give it any
4915   TLB entries, which causes it to deadlock. Check for that.  We do
4916   this in a function called from init_dmars(), instead of in a PCI
4917   quirk, because we don't want to print the obnoxious "BIOS broken"
4918   message if VT-d is actually disabled.
4919*/
4920static void __init check_tylersburg_isoch(void)
4921{
4922	struct pci_dev *pdev;
4923	uint32_t vtisochctrl;
4924
4925	/* If there's no Azalia in the system anyway, forget it. */
4926	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4927	if (!pdev)
4928		return;
4929
4930	if (risky_device(pdev)) {
4931		pci_dev_put(pdev);
4932		return;
4933	}
4934
4935	pci_dev_put(pdev);
4936
4937	/* System Management Registers. Might be hidden, in which case
4938	   we can't do the sanity check. But that's OK, because the
4939	   known-broken BIOSes _don't_ actually hide it, so far. */
4940	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4941	if (!pdev)
4942		return;
4943
4944	if (risky_device(pdev)) {
4945		pci_dev_put(pdev);
4946		return;
4947	}
4948
4949	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4950		pci_dev_put(pdev);
4951		return;
4952	}
4953
4954	pci_dev_put(pdev);
4955
4956	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4957	if (vtisochctrl & 1)
4958		return;
4959
4960	/* Drop all bits other than the number of TLB entries */
4961	vtisochctrl &= 0x1c;
4962
4963	/* If we have the recommended number of TLB entries (16), fine. */
4964	if (vtisochctrl == 0x10)
4965		return;
4966
4967	/* Zero TLB entries? The BIOS is broken; fall back to identity mapping for Azalia. */
4968	if (!vtisochctrl) {
4969		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4970		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4971		     dmi_get_system_info(DMI_BIOS_VENDOR),
4972		     dmi_get_system_info(DMI_BIOS_VERSION),
4973		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4974		iommu_identity_mapping |= IDENTMAP_AZALIA;
4975		return;
4976	}
4977
4978	pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
4979	       vtisochctrl);
4980}
4981
4982/*
4983 * Here we deal with a device TLB defect where the device may inadvertently
4984 * issue an ATS invalidation completion before posted writes initiated with a
4985 * translated address that used translations matching the invalidation address
4986 * range, violating the invalidation completion ordering.
4987 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
4988 * is vulnerable to this defect. In other words, any dTLB invalidation that is
4989 * not initiated under the control of the trusted/privileged host device
4990 * driver must use this quirk.
4991 * Device TLBs are invalidated under the following six conditions:
4992 * 1. Device driver does DMA API unmap IOVA
4993 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4994 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4995 *    exit_mmap() due to crash
4996 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4997 *    VM has to free pages that were unmapped
4998 * 5. Userspace driver unmaps a DMA buffer
4999 * 6. Cache invalidation in vSVA usage (upcoming)
5000 *
5001 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5002 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
5003 * invalidate the TLB the same way as a normal user unmap, which will use this
5004 * quirk. The dTLB invalidation after a PASID cache flush does not need it.
5005 *
5006 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5007 */
5008void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5009			       unsigned long address, unsigned long mask,
5010			       u32 pasid, u16 qdep)
5011{
5012	u16 sid;
5013
5014	if (likely(!info->dtlb_extra_inval))
5015		return;
5016
5017	sid = PCI_DEVID(info->bus, info->devfn);
5018	if (pasid == PASID_RID2PASID) {
5019		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5020				   qdep, address, mask);
5021	} else {
5022		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5023					 pasid, qdep, address, mask);
5024	}
5025}