1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/intel-svm.h>
20#include <linux/memory.h>
21#include <linux/pci.h>
22#include <linux/pci-ats.h>
23#include <linux/spinlock.h>
24#include <linux/syscore_ops.h>
25#include <linux/tboot.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-sva.h"
31#include "pasid.h"
32#include "cap_audit.h"
33
34#define ROOT_SIZE VTD_PAGE_SIZE
35#define CONTEXT_SIZE VTD_PAGE_SIZE
36
37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42#define IOAPIC_RANGE_START (0xfee00000)
43#define IOAPIC_RANGE_END (0xfeefffff)
44#define IOVA_START_ADDR (0x1000)
45
46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48#define MAX_AGAW_WIDTH 64
49#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
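/*
 * For illustration (VTD_PAGE_SHIFT == 12): with gaw == 48,
 * __DOMAIN_MAX_ADDR() is 2^48 - 1 and __DOMAIN_MAX_PFN() is 2^36 - 1.
 * DOMAIN_MAX_PFN() only differs when that value would not fit in an
 * unsigned long (e.g. a 32-bit build), where it is clamped to ULONG_MAX.
 */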
59
60/* IO virtual address start page frame number */
61#define IOVA_START_PFN (1)
62
63#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
64
65/* page table handling */
66#define LEVEL_STRIDE (9)
67#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
68
69static inline int agaw_to_level(int agaw)
70{
71 return agaw + 2;
72}
73
74static inline int agaw_to_width(int agaw)
75{
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77}
78
79static inline int width_to_agaw(int width)
80{
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82}
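/*
 * Worked example of the AGAW encoding used by these helpers: each paging
 * level resolves LEVEL_STRIDE == 9 address bits above the 12-bit page
 * offset, so AGAW 1 corresponds to a 39-bit width and a 3-level table,
 * AGAW 2 to 48 bits and 4 levels, and AGAW 3 to 57 bits and 5 levels;
 * e.g. width_to_agaw(48) == 2 and agaw_to_level(2) == 4.
 */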
83
84static inline unsigned int level_to_offset_bits(int level)
85{
86 return (level - 1) * LEVEL_STRIDE;
87}
88
89static inline int pfn_level_offset(u64 pfn, int level)
90{
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92}
93
94static inline u64 level_mask(int level)
95{
96 return -1ULL << level_to_offset_bits(level);
97}
98
99static inline u64 level_size(int level)
100{
101 return 1ULL << level_to_offset_bits(level);
102}
103
104static inline u64 align_to_level(u64 pfn, int level)
105{
106 return (pfn + level_size(level) - 1) & level_mask(level);
107}
108
109static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110{
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112}
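/*
 * For reference: a leaf PTE at level 1 maps a single 4KiB VT-d page,
 * level 2 maps 512 pages (2MiB) and level 3 maps 262144 pages (1GiB);
 * level_size()/lvl_to_nr_pages() simply grow by 2^LEVEL_STRIDE per level.
 */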
113
114/* VT-d pages must never be larger than MM pages. Otherwise things
115 are never going to work. */
116static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117{
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119}
120static inline unsigned long page_to_dma_pfn(struct page *pg)
121{
122 return mm_to_dma_pfn(page_to_pfn(pg));
123}
124static inline unsigned long virt_to_dma_pfn(void *p)
125{
126 return page_to_dma_pfn(virt_to_page(p));
127}
128
129static void __init check_tylersburg_isoch(void);
130static int rwbf_quirk;
131
132/*
133 * Set to 1 to panic the kernel if VT-d can't be successfully enabled
134 * (used when the kernel is launched w/ TXT).
135 */
136static int force_on = 0;
137static int intel_iommu_tboot_noforce;
138static int no_platform_optin;
139
140#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141
142/*
143 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144 * if marked present.
145 */
146static phys_addr_t root_entry_lctp(struct root_entry *re)
147{
148 if (!(re->lo & 1))
149 return 0;
150
151 return re->lo & VTD_PAGE_MASK;
152}
153
154/*
155 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156 * if marked present.
157 */
158static phys_addr_t root_entry_uctp(struct root_entry *re)
159{
160 if (!(re->hi & 1))
161 return 0;
162
163 return re->hi & VTD_PAGE_MASK;
164}
165
166static inline void context_set_present(struct context_entry *context)
167{
168 context->lo |= 1;
169}
170
171static inline void context_set_fault_enable(struct context_entry *context)
172{
173 context->lo &= (((u64)-1) << 2) | 1;
174}
175
176static inline void context_set_translation_type(struct context_entry *context,
177 unsigned long value)
178{
179 context->lo &= (((u64)-1) << 4) | 3;
180 context->lo |= (value & 3) << 2;
181}
182
183static inline void context_set_address_root(struct context_entry *context,
184 unsigned long value)
185{
186 context->lo &= ~VTD_PAGE_MASK;
187 context->lo |= value & VTD_PAGE_MASK;
188}
189
190static inline void context_set_address_width(struct context_entry *context,
191 unsigned long value)
192{
193 context->hi |= value & 7;
194}
195
196static inline void context_set_domain_id(struct context_entry *context,
197 unsigned long value)
198{
199 context->hi |= (value & ((1 << 16) - 1)) << 8;
200}
201
202static inline void context_set_pasid(struct context_entry *context)
203{
204 context->lo |= CONTEXT_PASIDE;
205}
206
207static inline int context_domain_id(struct context_entry *c)
208{
209 return((c->hi >> 8) & 0xffff);
210}
211
212static inline void context_clear_entry(struct context_entry *context)
213{
214 context->lo = 0;
215 context->hi = 0;
216}
217
218static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219{
220 if (!iommu->copied_tables)
221 return false;
222
223 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224}
225
226static inline void
227set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228{
229 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230}
231
232static inline void
233clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234{
235 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236}
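/*
 * The copied_tables bitmap is indexed by the 16-bit source-id, i.e.
 * (bus << 8) | devfn; for example bus 0x12, devfn 0x34 maps to bit 0x1234.
 */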
237
238/*
239 * This domain is a static identity mapping domain.
240 * 1. This domain creates a static 1:1 mapping to all usable memory.
241 * 2. It maps to each iommu if successful.
242 * 3. Each iommu maps to this domain if successful.
243 */
244static struct dmar_domain *si_domain;
245static int hw_pass_through = 1;
246
247struct dmar_rmrr_unit {
248 struct list_head list; /* list of rmrr units */
249 struct acpi_dmar_header *hdr; /* ACPI header */
250 u64 base_address; /* reserved base address*/
251 u64 end_address; /* reserved end address */
252 struct dmar_dev_scope *devices; /* target devices */
253 int devices_cnt; /* target device count */
254};
255
256struct dmar_atsr_unit {
257 struct list_head list; /* list of ATSR units */
258 struct acpi_dmar_header *hdr; /* ACPI header */
259 struct dmar_dev_scope *devices; /* target devices */
260 int devices_cnt; /* target device count */
261 u8 include_all:1; /* include all ports */
262};
263
264struct dmar_satc_unit {
265 struct list_head list; /* list of SATC units */
266 struct acpi_dmar_header *hdr; /* ACPI header */
267 struct dmar_dev_scope *devices; /* target devices */
268 struct intel_iommu *iommu; /* the corresponding iommu */
269 int devices_cnt; /* target device count */
270 u8 atc_required:1; /* ATS is required */
271};
272
273static LIST_HEAD(dmar_atsr_units);
274static LIST_HEAD(dmar_rmrr_units);
275static LIST_HEAD(dmar_satc_units);
276
277#define for_each_rmrr_units(rmrr) \
278 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279
280static void device_block_translation(struct device *dev);
281static void intel_iommu_domain_free(struct iommu_domain *domain);
282
283int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
285
286int intel_iommu_enabled = 0;
287EXPORT_SYMBOL_GPL(intel_iommu_enabled);
288
289static int dmar_map_gfx = 1;
290static int intel_iommu_superpage = 1;
291static int iommu_identity_mapping;
292static int iommu_skip_te_disable;
293
294#define IDENTMAP_GFX 2
295#define IDENTMAP_AZALIA 4
296
297const struct iommu_ops intel_iommu_ops;
298
299static bool translation_pre_enabled(struct intel_iommu *iommu)
300{
301 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
302}
303
304static void clear_translation_pre_enabled(struct intel_iommu *iommu)
305{
306 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
307}
308
309static void init_translation_status(struct intel_iommu *iommu)
310{
311 u32 gsts;
312
313 gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 if (gsts & DMA_GSTS_TES)
315 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
316}
317
318static int __init intel_iommu_setup(char *str)
319{
320 if (!str)
321 return -EINVAL;
322
323 while (*str) {
324 if (!strncmp(str, "on", 2)) {
325 dmar_disabled = 0;
326 pr_info("IOMMU enabled\n");
327 } else if (!strncmp(str, "off", 3)) {
328 dmar_disabled = 1;
329 no_platform_optin = 1;
330 pr_info("IOMMU disabled\n");
331 } else if (!strncmp(str, "igfx_off", 8)) {
332 dmar_map_gfx = 0;
333 pr_info("Disable GFX device mapping\n");
334 } else if (!strncmp(str, "forcedac", 8)) {
335 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 iommu_dma_forcedac = true;
337 } else if (!strncmp(str, "strict", 6)) {
338 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 iommu_set_dma_strict();
340 } else if (!strncmp(str, "sp_off", 6)) {
341 pr_info("Disable supported super page\n");
342 intel_iommu_superpage = 0;
343 } else if (!strncmp(str, "sm_on", 5)) {
344 pr_info("Enable scalable mode if hardware supports\n");
345 intel_iommu_sm = 1;
346 } else if (!strncmp(str, "sm_off", 6)) {
347 pr_info("Scalable mode is disallowed\n");
348 intel_iommu_sm = 0;
349 } else if (!strncmp(str, "tboot_noforce", 13)) {
350 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 intel_iommu_tboot_noforce = 1;
352 } else {
353 pr_notice("Unknown option - '%s'\n", str);
354 }
355
356 str += strcspn(str, ",");
357 while (*str == ',')
358 str++;
359 }
360
361 return 1;
362}
363__setup("intel_iommu=", intel_iommu_setup);
364
365void *alloc_pgtable_page(int node)
366{
367 struct page *page;
368 void *vaddr = NULL;
369
370 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
371 if (page)
372 vaddr = page_address(page);
373 return vaddr;
374}
375
376void free_pgtable_page(void *vaddr)
377{
378 free_page((unsigned long)vaddr);
379}
380
381static inline int domain_type_is_si(struct dmar_domain *domain)
382{
383 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
384}
385
386static inline int domain_pfn_supported(struct dmar_domain *domain,
387 unsigned long pfn)
388{
389 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
390
391 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
392}
393
394/*
395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397 * the returned SAGAW.
398 */
399static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
400{
401 unsigned long fl_sagaw, sl_sagaw;
402
403 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 sl_sagaw = cap_sagaw(iommu->cap);
405
406 /* Second level only. */
407 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
408 return sl_sagaw;
409
410 /* First level only. */
411 if (!ecap_slts(iommu->ecap))
412 return fl_sagaw;
413
414 return fl_sagaw & sl_sagaw;
415}
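/*
 * Illustrative example: a scalable-mode IOMMU supporting both translation
 * types that reports SAGAW == BIT(2) (4-level) and supports 5-level
 * first-level paging gives fl_sagaw == BIT(2) | BIT(3) and
 * sl_sagaw == BIT(2); the intersection BIT(2) limits domains to 4-level
 * (48-bit) page tables usable by both first- and second-level translation.
 */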
416
417static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
418{
419 unsigned long sagaw;
420 int agaw;
421
422 sagaw = __iommu_calculate_sagaw(iommu);
423 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
424 if (test_bit(agaw, &sagaw))
425 break;
426 }
427
428 return agaw;
429}
430
431/*
432 * Calculate max SAGAW for each iommu.
433 */
434int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
435{
436 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
437}
438
439/*
440 * Calculate the agaw for each iommu.
441 * "SAGAW" may be different across iommus, so use a default agaw and fall
442 * back to a smaller supported agaw for iommus that don't support it.
443 */
444int iommu_calculate_agaw(struct intel_iommu *iommu)
445{
446 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
447}
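/*
 * E.g. with DEFAULT_DOMAIN_ADDRESS_WIDTH == 57 the search starts at
 * AGAW 3 (5-level, 57-bit) and walks down through AGAW 2 and lower until
 * a bit set in the calculated SAGAW is found; -1 is returned if none of
 * them is supported.
 */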
448
449static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
450{
451 return sm_supported(iommu) ?
452 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
453}
454
455static void domain_update_iommu_coherency(struct dmar_domain *domain)
456{
457 struct iommu_domain_info *info;
458 struct dmar_drhd_unit *drhd;
459 struct intel_iommu *iommu;
460 bool found = false;
461 unsigned long i;
462
463 domain->iommu_coherency = true;
464 xa_for_each(&domain->iommu_array, i, info) {
465 found = true;
466 if (!iommu_paging_structure_coherency(info->iommu)) {
467 domain->iommu_coherency = false;
468 break;
469 }
470 }
471 if (found)
472 return;
473
474 /* No hardware attached; use lowest common denominator */
475 rcu_read_lock();
476 for_each_active_iommu(iommu, drhd) {
477 if (!iommu_paging_structure_coherency(iommu)) {
478 domain->iommu_coherency = false;
479 break;
480 }
481 }
482 rcu_read_unlock();
483}
484
485static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 struct intel_iommu *skip)
487{
488 struct dmar_drhd_unit *drhd;
489 struct intel_iommu *iommu;
490 int mask = 0x3;
491
492 if (!intel_iommu_superpage)
493 return 0;
494
495 /* set iommu_superpage to the smallest common denominator */
496 rcu_read_lock();
497 for_each_active_iommu(iommu, drhd) {
498 if (iommu != skip) {
499 if (domain && domain->use_first_level) {
500 if (!cap_fl1gp_support(iommu->cap))
501 mask = 0x1;
502 } else {
503 mask &= cap_super_page_val(iommu->cap);
504 }
505
506 if (!mask)
507 break;
508 }
509 }
510 rcu_read_unlock();
511
512 return fls(mask);
513}
514
515static int domain_update_device_node(struct dmar_domain *domain)
516{
517 struct device_domain_info *info;
518 int nid = NUMA_NO_NODE;
519 unsigned long flags;
520
521 spin_lock_irqsave(&domain->lock, flags);
522 list_for_each_entry(info, &domain->devices, link) {
523 /*
524 * There could possibly be multiple device numa nodes, as devices
525 * within the same domain may sit behind different IOMMUs. There
526 * isn't a perfect answer in such a situation, so we select a
527 * first-come, first-served policy.
528 */
529 nid = dev_to_node(info->dev);
530 if (nid != NUMA_NO_NODE)
531 break;
532 }
533 spin_unlock_irqrestore(&domain->lock, flags);
534
535 return nid;
536}
537
538static void domain_update_iotlb(struct dmar_domain *domain);
539
540/* Return the super pagesize bitmap if supported. */
541static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
542{
543 unsigned long bitmap = 0;
544
545 /*
546 * 1-level super page supports page size of 2MiB, 2-level super page
547 * supports page size of both 2MiB and 1GiB.
548 */
549 if (domain->iommu_superpage == 1)
550 bitmap |= SZ_2M;
551 else if (domain->iommu_superpage == 2)
552 bitmap |= SZ_2M | SZ_1G;
553
554 return bitmap;
555}
556
557/* Some capabilities may be different across iommus */
558static void domain_update_iommu_cap(struct dmar_domain *domain)
559{
560 domain_update_iommu_coherency(domain);
561 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
562
563 /*
564 * If RHSA is missing, we should default to the device numa domain
565 * as a fallback.
566 */
567 if (domain->nid == NUMA_NO_NODE)
568 domain->nid = domain_update_device_node(domain);
569
570 /*
571 * First-level translation restricts the input-address to a
572 * canonical address (i.e., address bits 63:N have the same
573 * value as address bit [N-1], where N is 48-bits with 4-level
574 * paging and 57-bits with 5-level paging). Hence, skip bit
575 * [N-1].
576 */
577 if (domain->use_first_level)
578 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
579 else
580 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
581
582 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 domain_update_iotlb(domain);
584}
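/*
 * Example of the aperture adjustment above: for a 48-bit domain gaw the
 * aperture ends at 2^48 - 1 with second-level translation, but at
 * 2^47 - 1 with first-level translation, since bit 47 is the canonical
 * (sign) bit for 4-level paging.
 */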
585
586struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
587 u8 devfn, int alloc)
588{
589 struct root_entry *root = &iommu->root_entry[bus];
590 struct context_entry *context;
591 u64 *entry;
592
593 /*
594 * Unless the caller requested to allocate a new entry,
595 * returning a copied context entry makes no sense.
596 */
597 if (!alloc && context_copied(iommu, bus, devfn))
598 return NULL;
599
600 entry = &root->lo;
601 if (sm_supported(iommu)) {
602 if (devfn >= 0x80) {
603 devfn -= 0x80;
604 entry = &root->hi;
605 }
606 devfn *= 2;
607 }
608 if (*entry & 1)
609 context = phys_to_virt(*entry & VTD_PAGE_MASK);
610 else {
611 unsigned long phy_addr;
612 if (!alloc)
613 return NULL;
614
615 context = alloc_pgtable_page(iommu->node);
616 if (!context)
617 return NULL;
618
619 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 phy_addr = virt_to_phys((void *)context);
621 *entry = phy_addr | 1;
622 __iommu_flush_cache(iommu, entry, sizeof(*entry));
623 }
624 return &context[devfn];
625}
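/*
 * In scalable mode a root entry provides two context-table pointers:
 * root->lo covers devfn 0x00-0x7f and root->hi covers devfn 0x80-0xff,
 * and a scalable-mode context entry is twice the size of a legacy one,
 * hence the devfn *= 2 above. E.g. devfn 0x85 selects the upper table
 * and lands at index 0x0a of the context_entry array.
 */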
626
627/**
628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629 * sub-hierarchy of a candidate PCI-PCI bridge
630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631 * @bridge: the candidate PCI-PCI bridge
632 *
633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
634 */
635static bool
636is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
637{
638 struct pci_dev *pdev, *pbridge;
639
640 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
641 return false;
642
643 pdev = to_pci_dev(dev);
644 pbridge = to_pci_dev(bridge);
645
646 if (pbridge->subordinate &&
647 pbridge->subordinate->number <= pdev->bus->number &&
648 pbridge->subordinate->busn_res.end >= pdev->bus->number)
649 return true;
650
651 return false;
652}
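/*
 * E.g. if the bus range behind @bridge is [0x04, 0x07]
 * (subordinate->number == 0x04, busn_res.end == 0x07), a @dev sitting on
 * bus 0x05 is reported as downstream of it.
 */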
653
654static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
655{
656 struct dmar_drhd_unit *drhd;
657 u32 vtbar;
658 int rc;
659
660 /* We know that this device on this chipset has its own IOMMU.
661 * If we find it under a different IOMMU, then the BIOS is lying
662 * to us. Hope that the IOMMU for this device is actually
663 * disabled, and it needs no translation...
664 */
665 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
666 if (rc) {
667 /* "can't" happen */
668 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
669 return false;
670 }
671 vtbar &= 0xffff0000;
672
673 /* we know that this iommu should be at offset 0xa000 from vtbar */
674 drhd = dmar_find_matched_drhd_unit(pdev);
675 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
678 return true;
679 }
680
681 return false;
682}
683
684static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
685{
686 if (!iommu || iommu->drhd->ignored)
687 return true;
688
689 if (dev_is_pci(dev)) {
690 struct pci_dev *pdev = to_pci_dev(dev);
691
692 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 quirk_ioat_snb_local_iommu(pdev))
695 return true;
696 }
697
698 return false;
699}
700
701struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
702{
703 struct dmar_drhd_unit *drhd = NULL;
704 struct pci_dev *pdev = NULL;
705 struct intel_iommu *iommu;
706 struct device *tmp;
707 u16 segment = 0;
708 int i;
709
710 if (!dev)
711 return NULL;
712
713 if (dev_is_pci(dev)) {
714 struct pci_dev *pf_pdev;
715
716 pdev = pci_real_dma_dev(to_pci_dev(dev));
717
718 /* VFs aren't listed in scope tables; we need to look up
719 * the PF instead to find the IOMMU. */
720 pf_pdev = pci_physfn(pdev);
721 dev = &pf_pdev->dev;
722 segment = pci_domain_nr(pdev->bus);
723 } else if (has_acpi_companion(dev))
724 dev = &ACPI_COMPANION(dev)->dev;
725
726 rcu_read_lock();
727 for_each_iommu(iommu, drhd) {
728 if (pdev && segment != drhd->segment)
729 continue;
730
731 for_each_active_dev_scope(drhd->devices,
732 drhd->devices_cnt, i, tmp) {
733 if (tmp == dev) {
734 /* For a VF use its original BDF# not that of the PF
735 * which we used for the IOMMU lookup. Strictly speaking
736 * we could do this for all PCI devices; we only need to
737 * get the BDF# from the scope table for ACPI matches. */
738 if (pdev && pdev->is_virtfn)
739 goto got_pdev;
740
741 if (bus && devfn) {
742 *bus = drhd->devices[i].bus;
743 *devfn = drhd->devices[i].devfn;
744 }
745 goto out;
746 }
747
748 if (is_downstream_to_pci_bridge(dev, tmp))
749 goto got_pdev;
750 }
751
752 if (pdev && drhd->include_all) {
753got_pdev:
754 if (bus && devfn) {
755 *bus = pdev->bus->number;
756 *devfn = pdev->devfn;
757 }
758 goto out;
759 }
760 }
761 iommu = NULL;
762out:
763 if (iommu_is_dummy(iommu, dev))
764 iommu = NULL;
765
766 rcu_read_unlock();
767
768 return iommu;
769}
770
771static void domain_flush_cache(struct dmar_domain *domain,
772 void *addr, int size)
773{
774 if (!domain->iommu_coherency)
775 clflush_cache_range(addr, size);
776}
777
778static void free_context_table(struct intel_iommu *iommu)
779{
780 struct context_entry *context;
781 int i;
782
783 if (!iommu->root_entry)
784 return;
785
786 for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 context = iommu_context_addr(iommu, i, 0, 0);
788 if (context)
789 free_pgtable_page(context);
790
791 if (!sm_supported(iommu))
792 continue;
793
794 context = iommu_context_addr(iommu, i, 0x80, 0);
795 if (context)
796 free_pgtable_page(context);
797 }
798
799 free_pgtable_page(iommu->root_entry);
800 iommu->root_entry = NULL;
801}
802
803#ifdef CONFIG_DMAR_DEBUG
804static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 u8 bus, u8 devfn, struct dma_pte *parent, int level)
806{
807 struct dma_pte *pte;
808 int offset;
809
810 while (1) {
811 offset = pfn_level_offset(pfn, level);
812 pte = &parent[offset];
813 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 pr_info("PTE not present at level %d\n", level);
815 break;
816 }
817
818 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
819
820 if (level == 1)
821 break;
822
823 parent = phys_to_virt(dma_pte_addr(pte));
824 level--;
825 }
826}
827
828void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 unsigned long long addr, u32 pasid)
830{
831 struct pasid_dir_entry *dir, *pde;
832 struct pasid_entry *entries, *pte;
833 struct context_entry *ctx_entry;
834 struct root_entry *rt_entry;
835 int i, dir_index, index, level;
836 u8 devfn = source_id & 0xff;
837 u8 bus = source_id >> 8;
838 struct dma_pte *pgtable;
839
840 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
841
842 /* root entry dump */
843 rt_entry = &iommu->root_entry[bus];
844 if (!rt_entry) {
845 pr_info("root table entry is not present\n");
846 return;
847 }
848
849 if (sm_supported(iommu))
850 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 rt_entry->hi, rt_entry->lo);
852 else
853 pr_info("root entry: 0x%016llx", rt_entry->lo);
854
855 /* context entry dump */
856 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
857 if (!ctx_entry) {
858 pr_info("context table entry is not present\n");
859 return;
860 }
861
862 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 ctx_entry->hi, ctx_entry->lo);
864
865 /* legacy mode does not require PASID entries */
866 if (!sm_supported(iommu)) {
867 level = agaw_to_level(ctx_entry->hi & 7);
868 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
869 goto pgtable_walk;
870 }
871
872 /* get the pointer to pasid directory entry */
873 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 if (!dir) {
875 pr_info("pasid directory entry is not present\n");
876 return;
877 }
878 /* For request-without-pasid, get the pasid from context entry */
879 if (intel_iommu_sm && pasid == INVALID_IOASID)
880 pasid = PASID_RID2PASID;
881
882 dir_index = pasid >> PASID_PDE_SHIFT;
883 pde = &dir[dir_index];
884 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
885
886 /* get the pointer to the pasid table entry */
887 entries = get_pasid_table_from_pde(pde);
888 if (!entries) {
889 pr_info("pasid table entry is not present\n");
890 return;
891 }
892 index = pasid & PASID_PTE_MASK;
893 pte = &entries[index];
894 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
896
897 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
900 } else {
901 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
903 }
904
905pgtable_walk:
906 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
907}
908#endif
909
910static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level)
912{
913 struct dma_pte *parent, *pte;
914 int level = agaw_to_level(domain->agaw);
915 int offset;
916
917 BUG_ON(!domain->pgd);
918
919 if (!domain_pfn_supported(domain, pfn))
920 /* Address beyond IOMMU's addressing capabilities. */
921 return NULL;
922
923 parent = domain->pgd;
924
925 while (1) {
926 void *tmp_page;
927
928 offset = pfn_level_offset(pfn, level);
929 pte = &parent[offset];
930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931 break;
932 if (level == *target_level)
933 break;
934
935 if (!dma_pte_present(pte)) {
936 uint64_t pteval;
937
938 tmp_page = alloc_pgtable_page(domain->nid);
939
940 if (!tmp_page)
941 return NULL;
942
943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 if (domain->use_first_level)
946 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
947
948 if (cmpxchg64(&pte->val, 0ULL, pteval))
949 /* Someone else set it while we were thinking; use theirs. */
950 free_pgtable_page(tmp_page);
951 else
952 domain_flush_cache(domain, pte, sizeof(*pte));
953 }
954 if (level == 1)
955 break;
956
957 parent = phys_to_virt(dma_pte_addr(pte));
958 level--;
959 }
960
961 if (!*target_level)
962 *target_level = level;
963
964 return pte;
965}
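/*
 * Note on the walk above: a *target_level of 0 means "lookup only"; the
 * walk stops at the first superpage or non-present entry and reports the
 * level it reached back through *target_level. A non-zero *target_level
 * requests (allocating intermediate tables as needed) the PTE at exactly
 * that level, e.g. for installing 2MiB/1GiB superpage mappings.
 */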
966
967/* return address's pte at specific level */
968static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969 unsigned long pfn,
970 int level, int *large_page)
971{
972 struct dma_pte *parent, *pte;
973 int total = agaw_to_level(domain->agaw);
974 int offset;
975
976 parent = domain->pgd;
977 while (level <= total) {
978 offset = pfn_level_offset(pfn, total);
979 pte = &parent[offset];
980 if (level == total)
981 return pte;
982
983 if (!dma_pte_present(pte)) {
984 *large_page = total;
985 break;
986 }
987
988 if (dma_pte_superpage(pte)) {
989 *large_page = total;
990 return pte;
991 }
992
993 parent = phys_to_virt(dma_pte_addr(pte));
994 total--;
995 }
996 return NULL;
997}
998
999/* clear last level pte, a tlb flush should follow */
1000static void dma_pte_clear_range(struct dmar_domain *domain,
1001 unsigned long start_pfn,
1002 unsigned long last_pfn)
1003{
1004 unsigned int large_page;
1005 struct dma_pte *first_pte, *pte;
1006
1007 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009 BUG_ON(start_pfn > last_pfn);
1010
1011 /* we don't need lock here; nobody else touches the iova range */
1012 do {
1013 large_page = 1;
1014 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015 if (!pte) {
1016 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1017 continue;
1018 }
1019 do {
1020 dma_clear_pte(pte);
1021 start_pfn += lvl_to_nr_pages(large_page);
1022 pte++;
1023 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024
1025 domain_flush_cache(domain, first_pte,
1026 (void *)pte - (void *)first_pte);
1027
1028 } while (start_pfn && start_pfn <= last_pfn);
1029}
1030
1031static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032 int retain_level, struct dma_pte *pte,
1033 unsigned long pfn, unsigned long start_pfn,
1034 unsigned long last_pfn)
1035{
1036 pfn = max(start_pfn, pfn);
1037 pte = &pte[pfn_level_offset(pfn, level)];
1038
1039 do {
1040 unsigned long level_pfn;
1041 struct dma_pte *level_pte;
1042
1043 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044 goto next;
1045
1046 level_pfn = pfn & level_mask(level);
1047 level_pte = phys_to_virt(dma_pte_addr(pte));
1048
1049 if (level > 2) {
1050 dma_pte_free_level(domain, level - 1, retain_level,
1051 level_pte, level_pfn, start_pfn,
1052 last_pfn);
1053 }
1054
1055 /*
1056 * Free the page table if we're below the level we want to
1057 * retain and the range covers the entire table.
1058 */
1059 if (level < retain_level && !(start_pfn > level_pfn ||
1060 last_pfn < level_pfn + level_size(level) - 1)) {
1061 dma_clear_pte(pte);
1062 domain_flush_cache(domain, pte, sizeof(*pte));
1063 free_pgtable_page(level_pte);
1064 }
1065next:
1066 pfn += level_size(level);
1067 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068}
1069
1070/*
1071 * clear last level (leaf) ptes and free page table pages below the
1072 * level we wish to keep intact.
1073 */
1074static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075 unsigned long start_pfn,
1076 unsigned long last_pfn,
1077 int retain_level)
1078{
1079 dma_pte_clear_range(domain, start_pfn, last_pfn);
1080
1081 /* We don't need lock here; nobody else touches the iova range */
1082 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1083 domain->pgd, 0, start_pfn, last_pfn);
1084
1085 /* free pgd */
1086 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087 free_pgtable_page(domain->pgd);
1088 domain->pgd = NULL;
1089 }
1090}
1091
1092/* When a page at a given level is being unlinked from its parent, we don't
1093 need to *modify* it at all. All we need to do is make a list of all the
1094 pages which can be freed just as soon as we've flushed the IOTLB and we
1095 know the hardware page-walk will no longer touch them.
1096 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1097 be freed. */
1098static void dma_pte_list_pagetables(struct dmar_domain *domain,
1099 int level, struct dma_pte *pte,
1100 struct list_head *freelist)
1101{
1102 struct page *pg;
1103
1104 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1105 list_add_tail(&pg->lru, freelist);
1106
1107 if (level == 1)
1108 return;
1109
1110 pte = page_address(pg);
1111 do {
1112 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1113 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114 pte++;
1115 } while (!first_pte_in_page(pte));
1116}
1117
1118static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1119 struct dma_pte *pte, unsigned long pfn,
1120 unsigned long start_pfn, unsigned long last_pfn,
1121 struct list_head *freelist)
1122{
1123 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1124
1125 pfn = max(start_pfn, pfn);
1126 pte = &pte[pfn_level_offset(pfn, level)];
1127
1128 do {
1129 unsigned long level_pfn = pfn & level_mask(level);
1130
1131 if (!dma_pte_present(pte))
1132 goto next;
1133
1134 /* If range covers entire pagetable, free it */
1135 if (start_pfn <= level_pfn &&
1136 last_pfn >= level_pfn + level_size(level) - 1) {
1137 /* These subordinate page tables are going away entirely. Don't
1138 bother to clear them; we're just going to *free* them. */
1139 if (level > 1 && !dma_pte_superpage(pte))
1140 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1141
1142 dma_clear_pte(pte);
1143 if (!first_pte)
1144 first_pte = pte;
1145 last_pte = pte;
1146 } else if (level > 1) {
1147 /* Recurse down into a level that isn't *entirely* obsolete */
1148 dma_pte_clear_level(domain, level - 1,
1149 phys_to_virt(dma_pte_addr(pte)),
1150 level_pfn, start_pfn, last_pfn,
1151 freelist);
1152 }
1153next:
1154 pfn = level_pfn + level_size(level);
1155 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1156
1157 if (first_pte)
1158 domain_flush_cache(domain, first_pte,
1159 (void *)++last_pte - (void *)first_pte);
1160}
1161
1162/* We can't just free the pages because the IOMMU may still be walking
1163 the page tables, and may have cached the intermediate levels. The
1164 pages can only be freed after the IOTLB flush has been done. */
1165static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1166 unsigned long last_pfn, struct list_head *freelist)
1167{
1168 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1169 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1170 BUG_ON(start_pfn > last_pfn);
1171
1172 /* we don't need lock here; nobody else touches the iova range */
1173 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1174 domain->pgd, 0, start_pfn, last_pfn, freelist);
1175
1176 /* free pgd */
1177 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178 struct page *pgd_page = virt_to_page(domain->pgd);
1179 list_add_tail(&pgd_page->lru, freelist);
1180 domain->pgd = NULL;
1181 }
1182}
1183
1184/* iommu handling */
1185static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186{
1187 struct root_entry *root;
1188
1189 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190 if (!root) {
1191 pr_err("Allocating root entry for %s failed\n",
1192 iommu->name);
1193 return -ENOMEM;
1194 }
1195
1196 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1197 iommu->root_entry = root;
1198
1199 return 0;
1200}
1201
1202static void iommu_set_root_entry(struct intel_iommu *iommu)
1203{
1204 u64 addr;
1205 u32 sts;
1206 unsigned long flag;
1207
1208 addr = virt_to_phys(iommu->root_entry);
1209 if (sm_supported(iommu))
1210 addr |= DMA_RTADDR_SMT;
1211
1212 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1214
1215 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1216
1217 /* Make sure hardware complete it */
1218 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1219 readl, (sts & DMA_GSTS_RTPS), sts);
1220
1221 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222
1223 /*
1224 * Hardware invalidates all DMA remapping hardware translation
1225 * caches as part of SRTP flow.
1226 */
1227 if (cap_esrtps(iommu->cap))
1228 return;
1229
1230 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1231 if (sm_supported(iommu))
1232 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1233 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1234}
1235
1236void iommu_flush_write_buffer(struct intel_iommu *iommu)
1237{
1238 u32 val;
1239 unsigned long flag;
1240
1241 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1242 return;
1243
1244 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1245 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1246
1247 /* Make sure hardware complete it */
1248 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1249 readl, (!(val & DMA_GSTS_WBFS)), val);
1250
1251 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1252}
1253
1254/* return value determines if we need a write buffer flush */
1255static void __iommu_flush_context(struct intel_iommu *iommu,
1256 u16 did, u16 source_id, u8 function_mask,
1257 u64 type)
1258{
1259 u64 val = 0;
1260 unsigned long flag;
1261
1262 switch (type) {
1263 case DMA_CCMD_GLOBAL_INVL:
1264 val = DMA_CCMD_GLOBAL_INVL;
1265 break;
1266 case DMA_CCMD_DOMAIN_INVL:
1267 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1268 break;
1269 case DMA_CCMD_DEVICE_INVL:
1270 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1271 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1272 break;
1273 default:
1274 BUG();
1275 }
1276 val |= DMA_CCMD_ICC;
1277
1278 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1279 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1280
1281 /* Make sure hardware complete it */
1282 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1283 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1284
1285 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1286}
1287
1288/* return value determines if we need a write buffer flush */
1289static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1290 u64 addr, unsigned int size_order, u64 type)
1291{
1292 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1293 u64 val = 0, val_iva = 0;
1294 unsigned long flag;
1295
1296 switch (type) {
1297 case DMA_TLB_GLOBAL_FLUSH:
1298 /* global flush doesn't need to set IVA_REG */
1299 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1300 break;
1301 case DMA_TLB_DSI_FLUSH:
1302 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303 break;
1304 case DMA_TLB_PSI_FLUSH:
1305 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306 /* IH bit is passed in as part of address */
1307 val_iva = size_order | addr;
1308 break;
1309 default:
1310 BUG();
1311 }
1312 /* Note: set drain read/write */
1313#if 0
1314 /*
1315 * This is probably here to be extra safe. It looks like we can
1316 * ignore it without any impact.
1317 */
1318 if (cap_read_drain(iommu->cap))
1319 val |= DMA_TLB_READ_DRAIN;
1320#endif
1321 if (cap_write_drain(iommu->cap))
1322 val |= DMA_TLB_WRITE_DRAIN;
1323
1324 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 /* Note: Only uses first TLB reg currently */
1326 if (val_iva)
1327 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330 /* Make sure hardware complete it */
1331 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336 /* check IOTLB invalidation granularity */
1337 if (DMA_TLB_IAIG(val) == 0)
1338 pr_err("Flush IOTLB failed\n");
1339 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341 (unsigned long long)DMA_TLB_IIRG(type),
1342 (unsigned long long)DMA_TLB_IAIG(val));
1343}
1344
1345static struct device_domain_info *
1346domain_lookup_dev_info(struct dmar_domain *domain,
1347 struct intel_iommu *iommu, u8 bus, u8 devfn)
1348{
1349 struct device_domain_info *info;
1350 unsigned long flags;
1351
1352 spin_lock_irqsave(&domain->lock, flags);
1353 list_for_each_entry(info, &domain->devices, link) {
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
1356 spin_unlock_irqrestore(&domain->lock, flags);
1357 return info;
1358 }
1359 }
1360 spin_unlock_irqrestore(&domain->lock, flags);
1361
1362 return NULL;
1363}
1364
1365static void domain_update_iotlb(struct dmar_domain *domain)
1366{
1367 struct device_domain_info *info;
1368 bool has_iotlb_device = false;
1369 unsigned long flags;
1370
1371 spin_lock_irqsave(&domain->lock, flags);
1372 list_for_each_entry(info, &domain->devices, link) {
1373 if (info->ats_enabled) {
1374 has_iotlb_device = true;
1375 break;
1376 }
1377 }
1378 domain->has_iotlb_device = has_iotlb_device;
1379 spin_unlock_irqrestore(&domain->lock, flags);
1380}
1381
1382/*
1383 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1384 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1385 * check because it applies only to the built-in QAT devices and it doesn't
1386 * grant additional privileges.
1387 */
1388#define BUGGY_QAT_DEVID_MASK 0x4940
1389static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1390{
1391 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1392 return false;
1393
1394 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1395 return false;
1396
1397 return true;
1398}
1399
1400static void iommu_enable_pci_caps(struct device_domain_info *info)
1401{
1402 struct pci_dev *pdev;
1403
1404 if (!dev_is_pci(info->dev))
1405 return;
1406
1407 pdev = to_pci_dev(info->dev);
1408 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1409 * the PFSID to the invalidation descriptor of a VF so that IOMMU HW can
1410 * gauge queue depth at the PF level. If DIT is not set, PFSID is treated
1411 * as reserved and should be set to 0.
1412 */
1413 if (!ecap_dit(info->iommu->ecap))
1414 info->pfsid = 0;
1415 else {
1416 struct pci_dev *pf_pdev;
1417
1418 /* pdev will be returned if device is not a vf */
1419 pf_pdev = pci_physfn(pdev);
1420 info->pfsid = pci_dev_id(pf_pdev);
1421 }
1422
1423 /* The PCIe spec, in its wisdom, declares that the behaviour of the
1424 device is undefined if you enable PASID support after ATS support.
1425 So always enable PASID support on devices which
1426 have it, even if we can't yet know if we're ever going to
1427 use it. */
1428 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1429 info->pasid_enabled = 1;
1430
1431 if (info->pri_supported &&
1432 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1433 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1434 info->pri_enabled = 1;
1435
1436 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1437 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1438 info->ats_enabled = 1;
1439 domain_update_iotlb(info->domain);
1440 info->ats_qdep = pci_ats_queue_depth(pdev);
1441 }
1442}
1443
1444static void iommu_disable_pci_caps(struct device_domain_info *info)
1445{
1446 struct pci_dev *pdev;
1447
1448 if (!dev_is_pci(info->dev))
1449 return;
1450
1451 pdev = to_pci_dev(info->dev);
1452
1453 if (info->ats_enabled) {
1454 pci_disable_ats(pdev);
1455 info->ats_enabled = 0;
1456 domain_update_iotlb(info->domain);
1457 }
1458
1459 if (info->pri_enabled) {
1460 pci_disable_pri(pdev);
1461 info->pri_enabled = 0;
1462 }
1463
1464 if (info->pasid_enabled) {
1465 pci_disable_pasid(pdev);
1466 info->pasid_enabled = 0;
1467 }
1468}
1469
1470static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1471 u64 addr, unsigned int mask)
1472{
1473 u16 sid, qdep;
1474
1475 if (!info || !info->ats_enabled)
1476 return;
1477
1478 sid = info->bus << 8 | info->devfn;
1479 qdep = info->ats_qdep;
1480 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481 qdep, addr, mask);
1482 quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1483}
1484
1485static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486 u64 addr, unsigned mask)
1487{
1488 struct device_domain_info *info;
1489 unsigned long flags;
1490
1491 if (!domain->has_iotlb_device)
1492 return;
1493
1494 spin_lock_irqsave(&domain->lock, flags);
1495 list_for_each_entry(info, &domain->devices, link)
1496 __iommu_flush_dev_iotlb(info, addr, mask);
1497 spin_unlock_irqrestore(&domain->lock, flags);
1498}
1499
1500static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501 struct dmar_domain *domain,
1502 unsigned long pfn, unsigned int pages,
1503 int ih, int map)
1504{
1505 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506 unsigned int mask = ilog2(aligned_pages);
1507 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508 u16 did = domain_id_iommu(domain, iommu);
1509
1510 BUG_ON(pages == 0);
1511
1512 if (ih)
1513 ih = 1 << 6;
1514
1515 if (domain->use_first_level) {
1516 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517 } else {
1518 unsigned long bitmask = aligned_pages - 1;
1519
1520 /*
1521 * PSI masks the low order bits of the base address. If the
1522 * address isn't aligned to the mask, then compute a mask value
1523 * needed to ensure the target range is flushed.
1524 */
1525 if (unlikely(bitmask & pfn)) {
1526 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527
1528 /*
1529 * Since end_pfn <= pfn + bitmask, the only way bits
1530 * higher than bitmask can differ in pfn and end_pfn is
1531 * by carrying. This means after masking out bitmask,
1532 * high bits starting with the first set bit in
1533 * shared_bits are all equal in both pfn and end_pfn.
1534 */
1535 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537 }
1538
1539 /*
1540 * Fall back to domain selective flush if no PSI support or
1541 * the size is too big.
1542 */
1543 if (!cap_pgsel_inv(iommu->cap) ||
1544 mask > cap_max_amask_val(iommu->cap))
1545 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546 DMA_TLB_DSI_FLUSH);
1547 else
1548 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549 DMA_TLB_PSI_FLUSH);
1550 }
1551
1552 /*
1553 * In caching mode, changes of pages from non-present to present require
1554 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1555 */
1556 if (!cap_caching_mode(iommu->cap) || !map)
1557 iommu_flush_dev_iotlb(domain, addr, mask);
1558}
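/*
 * Worked example of the mask adjustment above: pfn == 0x3 and pages == 2
 * give aligned_pages == 2 and bitmask == 1. The base is not 2-page
 * aligned, so end_pfn == 0x4, pfn ^ end_pfn == 0x7, shared_bits ==
 * ~0x7 & ~0x1, and __ffs() yields mask == 3: the PSI then covers the
 * naturally aligned 8-page region pfn 0x0-0x7, the smallest such region
 * containing pfns 0x3-0x4.
 */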
1559
1560/* Notification for newly created mappings */
1561static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562 struct dmar_domain *domain,
1563 unsigned long pfn, unsigned int pages)
1564{
1565 /*
1566 * It's a non-present to present mapping. Only flush if caching mode
1567 * and second level.
1568 */
1569 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1570 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571 else
1572 iommu_flush_write_buffer(iommu);
1573}
1574
1575static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576{
1577 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578 struct iommu_domain_info *info;
1579 unsigned long idx;
1580
1581 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582 struct intel_iommu *iommu = info->iommu;
1583 u16 did = domain_id_iommu(dmar_domain, iommu);
1584
1585 if (dmar_domain->use_first_level)
1586 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587 else
1588 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589 DMA_TLB_DSI_FLUSH);
1590
1591 if (!cap_caching_mode(iommu->cap))
1592 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1593 }
1594}
1595
1596static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597{
1598 u32 pmen;
1599 unsigned long flags;
1600
1601 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602 return;
1603
1604 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606 pmen &= ~DMA_PMEN_EPM;
1607 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608
1609 /* wait for the protected region status bit to clear */
1610 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611 readl, !(pmen & DMA_PMEN_PRS), pmen);
1612
1613 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614}
1615
1616static void iommu_enable_translation(struct intel_iommu *iommu)
1617{
1618 u32 sts;
1619 unsigned long flags;
1620
1621 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622 iommu->gcmd |= DMA_GCMD_TE;
1623 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624
1625 /* Make sure hardware complete it */
1626 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627 readl, (sts & DMA_GSTS_TES), sts);
1628
1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630}
1631
1632static void iommu_disable_translation(struct intel_iommu *iommu)
1633{
1634 u32 sts;
1635 unsigned long flag;
1636
1637 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639 return;
1640
1641 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642 iommu->gcmd &= ~DMA_GCMD_TE;
1643 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644
1645 /* Make sure hardware complete it */
1646 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647 readl, (!(sts & DMA_GSTS_TES)), sts);
1648
1649 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650}
1651
1652static int iommu_init_domains(struct intel_iommu *iommu)
1653{
1654 u32 ndomains;
1655
1656 ndomains = cap_ndoms(iommu->cap);
1657 pr_debug("%s: Number of Domains supported <%d>\n",
1658 iommu->name, ndomains);
1659
1660 spin_lock_init(&iommu->lock);
1661
1662 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663 if (!iommu->domain_ids)
1664 return -ENOMEM;
1665
1666 /*
1667 * If Caching mode is set, then invalid translations are tagged
1668 * with domain-id 0, hence we need to pre-allocate it. We also
1669 * use domain-id 0 as a marker for non-allocated domain-id, so
1670 * make sure it is not used for a real domain.
1671 */
1672 set_bit(0, iommu->domain_ids);
1673
1674 /*
1675 * VT-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1676 * entry for first-level or pass-through translation modes be
1677 * programmed with a domain id different from those used for
1678 * second-level or nested translation. We reserve a domain id for
1679 * this purpose.
1680 */
1681 if (sm_supported(iommu))
1682 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683
1684 return 0;
1685}
1686
1687static void disable_dmar_iommu(struct intel_iommu *iommu)
1688{
1689 if (!iommu->domain_ids)
1690 return;
1691
1692 /*
1693 * All iommu domains must have been detached from the devices,
1694 * hence there should be no domain IDs in use.
1695 */
1696 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697 > NUM_RESERVED_DID))
1698 return;
1699
1700 if (iommu->gcmd & DMA_GCMD_TE)
1701 iommu_disable_translation(iommu);
1702}
1703
1704static void free_dmar_iommu(struct intel_iommu *iommu)
1705{
1706 if (iommu->domain_ids) {
1707 bitmap_free(iommu->domain_ids);
1708 iommu->domain_ids = NULL;
1709 }
1710
1711 if (iommu->copied_tables) {
1712 bitmap_free(iommu->copied_tables);
1713 iommu->copied_tables = NULL;
1714 }
1715
1716 /* free context mapping */
1717 free_context_table(iommu);
1718
1719#ifdef CONFIG_INTEL_IOMMU_SVM
1720 if (pasid_supported(iommu)) {
1721 if (ecap_prs(iommu->ecap))
1722 intel_svm_finish_prq(iommu);
1723 }
1724 if (vccap_pasid(iommu->vccap))
1725 ioasid_unregister_allocator(&iommu->pasid_allocator);
1726
1727#endif
1728}
1729
1730/*
1731 * Check and return whether first level is used by default for
1732 * DMA translation.
1733 */
1734static bool first_level_by_default(unsigned int type)
1735{
1736 /* Only SL is available in legacy mode */
1737 if (!scalable_mode_support())
1738 return false;
1739
1740 /* Only one level (either FL or SL) is available, just use it */
1741 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742 return intel_cap_flts_sanity();
1743
1744 /* Both levels are available, decide it based on domain type */
1745 return type != IOMMU_DOMAIN_UNMANAGED;
1746}
1747
1748static struct dmar_domain *alloc_domain(unsigned int type)
1749{
1750 struct dmar_domain *domain;
1751
1752 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753 if (!domain)
1754 return NULL;
1755
1756 domain->nid = NUMA_NO_NODE;
1757 if (first_level_by_default(type))
1758 domain->use_first_level = true;
1759 domain->has_iotlb_device = false;
1760 INIT_LIST_HEAD(&domain->devices);
1761 spin_lock_init(&domain->lock);
1762 xa_init(&domain->iommu_array);
1763
1764 return domain;
1765}
1766
1767static int domain_attach_iommu(struct dmar_domain *domain,
1768 struct intel_iommu *iommu)
1769{
1770 struct iommu_domain_info *info, *curr;
1771 unsigned long ndomains;
1772 int num, ret = -ENOSPC;
1773
1774 info = kzalloc(sizeof(*info), GFP_KERNEL);
1775 if (!info)
1776 return -ENOMEM;
1777
1778 spin_lock(&iommu->lock);
1779 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780 if (curr) {
1781 curr->refcnt++;
1782 spin_unlock(&iommu->lock);
1783 kfree(info);
1784 return 0;
1785 }
1786
1787 ndomains = cap_ndoms(iommu->cap);
1788 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789 if (num >= ndomains) {
1790 pr_err("%s: No free domain ids\n", iommu->name);
1791 goto err_unlock;
1792 }
1793
1794 set_bit(num, iommu->domain_ids);
1795 info->refcnt = 1;
1796 info->did = num;
1797 info->iommu = iommu;
1798 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799 NULL, info, GFP_ATOMIC);
1800 if (curr) {
1801 ret = xa_err(curr) ? : -EBUSY;
1802 goto err_clear;
1803 }
1804 domain_update_iommu_cap(domain);
1805
1806 spin_unlock(&iommu->lock);
1807 return 0;
1808
1809err_clear:
1810 clear_bit(info->did, iommu->domain_ids);
1811err_unlock:
1812 spin_unlock(&iommu->lock);
1813 kfree(info);
1814 return ret;
1815}
1816
1817static void domain_detach_iommu(struct dmar_domain *domain,
1818 struct intel_iommu *iommu)
1819{
1820 struct iommu_domain_info *info;
1821
1822 spin_lock(&iommu->lock);
1823 info = xa_load(&domain->iommu_array, iommu->seq_id);
1824 if (--info->refcnt == 0) {
1825 clear_bit(info->did, iommu->domain_ids);
1826 xa_erase(&domain->iommu_array, iommu->seq_id);
1827 domain->nid = NUMA_NO_NODE;
1828 domain_update_iommu_cap(domain);
1829 kfree(info);
1830 }
1831 spin_unlock(&iommu->lock);
1832}
1833
1834static inline int guestwidth_to_adjustwidth(int gaw)
1835{
1836 int agaw;
1837 int r = (gaw - 12) % 9;
1838
1839 if (r == 0)
1840 agaw = gaw;
1841 else
1842 agaw = gaw + 9 - r;
1843 if (agaw > 64)
1844 agaw = 64;
1845 return agaw;
1846}
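/*
 * E.g. a guest width of 48 already sits on a 9-bit level boundary and is
 * returned unchanged, while 50 is rounded up to the next supported
 * width, 57; anything that would round past 64 is clamped to 64.
 */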
1847
1848static void domain_exit(struct dmar_domain *domain)
1849{
1850 if (domain->pgd) {
1851 LIST_HEAD(freelist);
1852
1853 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854 put_pages_list(&freelist);
1855 }
1856
1857 if (WARN_ON(!list_empty(&domain->devices)))
1858 return;
1859
1860 kfree(domain);
1861}
1862
1863/*
1864 * Get the PASID directory size for a scalable mode context entry.
1865 * A value of X in the PDTS field of a scalable mode context entry
1866 * indicates a PASID directory with 2^(X + 7) entries.
1867 */
1868static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869{
1870 unsigned long pds, max_pde;
1871
1872 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874 if (pds < 7)
1875 return 0;
1876
1877 return pds - 7;
1878}
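/*
 * For example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASIDs per
 * directory entry): a table covering the full 20-bit PASID space has
 * max_pasid == 1 << 20, so max_pde == 1 << 14 and pds == 14 - 7 == 7,
 * encoding a PASID directory with 2^(7 + 7) == 16384 entries.
 */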
1879
1880/*
1881 * Set the RID_PASID field of a scalable mode context entry. The
1882 * IOMMU hardware will use the PASID value set in this field for
1883 * DMA translations of DMA requests without PASID.
1884 */
1885static inline void
1886context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887{
1888 context->hi |= pasid & ((1 << 20) - 1);
1889}
1890
1891/*
1892 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893 * entry.
1894 */
1895static inline void context_set_sm_dte(struct context_entry *context)
1896{
1897 context->lo |= (1 << 2);
1898}
1899
1900/*
1901 * Set the PRE(Page Request Enable) field of a scalable mode context
1902 * entry.
1903 */
1904static inline void context_set_sm_pre(struct context_entry *context)
1905{
1906 context->lo |= (1 << 4);
1907}
1908
1909/* Convert value to context PASID directory size field coding. */
1910#define context_pdts(pds) (((pds) & 0x7) << 9)
1911
1912static int domain_context_mapping_one(struct dmar_domain *domain,
1913 struct intel_iommu *iommu,
1914 struct pasid_table *table,
1915 u8 bus, u8 devfn)
1916{
1917 struct device_domain_info *info =
1918 domain_lookup_dev_info(domain, iommu, bus, devfn);
1919 u16 did = domain_id_iommu(domain, iommu);
1920 int translation = CONTEXT_TT_MULTI_LEVEL;
1921 struct context_entry *context;
1922 int ret;
1923
1924 WARN_ON(did == 0);
1925
1926 if (hw_pass_through && domain_type_is_si(domain))
1927 translation = CONTEXT_TT_PASS_THROUGH;
1928
1929 pr_debug("Set context mapping for %02x:%02x.%d\n",
1930 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931
1932 BUG_ON(!domain->pgd);
1933
1934 spin_lock(&iommu->lock);
1935 ret = -ENOMEM;
1936 context = iommu_context_addr(iommu, bus, devfn, 1);
1937 if (!context)
1938 goto out_unlock;
1939
1940 ret = 0;
1941 if (context_present(context) && !context_copied(iommu, bus, devfn))
1942 goto out_unlock;
1943
1944 /*
1945 * For kdump cases, old valid entries may be cached due to the
1946 * in-flight DMA and copied pgtable, but there is no unmapping
1947 * behaviour for them, thus we need an explicit cache flush for
1948 * the newly-mapped device. For kdump, at this point, the device
1949 * is supposed to have finished reset at its driver probe stage, so no
1950 * in-flight DMA will exist, and we don't need to worry about it
1951 * hereafter.
1952 */
1953 if (context_copied(iommu, bus, devfn)) {
1954 u16 did_old = context_domain_id(context);
1955
1956 if (did_old < cap_ndoms(iommu->cap)) {
1957 iommu->flush.flush_context(iommu, did_old,
1958 (((u16)bus) << 8) | devfn,
1959 DMA_CCMD_MASK_NOBIT,
1960 DMA_CCMD_DEVICE_INVL);
1961 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962 DMA_TLB_DSI_FLUSH);
1963 }
1964
1965 clear_context_copied(iommu, bus, devfn);
1966 }
1967
1968 context_clear_entry(context);
1969
1970 if (sm_supported(iommu)) {
1971 unsigned long pds;
1972
1973 WARN_ON(!table);
1974
1975 /* Setup the PASID DIR pointer: */
1976 pds = context_get_sm_pds(table);
1977 context->lo = (u64)virt_to_phys(table->table) |
1978 context_pdts(pds);
1979
1980 /* Setup the RID_PASID field: */
1981 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982
1983 /*
1984 * Setup the Device-TLB enable bit and Page request
1985 * Enable bit:
1986 */
1987 if (info && info->ats_supported)
1988 context_set_sm_dte(context);
1989 if (info && info->pri_supported)
1990 context_set_sm_pre(context);
1991 if (info && info->pasid_supported)
1992 context_set_pasid(context);
1993 } else {
1994 struct dma_pte *pgd = domain->pgd;
1995 int agaw;
1996
1997 context_set_domain_id(context, did);
1998
1999 if (translation != CONTEXT_TT_PASS_THROUGH) {
2000 /*
2001			 * Skip top levels of page tables for an IOMMU which has
2002			 * a smaller agaw than the default. Unnecessary for PT mode.
2003 */
2004 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005 ret = -ENOMEM;
2006 pgd = phys_to_virt(dma_pte_addr(pgd));
2007 if (!dma_pte_present(pgd))
2008 goto out_unlock;
2009 }
2010
2011 if (info && info->ats_supported)
2012 translation = CONTEXT_TT_DEV_IOTLB;
2013 else
2014 translation = CONTEXT_TT_MULTI_LEVEL;
2015
2016 context_set_address_root(context, virt_to_phys(pgd));
2017 context_set_address_width(context, agaw);
2018 } else {
2019 /*
2020 * In pass through mode, AW must be programmed to
2021 * indicate the largest AGAW value supported by
2022 * hardware. And ASR is ignored by hardware.
2023 */
2024 context_set_address_width(context, iommu->msagaw);
2025 }
2026
2027 context_set_translation_type(context, translation);
2028 }
2029
2030 context_set_fault_enable(context);
2031 context_set_present(context);
2032 if (!ecap_coherent(iommu->ecap))
2033 clflush_cache_range(context, sizeof(*context));
2034
2035 /*
2036 * It's a non-present to present mapping. If hardware doesn't cache
2037	 * non-present entries we only need to flush the write-buffer. If it
2038	 * _does_ cache non-present entries, then it does so in the special
2039 * domain #0, which we have to flush:
2040 */
2041 if (cap_caching_mode(iommu->cap)) {
2042 iommu->flush.flush_context(iommu, 0,
2043 (((u16)bus) << 8) | devfn,
2044 DMA_CCMD_MASK_NOBIT,
2045 DMA_CCMD_DEVICE_INVL);
2046 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047 } else {
2048 iommu_flush_write_buffer(iommu);
2049 }
2050
2051 ret = 0;
2052
2053out_unlock:
2054 spin_unlock(&iommu->lock);
2055
2056 return ret;
2057}
2058
2059struct domain_context_mapping_data {
2060 struct dmar_domain *domain;
2061 struct intel_iommu *iommu;
2062 struct pasid_table *table;
2063};
2064
2065static int domain_context_mapping_cb(struct pci_dev *pdev,
2066 u16 alias, void *opaque)
2067{
2068 struct domain_context_mapping_data *data = opaque;
2069
2070 return domain_context_mapping_one(data->domain, data->iommu,
2071 data->table, PCI_BUS_NUM(alias),
2072 alias & 0xff);
2073}
2074
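/* Set up context entries for @dev and all of its DMA aliases. */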
2075static int
2076domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2077{
2078 struct domain_context_mapping_data data;
2079 struct pasid_table *table;
2080 struct intel_iommu *iommu;
2081 u8 bus, devfn;
2082
2083 iommu = device_to_iommu(dev, &bus, &devfn);
2084 if (!iommu)
2085 return -ENODEV;
2086
2087 table = intel_pasid_get_table(dev);
2088
2089 if (!dev_is_pci(dev))
2090 return domain_context_mapping_one(domain, iommu, table,
2091 bus, devfn);
2092
2093 data.domain = domain;
2094 data.iommu = iommu;
2095 data.table = table;
2096
2097 return pci_for_each_dma_alias(to_pci_dev(dev),
2098 &domain_context_mapping_cb, &data);
2099}
2100
2101/* Return the number of VT-d pages, but aligned to the MM page size */
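/*
 * A worked example (a sketch assuming 4KiB MM pages and 4KiB VT-d
 * pages): host_addr = 0x1234 and size = 0x100 leave an in-page offset
 * of 0x234; PAGE_ALIGN(0x334) is 0x1000, so one VT-d page is returned.
 */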
2102static inline unsigned long aligned_nrpages(unsigned long host_addr,
2103 size_t size)
2104{
2105 host_addr &= ~PAGE_MASK;
2106 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2107}
2108
2109/* Return largest possible superpage level for a given mapping */
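/*
 * Level 1 means ordinary 4KiB pages, level 2 means 2MiB superpages and
 * level 3 means 1GiB superpages, bounded by the number of superpage
 * levels reported in domain->iommu_superpage.
 */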
2110static inline int hardware_largepage_caps(struct dmar_domain *domain,
2111 unsigned long iov_pfn,
2112 unsigned long phy_pfn,
2113 unsigned long pages)
2114{
2115 int support, level = 1;
2116 unsigned long pfnmerge;
2117
2118 support = domain->iommu_superpage;
2119
2120 /* To use a large page, the virtual *and* physical addresses
2121 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2122 of them will mean we have to use smaller pages. So just
2123 merge them and check both at once. */
2124 pfnmerge = iov_pfn | phy_pfn;
2125
2126 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2127 pages >>= VTD_STRIDE_SHIFT;
2128 if (!pages)
2129 break;
2130 pfnmerge >>= VTD_STRIDE_SHIFT;
2131 level++;
2132 support--;
2133 }
2134 return level;
2135}
2136
2137/*
2138 * Ensure that old small page tables are removed to make room for superpage(s).
2139 * We're going to add new large pages, so make sure we don't remove their parent
2140 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2141 */
2142static void switch_to_super_page(struct dmar_domain *domain,
2143 unsigned long start_pfn,
2144 unsigned long end_pfn, int level)
2145{
2146 unsigned long lvl_pages = lvl_to_nr_pages(level);
2147 struct iommu_domain_info *info;
2148 struct dma_pte *pte = NULL;
2149 unsigned long i;
2150
2151 while (start_pfn <= end_pfn) {
2152 if (!pte)
2153 pte = pfn_to_dma_pte(domain, start_pfn, &level);
2154
2155 if (dma_pte_present(pte)) {
2156 dma_pte_free_pagetable(domain, start_pfn,
2157 start_pfn + lvl_pages - 1,
2158 level + 1);
2159
2160 xa_for_each(&domain->iommu_array, i, info)
2161 iommu_flush_iotlb_psi(info->iommu, domain,
2162 start_pfn, lvl_pages,
2163 0, 0);
2164 }
2165
2166 pte++;
2167 start_pfn += lvl_pages;
2168 if (first_pte_in_page(pte))
2169 pte = NULL;
2170 }
2171}
2172
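/*
 * Map @nr_pages pages starting at @iov_pfn to @phys_pfn in @domain,
 * using the largest superpage size that the hardware and the alignment
 * of both address ranges allow, and flushing the written PTEs from the
 * CPU cache as required.
 */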
2173static int
2174__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2175 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2176{
2177 struct dma_pte *first_pte = NULL, *pte = NULL;
2178 unsigned int largepage_lvl = 0;
2179 unsigned long lvl_pages = 0;
2180 phys_addr_t pteval;
2181 u64 attr;
2182
2183 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2184
2185 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2186 return -EINVAL;
2187
2188 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2189 attr |= DMA_FL_PTE_PRESENT;
2190 if (domain->use_first_level) {
2191 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2192 if (prot & DMA_PTE_WRITE)
2193 attr |= DMA_FL_PTE_DIRTY;
2194 }
2195
2196 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2197
2198 while (nr_pages > 0) {
2199 uint64_t tmp;
2200
2201 if (!pte) {
2202 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2203 phys_pfn, nr_pages);
2204
2205 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2206 if (!pte)
2207 return -ENOMEM;
2208 first_pte = pte;
2209
2210 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2211
2212			/* It is a large page */
2213 if (largepage_lvl > 1) {
2214 unsigned long end_pfn;
2215 unsigned long pages_to_remove;
2216
2217 pteval |= DMA_PTE_LARGE_PAGE;
2218 pages_to_remove = min_t(unsigned long, nr_pages,
2219 nr_pte_to_next_page(pte) * lvl_pages);
2220 end_pfn = iov_pfn + pages_to_remove - 1;
2221 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2222 } else {
2223 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2224 }
2225
2226 }
2227		/* We don't need a lock here; nobody else
2228		 * touches the iova range.
2229 */
2230 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2231 if (tmp) {
2232 static int dumps = 5;
2233 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2234 iov_pfn, tmp, (unsigned long long)pteval);
2235 if (dumps) {
2236 dumps--;
2237 debug_dma_dump_mappings(NULL);
2238 }
2239 WARN_ON(1);
2240 }
2241
2242 nr_pages -= lvl_pages;
2243 iov_pfn += lvl_pages;
2244 phys_pfn += lvl_pages;
2245 pteval += lvl_pages * VTD_PAGE_SIZE;
2246
2247 /* If the next PTE would be the first in a new page, then we
2248 * need to flush the cache on the entries we've just written.
2249 * And then we'll need to recalculate 'pte', so clear it and
2250 * let it get set again in the if (!pte) block above.
2251 *
2252 * If we're done (!nr_pages) we need to flush the cache too.
2253 *
2254 * Also if we've been setting superpages, we may need to
2255 * recalculate 'pte' and switch back to smaller pages for the
2256 * end of the mapping, if the trailing size is not enough to
2257 * use another superpage (i.e. nr_pages < lvl_pages).
2258 */
2259 pte++;
2260 if (!nr_pages || first_pte_in_page(pte) ||
2261 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2262 domain_flush_cache(domain, first_pte,
2263 (void *)pte - (void *)first_pte);
2264 pte = NULL;
2265 }
2266 }
2267
2268 return 0;
2269}
2270
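/*
 * Clear the context entry for (bus, devfn) and invalidate the context,
 * PASID (in scalable mode), IOTLB and device-TLB caches that may still
 * reference it.
 */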
2271static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2272{
2273 struct intel_iommu *iommu = info->iommu;
2274 struct context_entry *context;
2275 u16 did_old;
2276
2277 if (!iommu)
2278 return;
2279
2280 spin_lock(&iommu->lock);
2281 context = iommu_context_addr(iommu, bus, devfn, 0);
2282 if (!context) {
2283 spin_unlock(&iommu->lock);
2284 return;
2285 }
2286
2287 if (sm_supported(iommu)) {
2288 if (hw_pass_through && domain_type_is_si(info->domain))
2289 did_old = FLPT_DEFAULT_DID;
2290 else
2291 did_old = domain_id_iommu(info->domain, iommu);
2292 } else {
2293 did_old = context_domain_id(context);
2294 }
2295
2296 context_clear_entry(context);
2297 __iommu_flush_cache(iommu, context, sizeof(*context));
2298 spin_unlock(&iommu->lock);
2299 iommu->flush.flush_context(iommu,
2300 did_old,
2301 (((u16)bus) << 8) | devfn,
2302 DMA_CCMD_MASK_NOBIT,
2303 DMA_CCMD_DEVICE_INVL);
2304
2305 if (sm_supported(iommu))
2306 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2307
2308 iommu->flush.flush_iotlb(iommu,
2309 did_old,
2310 0,
2311 0,
2312 DMA_TLB_DSI_FLUSH);
2313
2314 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2315}
2316
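/*
 * Set up a first-level (scalable-mode) PASID entry so that @dev's
 * requests with @pasid are translated through @domain's page table,
 * after skipping down to the agaw supported by @iommu. Only 4- and
 * 5-level tables are valid for first-level translation.
 */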
2317static int domain_setup_first_level(struct intel_iommu *iommu,
2318 struct dmar_domain *domain,
2319 struct device *dev,
2320 u32 pasid)
2321{
2322 struct dma_pte *pgd = domain->pgd;
2323 int agaw, level;
2324 int flags = 0;
2325
2326 /*
2327	 * Skip top levels of page tables for an IOMMU which has
2328	 * a smaller agaw than the default. Unnecessary for PT mode.
2329 */
2330 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2331 pgd = phys_to_virt(dma_pte_addr(pgd));
2332 if (!dma_pte_present(pgd))
2333 return -ENOMEM;
2334 }
2335
2336 level = agaw_to_level(agaw);
2337 if (level != 4 && level != 5)
2338 return -EINVAL;
2339
2340 if (pasid != PASID_RID2PASID)
2341 flags |= PASID_FLAG_SUPERVISOR_MODE;
2342 if (level == 5)
2343 flags |= PASID_FLAG_FL5LP;
2344
2345 if (domain->force_snooping)
2346 flags |= PASID_FLAG_PAGE_SNOOP;
2347
2348 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2349 domain_id_iommu(domain, iommu),
2350 flags);
2351}
2352
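/*
 * A "real DMA subdevice" is a device whose DMA requests are actually
 * issued by a different physical PCI device, i.e. pci_real_dma_dev()
 * resolves to something other than the device itself.
 */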
2353static bool dev_is_real_dma_subdevice(struct device *dev)
2354{
2355 return dev && dev_is_pci(dev) &&
2356 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2357}
2358
2359static int iommu_domain_identity_map(struct dmar_domain *domain,
2360 unsigned long first_vpfn,
2361 unsigned long last_vpfn)
2362{
2363 /*
2364	 * The RMRR range might overlap with the physical memory range;
2365	 * clear it first.
2366 */
2367 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2368
2369 return __domain_mapping(domain, first_vpfn,
2370 first_vpfn, last_vpfn - first_vpfn + 1,
2371 DMA_PTE_READ|DMA_PTE_WRITE);
2372}
2373
2374static int md_domain_init(struct dmar_domain *domain, int guest_width);
2375
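/*
 * Build the static identity (si) domain: identity-map all online memory
 * and all RMRR ranges so that devices attached to it can use DMA
 * addresses equal to physical addresses. With hardware pass-through
 * (@hw) no page tables are needed, so the mapping steps are skipped.
 */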
2376static int __init si_domain_init(int hw)
2377{
2378 struct dmar_rmrr_unit *rmrr;
2379 struct device *dev;
2380 int i, nid, ret;
2381
2382 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2383 if (!si_domain)
2384 return -EFAULT;
2385
2386 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2387 domain_exit(si_domain);
2388 si_domain = NULL;
2389 return -EFAULT;
2390 }
2391
2392 if (hw)
2393 return 0;
2394
2395 for_each_online_node(nid) {
2396 unsigned long start_pfn, end_pfn;
2397 int i;
2398
2399 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2400 ret = iommu_domain_identity_map(si_domain,
2401 mm_to_dma_pfn(start_pfn),
2402 mm_to_dma_pfn(end_pfn));
2403 if (ret)
2404 return ret;
2405 }
2406 }
2407
2408 /*
2409	 * Identity map the RMRRs so that devices with RMRRs can also use
2410 * the si_domain.
2411 */
2412 for_each_rmrr_units(rmrr) {
2413 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2414 i, dev) {
2415 unsigned long long start = rmrr->base_address;
2416 unsigned long long end = rmrr->end_address;
2417
2418 if (WARN_ON(end < start ||
2419 end >> agaw_to_width(si_domain->agaw)))
2420 continue;
2421
2422 ret = iommu_domain_identity_map(si_domain,
2423 mm_to_dma_pfn(start >> PAGE_SHIFT),
2424 mm_to_dma_pfn(end >> PAGE_SHIFT));
2425 if (ret)
2426 return ret;
2427 }
2428 }
2429
2430 return 0;
2431}
2432
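/*
 * Attach @dev to @domain: allocate a domain id on the device's IOMMU,
 * set up the RID2PASID entry in scalable mode, program the context
 * entry and finally enable the PCI capabilities (ATS/PRI/PASID) that
 * the device supports.
 */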
2433static int dmar_domain_attach_device(struct dmar_domain *domain,
2434 struct device *dev)
2435{
2436 struct device_domain_info *info = dev_iommu_priv_get(dev);
2437 struct intel_iommu *iommu;
2438 unsigned long flags;
2439 u8 bus, devfn;
2440 int ret;
2441
2442 iommu = device_to_iommu(dev, &bus, &devfn);
2443 if (!iommu)
2444 return -ENODEV;
2445
2446 ret = domain_attach_iommu(domain, iommu);
2447 if (ret)
2448 return ret;
2449 info->domain = domain;
2450 spin_lock_irqsave(&domain->lock, flags);
2451 list_add(&info->link, &domain->devices);
2452 spin_unlock_irqrestore(&domain->lock, flags);
2453
2454 /* PASID table is mandatory for a PCI device in scalable mode. */
2455 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2456 /* Setup the PASID entry for requests without PASID: */
2457 if (hw_pass_through && domain_type_is_si(domain))
2458 ret = intel_pasid_setup_pass_through(iommu, domain,
2459 dev, PASID_RID2PASID);
2460 else if (domain->use_first_level)
2461 ret = domain_setup_first_level(iommu, domain, dev,
2462 PASID_RID2PASID);
2463 else
2464 ret = intel_pasid_setup_second_level(iommu, domain,
2465 dev, PASID_RID2PASID);
2466 if (ret) {
2467 dev_err(dev, "Setup RID2PASID failed\n");
2468 device_block_translation(dev);
2469 return ret;
2470 }
2471 }
2472
2473 ret = domain_context_mapping(domain, dev);
2474 if (ret) {
2475 dev_err(dev, "Domain context map failed\n");
2476 device_block_translation(dev);
2477 return ret;
2478 }
2479
2480 iommu_enable_pci_caps(info);
2481
2482 return 0;
2483}
2484
2485static bool device_has_rmrr(struct device *dev)
2486{
2487 struct dmar_rmrr_unit *rmrr;
2488 struct device *tmp;
2489 int i;
2490
2491 rcu_read_lock();
2492 for_each_rmrr_units(rmrr) {
2493 /*
2494 * Return TRUE if this RMRR contains the device that
2495 * is passed in.
2496 */
2497 for_each_active_dev_scope(rmrr->devices,
2498 rmrr->devices_cnt, i, tmp)
2499 if (tmp == dev ||
2500 is_downstream_to_pci_bridge(dev, tmp)) {
2501 rcu_read_unlock();
2502 return true;
2503 }
2504 }
2505 rcu_read_unlock();
2506 return false;
2507}
2508
2509/**
2510 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2511 * is relaxable (ie. is allowed to be not enforced under some conditions)
2512 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2513 *
2514 * We assume that PCI USB devices with RMRRs have them largely
2515 * for historical reasons and that the RMRR space is not actively used post
2516 * boot. This exclusion may change if vendors begin to abuse it.
2517 *
2518 * The same exception is made for graphics devices, with the requirement that
2519 * any use of the RMRR regions will be torn down before assigning the device
2520 * to a guest.
2521 *
2522 * Return: true if the RMRR is relaxable, false otherwise
2523 */
2524static bool device_rmrr_is_relaxable(struct device *dev)
2525{
2526 struct pci_dev *pdev;
2527
2528 if (!dev_is_pci(dev))
2529 return false;
2530
2531 pdev = to_pci_dev(dev);
2532 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2533 return true;
2534 else
2535 return false;
2536}
2537
2538/*
2539 * There are a couple of cases where we need to restrict the functionality of
2540 * devices associated with RMRRs. The first is when evaluating a device for
2541 * identity mapping because problems exist when devices are moved in and out
2542 * of domains and their respective RMRR information is lost. This means that
2543 * a device with associated RMRRs will never be in a "passthrough" domain.
2544 * The second is use of the device through the IOMMU API. This interface
2545 * expects to have full control of the IOVA space for the device. We cannot
2546 * satisfy both the requirement that RMRR access is maintained and have an
2547 * unencumbered IOVA space. We also have no ability to quiesce the device's
2548 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2549 * We therefore prevent devices associated with an RMRR from participating in
2550 * the IOMMU API, which eliminates them from device assignment.
2551 *
2552 * In both cases, devices which have relaxable RMRRs are not concerned by this
2553 * restriction. See device_rmrr_is_relaxable comment.
2554 */
2555static bool device_is_rmrr_locked(struct device *dev)
2556{
2557 if (!device_has_rmrr(dev))
2558 return false;
2559
2560 if (device_rmrr_is_relaxable(dev))
2561 return false;
2562
2563 return true;
2564}
2565
2566/*
2567 * Return the required default domain type for a specific device.
2568 *
2569 * @dev: the device in query
2571 *
2572 * Returns:
2573 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2574 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2575 * - 0: both identity and dynamic domains work for this device
2576 */
2577static int device_def_domain_type(struct device *dev)
2578{
2579 if (dev_is_pci(dev)) {
2580 struct pci_dev *pdev = to_pci_dev(dev);
2581
2582 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2583 return IOMMU_DOMAIN_IDENTITY;
2584
2585 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2586 return IOMMU_DOMAIN_IDENTITY;
2587 }
2588
2589 return 0;
2590}
2591
2592static void intel_iommu_init_qi(struct intel_iommu *iommu)
2593{
2594 /*
2595	 * Start from a sane IOMMU hardware state.
2596 * If the queued invalidation is already initialized by us
2597 * (for example, while enabling interrupt-remapping) then
2598 * we got the things already rolling from a sane state.
2599 */
2600 if (!iommu->qi) {
2601 /*
2602 * Clear any previous faults.
2603 */
2604 dmar_fault(-1, iommu);
2605 /*
2606 * Disable queued invalidation if supported and already enabled
2607 * before OS handover.
2608 */
2609 dmar_disable_qi(iommu);
2610 }
2611
2612 if (dmar_enable_qi(iommu)) {
2613 /*
2614		 * Queued invalidation is not enabled; use register-based invalidation.
2615 */
2616 iommu->flush.flush_context = __iommu_flush_context;
2617 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2618 pr_info("%s: Using Register based invalidation\n",
2619 iommu->name);
2620 } else {
2621 iommu->flush.flush_context = qi_flush_context;
2622 iommu->flush.flush_iotlb = qi_flush_iotlb;
2623 pr_info("%s: Using Queued invalidation\n", iommu->name);
2624 }
2625}
2626
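/*
 * Copy one bus's context table from the previous (kdump'd) kernel. In
 * scalable mode (SMT) each bus has a lower and an upper context table,
 * hence the doubled indexing. Copied entries have their domain ids
 * reserved in iommu->domain_ids and are marked as copied so that they
 * get flushed before being reprogrammed.
 */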
2627static int copy_context_table(struct intel_iommu *iommu,
2628 struct root_entry *old_re,
2629 struct context_entry **tbl,
2630 int bus, bool ext)
2631{
2632 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2633 struct context_entry *new_ce = NULL, ce;
2634 struct context_entry *old_ce = NULL;
2635 struct root_entry re;
2636 phys_addr_t old_ce_phys;
2637
2638 tbl_idx = ext ? bus * 2 : bus;
2639 memcpy(&re, old_re, sizeof(re));
2640
2641 for (devfn = 0; devfn < 256; devfn++) {
2642 /* First calculate the correct index */
2643 idx = (ext ? devfn * 2 : devfn) % 256;
2644
2645 if (idx == 0) {
2646 /* First save what we may have and clean up */
2647 if (new_ce) {
2648 tbl[tbl_idx] = new_ce;
2649 __iommu_flush_cache(iommu, new_ce,
2650 VTD_PAGE_SIZE);
2651 pos = 1;
2652 }
2653
2654 if (old_ce)
2655 memunmap(old_ce);
2656
2657 ret = 0;
2658 if (devfn < 0x80)
2659 old_ce_phys = root_entry_lctp(&re);
2660 else
2661 old_ce_phys = root_entry_uctp(&re);
2662
2663 if (!old_ce_phys) {
2664 if (ext && devfn == 0) {
2665 /* No LCTP, try UCTP */
2666 devfn = 0x7f;
2667 continue;
2668 } else {
2669 goto out;
2670 }
2671 }
2672
2673 ret = -ENOMEM;
2674 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2675 MEMREMAP_WB);
2676 if (!old_ce)
2677 goto out;
2678
2679 new_ce = alloc_pgtable_page(iommu->node);
2680 if (!new_ce)
2681 goto out_unmap;
2682
2683 ret = 0;
2684 }
2685
2686 /* Now copy the context entry */
2687 memcpy(&ce, old_ce + idx, sizeof(ce));
2688
2689 if (!context_present(&ce))
2690 continue;
2691
2692 did = context_domain_id(&ce);
2693 if (did >= 0 && did < cap_ndoms(iommu->cap))
2694 set_bit(did, iommu->domain_ids);
2695
2696 set_context_copied(iommu, bus, devfn);
2697 new_ce[idx] = ce;
2698 }
2699
2700 tbl[tbl_idx + pos] = new_ce;
2701
2702 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2703
2704out_unmap:
2705 memunmap(old_ce);
2706
2707out:
2708 return ret;
2709}
2710
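/*
 * Copy the root and context tables programmed by the previous kernel
 * (the kdump case) into freshly allocated tables and install them in
 * the new root entry table, so that in-flight DMA keeps working while
 * this kernel takes over.
 */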
2711static int copy_translation_tables(struct intel_iommu *iommu)
2712{
2713 struct context_entry **ctxt_tbls;
2714 struct root_entry *old_rt;
2715 phys_addr_t old_rt_phys;
2716 int ctxt_table_entries;
2717 u64 rtaddr_reg;
2718 int bus, ret;
2719 bool new_ext, ext;
2720
2721 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2722 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2723 new_ext = !!sm_supported(iommu);
2724
2725 /*
2726 * The RTT bit can only be changed when translation is disabled,
2727	 * but disabling translation means opening a window for data
2728 * corruption. So bail out and don't copy anything if we would
2729 * have to change the bit.
2730 */
2731 if (new_ext != ext)
2732 return -EINVAL;
2733
2734 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2735 if (!iommu->copied_tables)
2736 return -ENOMEM;
2737
2738 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2739 if (!old_rt_phys)
2740 return -EINVAL;
2741
2742 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2743 if (!old_rt)
2744 return -ENOMEM;
2745
2746 /* This is too big for the stack - allocate it from slab */
2747 ctxt_table_entries = ext ? 512 : 256;
2748 ret = -ENOMEM;
2749 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2750 if (!ctxt_tbls)
2751 goto out_unmap;
2752
2753 for (bus = 0; bus < 256; bus++) {
2754 ret = copy_context_table(iommu, &old_rt[bus],
2755 ctxt_tbls, bus, ext);
2756 if (ret) {
2757 pr_err("%s: Failed to copy context table for bus %d\n",
2758 iommu->name, bus);
2759 continue;
2760 }
2761 }
2762
2763 spin_lock(&iommu->lock);
2764
2765 /* Context tables are copied, now write them to the root_entry table */
2766 for (bus = 0; bus < 256; bus++) {
2767 int idx = ext ? bus * 2 : bus;
2768 u64 val;
2769
2770 if (ctxt_tbls[idx]) {
2771 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2772 iommu->root_entry[bus].lo = val;
2773 }
2774
2775 if (!ext || !ctxt_tbls[idx + 1])
2776 continue;
2777
2778 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2779 iommu->root_entry[bus].hi = val;
2780 }
2781
2782 spin_unlock(&iommu->lock);
2783
2784 kfree(ctxt_tbls);
2785
2786 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2787
2788 ret = 0;
2789
2790out_unmap:
2791 memunmap(old_rt);
2792
2793 return ret;
2794}
2795
2796#ifdef CONFIG_INTEL_IOMMU_SVM
2797static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2798{
2799 struct intel_iommu *iommu = data;
2800 ioasid_t ioasid;
2801
2802 if (!iommu)
2803 return INVALID_IOASID;
2804 /*
2805	 * The VT-d virtual command interface always uses the full 20-bit
2806	 * PASID range. The host can partition the guest PASID range based
2807	 * on policy, but this is out of the guest's control.
2808 */
2809 if (min < PASID_MIN || max > intel_pasid_max_id)
2810 return INVALID_IOASID;
2811
2812 if (vcmd_alloc_pasid(iommu, &ioasid))
2813 return INVALID_IOASID;
2814
2815 return ioasid;
2816}
2817
2818static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2819{
2820 struct intel_iommu *iommu = data;
2821
2822 if (!iommu)
2823 return;
2824 /*
2825	 * The sanity check of the ioasid owner is done at the upper layer,
2826	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
2827 */
2828 if (ioasid_find(NULL, ioasid, NULL)) {
2829 pr_alert("Cannot free active IOASID %d\n", ioasid);
2830 return;
2831 }
2832 vcmd_free_pasid(iommu, ioasid);
2833}
2834
2835static void register_pasid_allocator(struct intel_iommu *iommu)
2836{
2837 /*
2838	 * If we are running in the host, there is no need for a custom
2839	 * allocator because PASIDs are allocated system-wide by the host.
2840 */
2841 if (!cap_caching_mode(iommu->cap))
2842 return;
2843
2844 if (!sm_supported(iommu)) {
2845 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2846 return;
2847 }
2848
2849 /*
2850	 * Register a custom PASID allocator if we are running in a guest;
2851	 * guest PASIDs must be obtained via the virtual command interface.
2852	 * There can be multiple vIOMMUs in each guest, but only one allocator
2853	 * is active. All vIOMMU allocators eventually call the same
2854 * host allocator.
2855 */
2856 if (!vccap_pasid(iommu->vccap))
2857 return;
2858
2859 pr_info("Register custom PASID allocator\n");
2860 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2861 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2862 iommu->pasid_allocator.pdata = (void *)iommu;
2863 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2864 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2865 /*
2866 * Disable scalable mode on this IOMMU if there
2867		 * is no custom allocator. Mixing SM-capable and
2868		 * non-SM vIOMMUs is not supported.
2869 */
2870 intel_iommu_sm = 0;
2871 }
2872}
2873#endif
2874
2875static int __init init_dmars(void)
2876{
2877 struct dmar_drhd_unit *drhd;
2878 struct intel_iommu *iommu;
2879 int ret;
2880
2881 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2882 if (ret)
2883 goto free_iommu;
2884
2885 for_each_iommu(iommu, drhd) {
2886 if (drhd->ignored) {
2887 iommu_disable_translation(iommu);
2888 continue;
2889 }
2890
2891 /*
2892		 * Find the max PASID size of all IOMMUs in the system.
2893		 * We need to ensure the system PASID table is no bigger
2894		 * than the smallest supported size.
2895 */
2896 if (pasid_supported(iommu)) {
2897 u32 temp = 2 << ecap_pss(iommu->ecap);
2898
2899 intel_pasid_max_id = min_t(u32, temp,
2900 intel_pasid_max_id);
2901 }
2902
2903 intel_iommu_init_qi(iommu);
2904
2905 ret = iommu_init_domains(iommu);
2906 if (ret)
2907 goto free_iommu;
2908
2909 init_translation_status(iommu);
2910
2911 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2912 iommu_disable_translation(iommu);
2913 clear_translation_pre_enabled(iommu);
2914 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2915 iommu->name);
2916 }
2917
2918 /*
2919 * TBD:
2920		 * we could share the same root & context tables
2921		 * among all IOMMUs. Needs to be split out later.
2922 */
2923 ret = iommu_alloc_root_entry(iommu);
2924 if (ret)
2925 goto free_iommu;
2926
2927 if (translation_pre_enabled(iommu)) {
2928 pr_info("Translation already enabled - trying to copy translation structures\n");
2929
2930 ret = copy_translation_tables(iommu);
2931 if (ret) {
2932 /*
2933 * We found the IOMMU with translation
2934 * enabled - but failed to copy over the
2935 * old root-entry table. Try to proceed
2936 * by disabling translation now and
2937 * allocating a clean root-entry table.
2938 * This might cause DMAR faults, but
2939 * probably the dump will still succeed.
2940 */
2941 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2942 iommu->name);
2943 iommu_disable_translation(iommu);
2944 clear_translation_pre_enabled(iommu);
2945 } else {
2946 pr_info("Copied translation tables from previous kernel for %s\n",
2947 iommu->name);
2948 }
2949 }
2950
2951 if (!ecap_pass_through(iommu->ecap))
2952 hw_pass_through = 0;
2953 intel_svm_check(iommu);
2954 }
2955
2956 /*
2957 * Now that qi is enabled on all iommus, set the root entry and flush
2958 * caches. This is required on some Intel X58 chipsets, otherwise the
2959 * flush_context function will loop forever and the boot hangs.
2960 */
2961 for_each_active_iommu(iommu, drhd) {
2962 iommu_flush_write_buffer(iommu);
2963#ifdef CONFIG_INTEL_IOMMU_SVM
2964 register_pasid_allocator(iommu);
2965#endif
2966 iommu_set_root_entry(iommu);
2967 }
2968
2969#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2970 dmar_map_gfx = 0;
2971#endif
2972
2973 if (!dmar_map_gfx)
2974 iommu_identity_mapping |= IDENTMAP_GFX;
2975
2976 check_tylersburg_isoch();
2977
2978 ret = si_domain_init(hw_pass_through);
2979 if (ret)
2980 goto free_iommu;
2981
2982 /*
2983 * for each drhd
2984 * enable fault log
2985 * global invalidate context cache
2986 * global invalidate iotlb
2987 * enable translation
2988 */
2989 for_each_iommu(iommu, drhd) {
2990 if (drhd->ignored) {
2991 /*
2992 * we always have to disable PMRs or DMA may fail on
2993 * this device
2994 */
2995 if (force_on)
2996 iommu_disable_protect_mem_regions(iommu);
2997 continue;
2998 }
2999
3000 iommu_flush_write_buffer(iommu);
3001
3002#ifdef CONFIG_INTEL_IOMMU_SVM
3003 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3004 /*
3005			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3006			 * could cause a lock race condition, so drop the lock here.
3007 */
3008 up_write(&dmar_global_lock);
3009 ret = intel_svm_enable_prq(iommu);
3010 down_write(&dmar_global_lock);
3011 if (ret)
3012 goto free_iommu;
3013 }
3014#endif
3015 ret = dmar_set_interrupt(iommu);
3016 if (ret)
3017 goto free_iommu;
3018 }
3019
3020 return 0;
3021
3022free_iommu:
3023 for_each_active_iommu(iommu, drhd) {
3024 disable_dmar_iommu(iommu);
3025 free_dmar_iommu(iommu);
3026 }
3027 if (si_domain) {
3028 domain_exit(si_domain);
3029 si_domain = NULL;
3030 }
3031
3032 return ret;
3033}
3034
3035static void __init init_no_remapping_devices(void)
3036{
3037 struct dmar_drhd_unit *drhd;
3038 struct device *dev;
3039 int i;
3040
3041 for_each_drhd_unit(drhd) {
3042 if (!drhd->include_all) {
3043 for_each_active_dev_scope(drhd->devices,
3044 drhd->devices_cnt, i, dev)
3045 break;
3046 /* ignore DMAR unit if no devices exist */
3047 if (i == drhd->devices_cnt)
3048 drhd->ignored = 1;
3049 }
3050 }
3051
3052 for_each_active_drhd_unit(drhd) {
3053 if (drhd->include_all)
3054 continue;
3055
3056 for_each_active_dev_scope(drhd->devices,
3057 drhd->devices_cnt, i, dev)
3058 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3059 break;
3060 if (i < drhd->devices_cnt)
3061 continue;
3062
3063 /* This IOMMU has *only* gfx devices. Either bypass it or
3064		   set the gfx_dedicated flag, as appropriate. */
3065 drhd->gfx_dedicated = 1;
3066 if (!dmar_map_gfx)
3067 drhd->ignored = 1;
3068 }
3069}
3070
3071#ifdef CONFIG_SUSPEND
3072static int init_iommu_hw(void)
3073{
3074 struct dmar_drhd_unit *drhd;
3075 struct intel_iommu *iommu = NULL;
3076
3077 for_each_active_iommu(iommu, drhd)
3078 if (iommu->qi)
3079 dmar_reenable_qi(iommu);
3080
3081 for_each_iommu(iommu, drhd) {
3082 if (drhd->ignored) {
3083 /*
3084 * we always have to disable PMRs or DMA may fail on
3085 * this device
3086 */
3087 if (force_on)
3088 iommu_disable_protect_mem_regions(iommu);
3089 continue;
3090 }
3091
3092 iommu_flush_write_buffer(iommu);
3093 iommu_set_root_entry(iommu);
3094 iommu_enable_translation(iommu);
3095 iommu_disable_protect_mem_regions(iommu);
3096 }
3097
3098 return 0;
3099}
3100
3101static void iommu_flush_all(void)
3102{
3103 struct dmar_drhd_unit *drhd;
3104 struct intel_iommu *iommu;
3105
3106 for_each_active_iommu(iommu, drhd) {
3107 iommu->flush.flush_context(iommu, 0, 0, 0,
3108 DMA_CCMD_GLOBAL_INVL);
3109 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3110 DMA_TLB_GLOBAL_FLUSH);
3111 }
3112}
3113
3114static int iommu_suspend(void)
3115{
3116 struct dmar_drhd_unit *drhd;
3117 struct intel_iommu *iommu = NULL;
3118 unsigned long flag;
3119
3120 for_each_active_iommu(iommu, drhd) {
3121 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3122 GFP_KERNEL);
3123 if (!iommu->iommu_state)
3124 goto nomem;
3125 }
3126
3127 iommu_flush_all();
3128
3129 for_each_active_iommu(iommu, drhd) {
3130 iommu_disable_translation(iommu);
3131
3132 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3133
3134 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3135 readl(iommu->reg + DMAR_FECTL_REG);
3136 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3137 readl(iommu->reg + DMAR_FEDATA_REG);
3138 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3139 readl(iommu->reg + DMAR_FEADDR_REG);
3140 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3141 readl(iommu->reg + DMAR_FEUADDR_REG);
3142
3143 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3144 }
3145 return 0;
3146
3147nomem:
3148 for_each_active_iommu(iommu, drhd)
3149 kfree(iommu->iommu_state);
3150
3151 return -ENOMEM;
3152}
3153
3154static void iommu_resume(void)
3155{
3156 struct dmar_drhd_unit *drhd;
3157 struct intel_iommu *iommu = NULL;
3158 unsigned long flag;
3159
3160 if (init_iommu_hw()) {
3161 if (force_on)
3162 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3163 else
3164 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3165 return;
3166 }
3167
3168 for_each_active_iommu(iommu, drhd) {
3169
3170 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3171
3172 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3173 iommu->reg + DMAR_FECTL_REG);
3174 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3175 iommu->reg + DMAR_FEDATA_REG);
3176 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3177 iommu->reg + DMAR_FEADDR_REG);
3178 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3179 iommu->reg + DMAR_FEUADDR_REG);
3180
3181 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3182 }
3183
3184 for_each_active_iommu(iommu, drhd)
3185 kfree(iommu->iommu_state);
3186}
3187
3188static struct syscore_ops iommu_syscore_ops = {
3189 .resume = iommu_resume,
3190 .suspend = iommu_suspend,
3191};
3192
3193static void __init init_iommu_pm_ops(void)
3194{
3195 register_syscore_ops(&iommu_syscore_ops);
3196}
3197
3198#else
3199static inline void init_iommu_pm_ops(void) {}
3200#endif /* CONFIG_SUSPEND */
3201
3202static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3203{
3204 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3205 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3206 rmrr->end_address <= rmrr->base_address ||
3207 arch_rmrr_sanity_check(rmrr))
3208 return -EINVAL;
3209
3210 return 0;
3211}
3212
3213int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3214{
3215 struct acpi_dmar_reserved_memory *rmrr;
3216 struct dmar_rmrr_unit *rmrru;
3217
3218 rmrr = (struct acpi_dmar_reserved_memory *)header;
3219 if (rmrr_sanity_check(rmrr)) {
3220 pr_warn(FW_BUG
3221 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3222 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3223 rmrr->base_address, rmrr->end_address,
3224 dmi_get_system_info(DMI_BIOS_VENDOR),
3225 dmi_get_system_info(DMI_BIOS_VERSION),
3226 dmi_get_system_info(DMI_PRODUCT_VERSION));
3227 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3228 }
3229
3230 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3231 if (!rmrru)
3232 goto out;
3233
3234 rmrru->hdr = header;
3235
3236 rmrru->base_address = rmrr->base_address;
3237 rmrru->end_address = rmrr->end_address;
3238
3239 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3240 ((void *)rmrr) + rmrr->header.length,
3241 &rmrru->devices_cnt);
3242 if (rmrru->devices_cnt && rmrru->devices == NULL)
3243 goto free_rmrru;
3244
3245 list_add(&rmrru->list, &dmar_rmrr_units);
3246
3247 return 0;
3248free_rmrru:
3249 kfree(rmrru);
3250out:
3251 return -ENOMEM;
3252}
3253
3254static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3255{
3256 struct dmar_atsr_unit *atsru;
3257 struct acpi_dmar_atsr *tmp;
3258
3259 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3260 dmar_rcu_check()) {
3261 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3262 if (atsr->segment != tmp->segment)
3263 continue;
3264 if (atsr->header.length != tmp->header.length)
3265 continue;
3266 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3267 return atsru;
3268 }
3269
3270 return NULL;
3271}
3272
3273int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3274{
3275 struct acpi_dmar_atsr *atsr;
3276 struct dmar_atsr_unit *atsru;
3277
3278 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3279 return 0;
3280
3281 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3282 atsru = dmar_find_atsr(atsr);
3283 if (atsru)
3284 return 0;
3285
3286 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3287 if (!atsru)
3288 return -ENOMEM;
3289
3290 /*
3291 * If memory is allocated from slab by ACPI _DSM method, we need to
3292 * copy the memory content because the memory buffer will be freed
3293 * on return.
3294 */
3295 atsru->hdr = (void *)(atsru + 1);
3296 memcpy(atsru->hdr, hdr, hdr->length);
3297 atsru->include_all = atsr->flags & 0x1;
3298 if (!atsru->include_all) {
3299 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3300 (void *)atsr + atsr->header.length,
3301 &atsru->devices_cnt);
3302 if (atsru->devices_cnt && atsru->devices == NULL) {
3303 kfree(atsru);
3304 return -ENOMEM;
3305 }
3306 }
3307
3308 list_add_rcu(&atsru->list, &dmar_atsr_units);
3309
3310 return 0;
3311}
3312
3313static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3314{
3315 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3316 kfree(atsru);
3317}
3318
3319int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3320{
3321 struct acpi_dmar_atsr *atsr;
3322 struct dmar_atsr_unit *atsru;
3323
3324 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3325 atsru = dmar_find_atsr(atsr);
3326 if (atsru) {
3327 list_del_rcu(&atsru->list);
3328 synchronize_rcu();
3329 intel_iommu_free_atsr(atsru);
3330 }
3331
3332 return 0;
3333}
3334
3335int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3336{
3337 int i;
3338 struct device *dev;
3339 struct acpi_dmar_atsr *atsr;
3340 struct dmar_atsr_unit *atsru;
3341
3342 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3343 atsru = dmar_find_atsr(atsr);
3344 if (!atsru)
3345 return 0;
3346
3347 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3348 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3349 i, dev)
3350 return -EBUSY;
3351 }
3352
3353 return 0;
3354}
3355
3356static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3357{
3358 struct dmar_satc_unit *satcu;
3359 struct acpi_dmar_satc *tmp;
3360
3361 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3362 dmar_rcu_check()) {
3363 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3364 if (satc->segment != tmp->segment)
3365 continue;
3366 if (satc->header.length != tmp->header.length)
3367 continue;
3368 if (memcmp(satc, tmp, satc->header.length) == 0)
3369 return satcu;
3370 }
3371
3372 return NULL;
3373}
3374
3375int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3376{
3377 struct acpi_dmar_satc *satc;
3378 struct dmar_satc_unit *satcu;
3379
3380 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3381 return 0;
3382
3383 satc = container_of(hdr, struct acpi_dmar_satc, header);
3384 satcu = dmar_find_satc(satc);
3385 if (satcu)
3386 return 0;
3387
3388 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3389 if (!satcu)
3390 return -ENOMEM;
3391
3392 satcu->hdr = (void *)(satcu + 1);
3393 memcpy(satcu->hdr, hdr, hdr->length);
3394 satcu->atc_required = satc->flags & 0x1;
3395 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3396 (void *)satc + satc->header.length,
3397 &satcu->devices_cnt);
3398 if (satcu->devices_cnt && !satcu->devices) {
3399 kfree(satcu);
3400 return -ENOMEM;
3401 }
3402 list_add_rcu(&satcu->list, &dmar_satc_units);
3403
3404 return 0;
3405}
3406
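/*
 * Bring up a hot-added DMAR unit: audit its capabilities, allocate
 * domain ids and a root entry and, unless the unit is ignored, enable
 * queued invalidation, the fault interrupt and translation.
 */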
3407static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3408{
3409 int sp, ret;
3410 struct intel_iommu *iommu = dmaru->iommu;
3411
3412 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3413 if (ret)
3414 goto out;
3415
3416 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3417 pr_warn("%s: Doesn't support hardware pass through.\n",
3418 iommu->name);
3419 return -ENXIO;
3420 }
3421
3422 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3423 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3424 pr_warn("%s: Doesn't support large page.\n",
3425 iommu->name);
3426 return -ENXIO;
3427 }
3428
3429 /*
3430 * Disable translation if already enabled prior to OS handover.
3431 */
3432 if (iommu->gcmd & DMA_GCMD_TE)
3433 iommu_disable_translation(iommu);
3434
3435 ret = iommu_init_domains(iommu);
3436 if (ret == 0)
3437 ret = iommu_alloc_root_entry(iommu);
3438 if (ret)
3439 goto out;
3440
3441 intel_svm_check(iommu);
3442
3443 if (dmaru->ignored) {
3444 /*
3445 * we always have to disable PMRs or DMA may fail on this device
3446 */
3447 if (force_on)
3448 iommu_disable_protect_mem_regions(iommu);
3449 return 0;
3450 }
3451
3452 intel_iommu_init_qi(iommu);
3453 iommu_flush_write_buffer(iommu);
3454
3455#ifdef CONFIG_INTEL_IOMMU_SVM
3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457 ret = intel_svm_enable_prq(iommu);
3458 if (ret)
3459 goto disable_iommu;
3460 }
3461#endif
3462 ret = dmar_set_interrupt(iommu);
3463 if (ret)
3464 goto disable_iommu;
3465
3466 iommu_set_root_entry(iommu);
3467 iommu_enable_translation(iommu);
3468
3469 iommu_disable_protect_mem_regions(iommu);
3470 return 0;
3471
3472disable_iommu:
3473 disable_dmar_iommu(iommu);
3474out:
3475 free_dmar_iommu(iommu);
3476 return ret;
3477}
3478
3479int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3480{
3481 int ret = 0;
3482 struct intel_iommu *iommu = dmaru->iommu;
3483
3484 if (!intel_iommu_enabled)
3485 return 0;
3486 if (iommu == NULL)
3487 return -EINVAL;
3488
3489 if (insert) {
3490 ret = intel_iommu_add(dmaru);
3491 } else {
3492 disable_dmar_iommu(iommu);
3493 free_dmar_iommu(iommu);
3494 }
3495
3496 return ret;
3497}
3498
3499static void intel_iommu_free_dmars(void)
3500{
3501 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3502 struct dmar_atsr_unit *atsru, *atsr_n;
3503 struct dmar_satc_unit *satcu, *satc_n;
3504
3505 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3506 list_del(&rmrru->list);
3507 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3508 kfree(rmrru);
3509 }
3510
3511 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3512 list_del(&atsru->list);
3513 intel_iommu_free_atsr(atsru);
3514 }
3515 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3516 list_del(&satcu->list);
3517 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3518 kfree(satcu);
3519 }
3520}
3521
3522static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3523{
3524 struct dmar_satc_unit *satcu;
3525 struct acpi_dmar_satc *satc;
3526 struct device *tmp;
3527 int i;
3528
3529 dev = pci_physfn(dev);
3530 rcu_read_lock();
3531
3532 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3533 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3534 if (satc->segment != pci_domain_nr(dev->bus))
3535 continue;
3536 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3537 if (to_pci_dev(tmp) == dev)
3538 goto out;
3539 }
3540 satcu = NULL;
3541out:
3542 rcu_read_unlock();
3543 return satcu;
3544}
3545
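/*
 * Decide whether ATS may be used for @dev: consult the SATC table first
 * if the device is listed there, otherwise walk up to the root port and
 * match it against the ATSR units of the device's segment.
 */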
3546static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3547{
3548 int i, ret = 1;
3549 struct pci_bus *bus;
3550 struct pci_dev *bridge = NULL;
3551 struct device *tmp;
3552 struct acpi_dmar_atsr *atsr;
3553 struct dmar_atsr_unit *atsru;
3554 struct dmar_satc_unit *satcu;
3555
3556 dev = pci_physfn(dev);
3557 satcu = dmar_find_matched_satc_unit(dev);
3558 if (satcu)
3559 /*
3560		 * This device supports ATS, as it is listed in the SATC table.
3561		 * When the IOMMU is in legacy mode, enabling ATS is done
3562		 * automatically by the hardware for devices that require
3563		 * ATS, so the OS should not enable ATS on this device,
3564		 * to avoid duplicated TLB invalidations.
3565 */
3566 return !(satcu->atc_required && !sm_supported(iommu));
3567
3568 for (bus = dev->bus; bus; bus = bus->parent) {
3569 bridge = bus->self;
3570 /* If it's an integrated device, allow ATS */
3571 if (!bridge)
3572 return 1;
3573 /* Connected via non-PCIe: no ATS */
3574 if (!pci_is_pcie(bridge) ||
3575 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3576 return 0;
3577 /* If we found the root port, look it up in the ATSR */
3578 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3579 break;
3580 }
3581
3582 rcu_read_lock();
3583 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3584 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3585 if (atsr->segment != pci_domain_nr(dev->bus))
3586 continue;
3587
3588 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3589 if (tmp == &bridge->dev)
3590 goto out;
3591
3592 if (atsru->include_all)
3593 goto out;
3594 }
3595 ret = 0;
3596out:
3597 rcu_read_unlock();
3598
3599 return ret;
3600}
3601
3602int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3603{
3604 int ret;
3605 struct dmar_rmrr_unit *rmrru;
3606 struct dmar_atsr_unit *atsru;
3607 struct dmar_satc_unit *satcu;
3608 struct acpi_dmar_atsr *atsr;
3609 struct acpi_dmar_reserved_memory *rmrr;
3610 struct acpi_dmar_satc *satc;
3611
3612 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3613 return 0;
3614
3615 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3616 rmrr = container_of(rmrru->hdr,
3617 struct acpi_dmar_reserved_memory, header);
3618 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3619 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3620 ((void *)rmrr) + rmrr->header.length,
3621 rmrr->segment, rmrru->devices,
3622 rmrru->devices_cnt);
3623 if (ret < 0)
3624 return ret;
3625 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3626 dmar_remove_dev_scope(info, rmrr->segment,
3627 rmrru->devices, rmrru->devices_cnt);
3628 }
3629 }
3630
3631 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3632 if (atsru->include_all)
3633 continue;
3634
3635 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3636 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3637 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3638 (void *)atsr + atsr->header.length,
3639 atsr->segment, atsru->devices,
3640 atsru->devices_cnt);
3641 if (ret > 0)
3642 break;
3643 else if (ret < 0)
3644 return ret;
3645 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3646 if (dmar_remove_dev_scope(info, atsr->segment,
3647 atsru->devices, atsru->devices_cnt))
3648 break;
3649 }
3650 }
3651 list_for_each_entry(satcu, &dmar_satc_units, list) {
3652 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3653 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3654 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3655 (void *)satc + satc->header.length,
3656 satc->segment, satcu->devices,
3657 satcu->devices_cnt);
3658 if (ret > 0)
3659 break;
3660 else if (ret < 0)
3661 return ret;
3662 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3663 if (dmar_remove_dev_scope(info, satc->segment,
3664 satcu->devices, satcu->devices_cnt))
3665 break;
3666 }
3667 }
3668
3669 return 0;
3670}
3671
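/*
 * Keep the si_domain identity map in sync with memory hotplug: map
 * ranges as they come online and unmap (and flush) them when they go
 * offline or fail to come online.
 */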
3672static int intel_iommu_memory_notifier(struct notifier_block *nb,
3673 unsigned long val, void *v)
3674{
3675 struct memory_notify *mhp = v;
3676 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3677 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3678 mhp->nr_pages - 1);
3679
3680 switch (val) {
3681 case MEM_GOING_ONLINE:
3682 if (iommu_domain_identity_map(si_domain,
3683 start_vpfn, last_vpfn)) {
3684 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3685 start_vpfn, last_vpfn);
3686 return NOTIFY_BAD;
3687 }
3688 break;
3689
3690 case MEM_OFFLINE:
3691 case MEM_CANCEL_ONLINE:
3692 {
3693 struct dmar_drhd_unit *drhd;
3694 struct intel_iommu *iommu;
3695 LIST_HEAD(freelist);
3696
3697 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3698
3699 rcu_read_lock();
3700 for_each_active_iommu(iommu, drhd)
3701 iommu_flush_iotlb_psi(iommu, si_domain,
3702 start_vpfn, mhp->nr_pages,
3703 list_empty(&freelist), 0);
3704 rcu_read_unlock();
3705 put_pages_list(&freelist);
3706 }
3707 break;
3708 }
3709
3710 return NOTIFY_OK;
3711}
3712
3713static struct notifier_block intel_iommu_memory_nb = {
3714 .notifier_call = intel_iommu_memory_notifier,
3715 .priority = 0
3716};
3717
3718static void intel_disable_iommus(void)
3719{
3720 struct intel_iommu *iommu = NULL;
3721 struct dmar_drhd_unit *drhd;
3722
3723 for_each_iommu(iommu, drhd)
3724 iommu_disable_translation(iommu);
3725}
3726
3727void intel_iommu_shutdown(void)
3728{
3729 struct dmar_drhd_unit *drhd;
3730 struct intel_iommu *iommu = NULL;
3731
3732 if (no_iommu || dmar_disabled)
3733 return;
3734
3735 down_write(&dmar_global_lock);
3736
3737 /* Disable PMRs explicitly here. */
3738 for_each_iommu(iommu, drhd)
3739 iommu_disable_protect_mem_regions(iommu);
3740
3741 /* Make sure the IOMMUs are switched off */
3742 intel_disable_iommus();
3743
3744 up_write(&dmar_global_lock);
3745}
3746
3747static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3748{
3749 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3750
3751 return container_of(iommu_dev, struct intel_iommu, iommu);
3752}
3753
3754static ssize_t version_show(struct device *dev,
3755 struct device_attribute *attr, char *buf)
3756{
3757 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3758 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3759 return sprintf(buf, "%d:%d\n",
3760 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3761}
3762static DEVICE_ATTR_RO(version);
3763
3764static ssize_t address_show(struct device *dev,
3765 struct device_attribute *attr, char *buf)
3766{
3767 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3768 return sprintf(buf, "%llx\n", iommu->reg_phys);
3769}
3770static DEVICE_ATTR_RO(address);
3771
3772static ssize_t cap_show(struct device *dev,
3773 struct device_attribute *attr, char *buf)
3774{
3775 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3776 return sprintf(buf, "%llx\n", iommu->cap);
3777}
3778static DEVICE_ATTR_RO(cap);
3779
3780static ssize_t ecap_show(struct device *dev,
3781 struct device_attribute *attr, char *buf)
3782{
3783 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3784 return sprintf(buf, "%llx\n", iommu->ecap);
3785}
3786static DEVICE_ATTR_RO(ecap);
3787
3788static ssize_t domains_supported_show(struct device *dev,
3789 struct device_attribute *attr, char *buf)
3790{
3791 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3793}
3794static DEVICE_ATTR_RO(domains_supported);
3795
3796static ssize_t domains_used_show(struct device *dev,
3797 struct device_attribute *attr, char *buf)
3798{
3799 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3801 cap_ndoms(iommu->cap)));
3802}
3803static DEVICE_ATTR_RO(domains_used);
3804
3805static struct attribute *intel_iommu_attrs[] = {
3806 &dev_attr_version.attr,
3807 &dev_attr_address.attr,
3808 &dev_attr_cap.attr,
3809 &dev_attr_ecap.attr,
3810 &dev_attr_domains_supported.attr,
3811 &dev_attr_domains_used.attr,
3812 NULL,
3813};
3814
3815static struct attribute_group intel_iommu_group = {
3816 .name = "intel-iommu",
3817 .attrs = intel_iommu_attrs,
3818};
3819
3820const struct attribute_group *intel_iommu_groups[] = {
3821 &intel_iommu_group,
3822 NULL,
3823};
3824
3825static inline bool has_external_pci(void)
3826{
3827 struct pci_dev *pdev = NULL;
3828
3829 for_each_pci_dev(pdev)
3830 if (pdev->external_facing) {
3831 pci_dev_put(pdev);
3832 return true;
3833 }
3834
3835 return false;
3836}
3837
3838static int __init platform_optin_force_iommu(void)
3839{
3840 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3841 return 0;
3842
3843 if (no_iommu || dmar_disabled)
3844 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3845
3846 /*
3847 * If Intel-IOMMU is disabled by default, we will apply identity
3848 * map for all devices except those marked as being untrusted.
3849 */
3850 if (dmar_disabled)
3851 iommu_set_default_passthrough(false);
3852
3853 dmar_disabled = 0;
3854 no_iommu = 0;
3855
3856 return 1;
3857}
3858
3859static int __init probe_acpi_namespace_devices(void)
3860{
3861 struct dmar_drhd_unit *drhd;
3862 /* To avoid a -Wunused-but-set-variable warning. */
3863 struct intel_iommu *iommu __maybe_unused;
3864 struct device *dev;
3865 int i, ret = 0;
3866
3867 for_each_active_iommu(iommu, drhd) {
3868 for_each_active_dev_scope(drhd->devices,
3869 drhd->devices_cnt, i, dev) {
3870 struct acpi_device_physical_node *pn;
3871 struct iommu_group *group;
3872 struct acpi_device *adev;
3873
3874 if (dev->bus != &acpi_bus_type)
3875 continue;
3876
3877 adev = to_acpi_device(dev);
3878 mutex_lock(&adev->physical_node_lock);
3879 list_for_each_entry(pn,
3880 &adev->physical_node_list, node) {
3881 group = iommu_group_get(pn->dev);
3882 if (group) {
3883 iommu_group_put(group);
3884 continue;
3885 }
3886
3887 ret = iommu_probe_device(pn->dev);
3888 if (ret)
3889 break;
3890 }
3891 mutex_unlock(&adev->physical_node_lock);
3892
3893 if (ret)
3894 return ret;
3895 }
3896 }
3897
3898 return 0;
3899}
3900
3901static __init int tboot_force_iommu(void)
3902{
3903 if (!tboot_enabled())
3904 return 0;
3905
3906 if (no_iommu || dmar_disabled)
3907 pr_warn("Forcing Intel-IOMMU to enabled\n");
3908
3909 dmar_disabled = 0;
3910 no_iommu = 0;
3911
3912 return 1;
3913}
3914
3915int __init intel_iommu_init(void)
3916{
3917 int ret = -ENODEV;
3918 struct dmar_drhd_unit *drhd;
3919 struct intel_iommu *iommu;
3920
3921 /*
3922 * Intel IOMMU is required for a TXT/tboot launch or platform
3923 * opt in, so enforce that.
3924 */
3925 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3926 platform_optin_force_iommu();
3927
3928 down_write(&dmar_global_lock);
3929 if (dmar_table_init()) {
3930 if (force_on)
3931 panic("tboot: Failed to initialize DMAR table\n");
3932 goto out_free_dmar;
3933 }
3934
3935 if (dmar_dev_scope_init() < 0) {
3936 if (force_on)
3937 panic("tboot: Failed to initialize DMAR device scope\n");
3938 goto out_free_dmar;
3939 }
3940
3941 up_write(&dmar_global_lock);
3942
3943 /*
3944 * The bus notifier takes the dmar_global_lock, so lockdep will
3945 * complain later when we register it under the lock.
3946 */
3947 dmar_register_bus_notifier();
3948
3949 down_write(&dmar_global_lock);
3950
3951 if (!no_iommu)
3952 intel_iommu_debugfs_init();
3953
3954 if (no_iommu || dmar_disabled) {
3955 /*
3956 * We exit the function here to ensure IOMMU's remapping and
3957 * mempool aren't setup, which means that the IOMMU's PMRs
3958 * won't be disabled via the call to init_dmars(). So disable
3959 * it explicitly here. The PMRs were setup by tboot prior to
3960 * calling SENTER, but the kernel is expected to reset/tear
3961 * down the PMRs.
3962 */
3963 if (intel_iommu_tboot_noforce) {
3964 for_each_iommu(iommu, drhd)
3965 iommu_disable_protect_mem_regions(iommu);
3966 }
3967
3968 /*
3969 * Make sure the IOMMUs are switched off, even when we
3970 * boot into a kexec kernel and the previous kernel left
3971 * them enabled
3972 */
3973 intel_disable_iommus();
3974 goto out_free_dmar;
3975 }
3976
3977 if (list_empty(&dmar_rmrr_units))
3978 pr_info("No RMRR found\n");
3979
3980 if (list_empty(&dmar_atsr_units))
3981 pr_info("No ATSR found\n");
3982
3983 if (list_empty(&dmar_satc_units))
3984 pr_info("No SATC found\n");
3985
3986 init_no_remapping_devices();
3987
3988 ret = init_dmars();
3989 if (ret) {
3990 if (force_on)
3991 panic("tboot: Failed to initialize DMARs\n");
3992 pr_err("Initialization failed\n");
3993 goto out_free_dmar;
3994 }
3995 up_write(&dmar_global_lock);
3996
3997 init_iommu_pm_ops();
3998
3999 down_read(&dmar_global_lock);
4000 for_each_active_iommu(iommu, drhd) {
4001 /*
4002 * The flush queue implementation does not perform
4003 * page-selective invalidations that are required for efficient
4004 * TLB flushes in virtual environments. The benefit of batching
4005 * is likely to be much lower than the overhead of synchronizing
4006 * the virtual and physical IOMMU page-tables.
4007 */
4008 if (cap_caching_mode(iommu->cap)) {
4009 pr_info_once("IOMMU batching disallowed due to virtualization\n");
4010 iommu_set_dma_strict();
4011 }
4012 iommu_device_sysfs_add(&iommu->iommu, NULL,
4013 intel_iommu_groups,
4014 "%s", iommu->name);
4015 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4016 }
4017 up_read(&dmar_global_lock);
4018
4019 if (si_domain && !hw_pass_through)
4020 register_memory_notifier(&intel_iommu_memory_nb);
4021
4022 down_read(&dmar_global_lock);
4023 if (probe_acpi_namespace_devices())
4024 pr_warn("ACPI name space devices didn't probe correctly\n");
4025
4026 /* Finally, we enable the DMA remapping hardware. */
4027 for_each_iommu(iommu, drhd) {
4028 if (!drhd->ignored && !translation_pre_enabled(iommu))
4029 iommu_enable_translation(iommu);
4030
4031 iommu_disable_protect_mem_regions(iommu);
4032 }
4033 up_read(&dmar_global_lock);
4034
4035 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4036
4037 intel_iommu_enabled = 1;
4038
4039 return 0;
4040
4041out_free_dmar:
4042 intel_iommu_free_dmars();
4043 up_write(&dmar_global_lock);
4044 return ret;
4045}
4046
4047static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4048{
4049 struct device_domain_info *info = opaque;
4050
4051 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4052 return 0;
4053}
4054
4055/*
4056 * NB - intel-iommu lacks any sort of reference counting for the users of
4057 * dependent devices. If multiple endpoints have intersecting dependent
4058 * devices, unbinding the driver from any one of them may leave
4059 * the others unable to operate.
4060 */
4061static void domain_context_clear(struct device_domain_info *info)
4062{
4063 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4064 return;
4065
4066 pci_for_each_dma_alias(to_pci_dev(info->dev),
4067 &domain_context_clear_one_cb, info);
4068}
4069
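/*
 * Detach @dev from its current domain: unless it is a real DMA
 * sub-device, tear down its RID2PASID entry (scalable mode) and context
 * mapping, then unlink it from the domain's device list and drop the
 * domain's reference on the IOMMU.
 */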
4070static void dmar_remove_one_dev_info(struct device *dev)
4071{
4072 struct device_domain_info *info = dev_iommu_priv_get(dev);
4073 struct dmar_domain *domain = info->domain;
4074 struct intel_iommu *iommu = info->iommu;
4075 unsigned long flags;
4076
4077 if (!dev_is_real_dma_subdevice(info->dev)) {
4078 if (dev_is_pci(info->dev) && sm_supported(iommu))
4079 intel_pasid_tear_down_entry(iommu, info->dev,
4080 PASID_RID2PASID, false);
4081
4082 iommu_disable_pci_caps(info);
4083 domain_context_clear(info);
4084 }
4085
4086 spin_lock_irqsave(&domain->lock, flags);
4087 list_del(&info->link);
4088 spin_unlock_irqrestore(&domain->lock, flags);
4089
4090 domain_detach_iommu(domain, iommu);
4091 info->domain = NULL;
4092}
4093
4094/*
4095 * Clear the page table pointer in context or pasid table entries so that
4096 * all DMA requests without PASID from the device are blocked. If the page
4097 * table has been set, clean up the data structures.
4098 */
4099static void device_block_translation(struct device *dev)
4100{
4101 struct device_domain_info *info = dev_iommu_priv_get(dev);
4102 struct intel_iommu *iommu = info->iommu;
4103 unsigned long flags;
4104
4105 iommu_disable_pci_caps(info);
4106 if (!dev_is_real_dma_subdevice(dev)) {
4107 if (sm_supported(iommu))
4108 intel_pasid_tear_down_entry(iommu, dev,
4109 PASID_RID2PASID, false);
4110 else
4111 domain_context_clear(info);
4112 }
4113
4114 if (!info->domain)
4115 return;
4116
4117 spin_lock_irqsave(&info->domain->lock, flags);
4118 list_del(&info->link);
4119 spin_unlock_irqrestore(&info->domain->lock, flags);
4120
4121 domain_detach_iommu(info->domain, iommu);
4122 info->domain = NULL;
4123}
4124
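/*
 * Initialize the address-width related fields of a freshly allocated
 * domain and allocate its top-level page directory.
 */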
4125static int md_domain_init(struct dmar_domain *domain, int guest_width)
4126{
4127 int adjust_width;
4128
4129 /* calculate AGAW */
4130 domain->gaw = guest_width;
4131 adjust_width = guestwidth_to_adjustwidth(guest_width);
4132 domain->agaw = width_to_agaw(adjust_width);
4133
4134 domain->iommu_coherency = false;
4135 domain->iommu_superpage = 0;
4136 domain->max_addr = 0;
4137
4138 /* always allocate the top pgd */
4139 domain->pgd = alloc_pgtable_page(domain->nid);
4140 if (!domain->pgd)
4141 return -ENOMEM;
4142 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4143 return 0;
4144}
4145
4146static int blocking_domain_attach_dev(struct iommu_domain *domain,
4147 struct device *dev)
4148{
4149 device_block_translation(dev);
4150 return 0;
4151}
4152
4153static struct iommu_domain blocking_domain = {
4154 .ops = &(const struct iommu_domain_ops) {
4155 .attach_dev = blocking_domain_attach_dev,
4156 .free = intel_iommu_domain_free
4157 }
4158};
4159
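/*
 * domain_alloc callback: hand out the static blocking and identity
 * domains, allocate and initialize a new DMA/unmanaged domain, or
 * defer to the SVA code for IOMMU_DOMAIN_SVA.
 */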
4160static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4161{
4162 struct dmar_domain *dmar_domain;
4163 struct iommu_domain *domain;
4164
4165 switch (type) {
4166 case IOMMU_DOMAIN_BLOCKED:
4167 return &blocking_domain;
4168 case IOMMU_DOMAIN_DMA:
4169 case IOMMU_DOMAIN_DMA_FQ:
4170 case IOMMU_DOMAIN_UNMANAGED:
4171 dmar_domain = alloc_domain(type);
4172 if (!dmar_domain) {
4173 pr_err("Can't allocate dmar_domain\n");
4174 return NULL;
4175 }
4176 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4177 pr_err("Domain initialization failed\n");
4178 domain_exit(dmar_domain);
4179 return NULL;
4180 }
4181
4182 domain = &dmar_domain->domain;
4183 domain->geometry.aperture_start = 0;
4184 domain->geometry.aperture_end =
4185 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4186 domain->geometry.force_aperture = true;
4187
4188 return domain;
4189 case IOMMU_DOMAIN_IDENTITY:
4190 return &si_domain->domain;
4191 case IOMMU_DOMAIN_SVA:
4192 return intel_svm_domain_alloc();
4193 default:
4194 return NULL;
4195 }
4196
4197 return NULL;
4198}
4199
4200static void intel_iommu_domain_free(struct iommu_domain *domain)
4201{
4202 if (domain != &si_domain->domain && domain != &blocking_domain)
4203 domain_exit(to_dmar_domain(domain));
4204}
4205
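/*
 * Check that @dev's IOMMU is compatible with @domain (snoop control and
 * address width) and trim extra page-table levels if the domain was
 * built wider than this IOMMU supports.
 */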
4206static int prepare_domain_attach_device(struct iommu_domain *domain,
4207 struct device *dev)
4208{
4209 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4210 struct intel_iommu *iommu;
4211 int addr_width;
4212
4213 iommu = device_to_iommu(dev, NULL, NULL);
4214 if (!iommu)
4215 return -ENODEV;
4216
4217 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4218 return -EINVAL;
4219
4220 /* check if this iommu agaw is sufficient for max mapped address */
4221 addr_width = agaw_to_width(iommu->agaw);
4222 if (addr_width > cap_mgaw(iommu->cap))
4223 addr_width = cap_mgaw(iommu->cap);
4224
4225 if (dmar_domain->max_addr > (1LL << addr_width))
4226 return -EINVAL;
4227 dmar_domain->gaw = addr_width;
4228
4229 /*
4230 * Knock out extra levels of page tables if necessary
4231 */
4232 while (iommu->agaw < dmar_domain->agaw) {
4233 struct dma_pte *pte;
4234
4235 pte = dmar_domain->pgd;
4236 if (dma_pte_present(pte)) {
4237 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4238 free_pgtable_page(pte);
4239 }
4240 dmar_domain->agaw--;
4241 }
4242
4243 return 0;
4244}
4245
4246static int intel_iommu_attach_device(struct iommu_domain *domain,
4247 struct device *dev)
4248{
4249 struct device_domain_info *info = dev_iommu_priv_get(dev);
4250 int ret;
4251
4252 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4253 device_is_rmrr_locked(dev)) {
4254 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4255 return -EPERM;
4256 }
4257
4258 if (info->domain)
4259 device_block_translation(dev);
4260
4261 ret = prepare_domain_attach_device(domain, dev);
4262 if (ret)
4263 return ret;
4264
4265 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4266}
4267
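/*
 * Map [iova, iova + size) to @hpa with the requested permissions,
 * rejecting ranges that exceed the domain's address width and tracking
 * the highest mapped address in max_addr.
 */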
4268static int intel_iommu_map(struct iommu_domain *domain,
4269 unsigned long iova, phys_addr_t hpa,
4270 size_t size, int iommu_prot, gfp_t gfp)
4271{
4272 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4273 u64 max_addr;
4274 int prot = 0;
4275
4276 if (iommu_prot & IOMMU_READ)
4277 prot |= DMA_PTE_READ;
4278 if (iommu_prot & IOMMU_WRITE)
4279 prot |= DMA_PTE_WRITE;
4280 if (dmar_domain->set_pte_snp)
4281 prot |= DMA_PTE_SNP;
4282
4283 max_addr = iova + size;
4284 if (dmar_domain->max_addr < max_addr) {
4285 u64 end;
4286
4287 /* check if minimum agaw is sufficient for mapped address */
4288 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4289 if (end < max_addr) {
4290 pr_err("%s: iommu width (%d) is not "
4291 "sufficient for the mapped address (%llx)\n",
4292 __func__, dmar_domain->gaw, max_addr);
4293 return -EFAULT;
4294 }
4295 dmar_domain->max_addr = max_addr;
4296 }
4297 /* Round up size to next multiple of PAGE_SIZE, if it and
4298 the low bits of hpa would take us onto the next page */
4299 size = aligned_nrpages(hpa, size);
4300 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4301 hpa >> VTD_PAGE_SHIFT, size, prot);
4302}
4303
4304static int intel_iommu_map_pages(struct iommu_domain *domain,
4305 unsigned long iova, phys_addr_t paddr,
4306 size_t pgsize, size_t pgcount,
4307 int prot, gfp_t gfp, size_t *mapped)
4308{
4309 unsigned long pgshift = __ffs(pgsize);
4310 size_t size = pgcount << pgshift;
4311 int ret;
4312
4313 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4314 return -EINVAL;
4315
4316 if (!IS_ALIGNED(iova | paddr, pgsize))
4317 return -EINVAL;
4318
4319 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4320 if (!ret && mapped)
4321 *mapped = size;
4322
4323 return ret;
4324}
4325
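/*
 * Unmap [iova, iova + size); a large-page mapping widens the range to
 * cover the whole superpage. Freed page-table pages are queued on the
 * gather freelist and released in intel_iommu_tlb_sync().
 */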
4326static size_t intel_iommu_unmap(struct iommu_domain *domain,
4327 unsigned long iova, size_t size,
4328 struct iommu_iotlb_gather *gather)
4329{
4330 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4331 unsigned long start_pfn, last_pfn;
4332 int level = 0;
4333
4334 /* Cope with horrid API which requires us to unmap more than the
4335 size argument if it happens to be a large-page mapping. */
4336 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4337
4338 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4339 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4340
4341 start_pfn = iova >> VTD_PAGE_SHIFT;
4342 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4343
4344 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4345
4346 if (dmar_domain->max_addr == iova + size)
4347 dmar_domain->max_addr = iova;
4348
4349 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4350
4351 return size;
4352}
4353
4354static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4355 unsigned long iova,
4356 size_t pgsize, size_t pgcount,
4357 struct iommu_iotlb_gather *gather)
4358{
4359 unsigned long pgshift = __ffs(pgsize);
4360 size_t size = pgcount << pgshift;
4361
4362 return intel_iommu_unmap(domain, iova, size, gather);
4363}
4364
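/*
 * Flush the IOTLB for the range collected in @gather on every IOMMU
 * this domain is attached to, then free the queued page-table pages.
 */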
4365static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4366 struct iommu_iotlb_gather *gather)
4367{
4368 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4369 unsigned long iova_pfn = IOVA_PFN(gather->start);
4370 size_t size = gather->end - gather->start;
4371 struct iommu_domain_info *info;
4372 unsigned long start_pfn;
4373 unsigned long nrpages;
4374 unsigned long i;
4375
4376 nrpages = aligned_nrpages(gather->start, size);
4377 start_pfn = mm_to_dma_pfn(iova_pfn);
4378
4379 xa_for_each(&dmar_domain->iommu_array, i, info)
4380 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4381 start_pfn, nrpages,
4382 list_empty(&gather->freelist), 0);
4383
4384 put_pages_list(&gather->freelist);
4385}
4386
4387static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4388 dma_addr_t iova)
4389{
4390 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4391 struct dma_pte *pte;
4392 int level = 0;
4393 u64 phys = 0;
4394
4395 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4396 if (pte && dma_pte_present(pte))
4397 phys = dma_pte_addr(pte) +
4398 (iova & (BIT_MASK(level_to_offset_bits(level) +
4399 VTD_PAGE_SHIFT) - 1));
4400
4401 return phys;
4402}
4403
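/*
 * Return true if every IOMMU with a device in this domain implements
 * snoop control (ECAP.SC). Called with domain->lock held.
 */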
4404static bool domain_support_force_snooping(struct dmar_domain *domain)
4405{
4406 struct device_domain_info *info;
4407 bool support = true;
4408
4409 assert_spin_locked(&domain->lock);
4410 list_for_each_entry(info, &domain->devices, link) {
4411 if (!ecap_sc_support(info->iommu->ecap)) {
4412 support = false;
4413 break;
4414 }
4415 }
4416
4417 return support;
4418}
4419
4420static void domain_set_force_snooping(struct dmar_domain *domain)
4421{
4422 struct device_domain_info *info;
4423
4424 assert_spin_locked(&domain->lock);
4425 /*
4426	 * The second-level page table supports per-PTE snoop control. The
4427	 * iommu_map() interface will handle this by setting the SNP bit.
4428 */
4429 if (!domain->use_first_level) {
4430 domain->set_pte_snp = true;
4431 return;
4432 }
4433
4434 list_for_each_entry(info, &domain->devices, link)
4435 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4436 PASID_RID2PASID);
4437}
4438
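/*
 * enforce_cache_coherency callback: make all current and future
 * mappings of this domain snooped, provided every attached IOMMU
 * supports snoop control.
 */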
4439static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4440{
4441 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4442 unsigned long flags;
4443
4444 if (dmar_domain->force_snooping)
4445 return true;
4446
4447 spin_lock_irqsave(&dmar_domain->lock, flags);
4448 if (!domain_support_force_snooping(dmar_domain)) {
4449 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4450 return false;
4451 }
4452
4453 domain_set_force_snooping(dmar_domain);
4454 dmar_domain->force_snooping = true;
4455 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4456
4457 return true;
4458}
4459
4460static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4461{
4462 struct device_domain_info *info = dev_iommu_priv_get(dev);
4463
4464 switch (cap) {
4465 case IOMMU_CAP_CACHE_COHERENCY:
4466 return true;
4467 case IOMMU_CAP_INTR_REMAP:
4468 return irq_remapping_enabled == 1;
4469 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4470 return dmar_platform_optin();
4471 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4472 return ecap_sc_support(info->iommu->ecap);
4473 default:
4474 return false;
4475 }
4476}
4477
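/*
 * probe_device callback: look up the IOMMU for @dev, allocate and fill
 * in the per-device device_domain_info (ATS/PASID/PRI capabilities)
 * and, in scalable mode, allocate the device's PASID table.
 */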
4478static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4479{
4480 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4481 struct device_domain_info *info;
4482 struct intel_iommu *iommu;
4483 u8 bus, devfn;
4484 int ret;
4485
4486 iommu = device_to_iommu(dev, &bus, &devfn);
4487 if (!iommu || !iommu->iommu.ops)
4488 return ERR_PTR(-ENODEV);
4489
4490 info = kzalloc(sizeof(*info), GFP_KERNEL);
4491 if (!info)
4492 return ERR_PTR(-ENOMEM);
4493
4494 if (dev_is_real_dma_subdevice(dev)) {
4495 info->bus = pdev->bus->number;
4496 info->devfn = pdev->devfn;
4497 info->segment = pci_domain_nr(pdev->bus);
4498 } else {
4499 info->bus = bus;
4500 info->devfn = devfn;
4501 info->segment = iommu->segment;
4502 }
4503
4504 info->dev = dev;
4505 info->iommu = iommu;
4506 if (dev_is_pci(dev)) {
4507 if (ecap_dev_iotlb_support(iommu->ecap) &&
4508 pci_ats_supported(pdev) &&
4509 dmar_ats_supported(pdev, iommu)) {
4510 info->ats_supported = 1;
4511 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4512 }
4513 if (sm_supported(iommu)) {
4514 if (pasid_supported(iommu)) {
4515 int features = pci_pasid_features(pdev);
4516
4517 if (features >= 0)
4518 info->pasid_supported = features | 1;
4519 }
4520
4521 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4522 pci_pri_supported(pdev))
4523 info->pri_supported = 1;
4524 }
4525 }
4526
4527 dev_iommu_priv_set(dev, info);
4528
4529 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4530 ret = intel_pasid_alloc_table(dev);
4531 if (ret) {
4532 dev_err(dev, "PASID table allocation failed\n");
4533 dev_iommu_priv_set(dev, NULL);
4534 kfree(info);
4535 return ERR_PTR(ret);
4536 }
4537 }
4538
4539 return &iommu->iommu;
4540}
4541
4542static void intel_iommu_release_device(struct device *dev)
4543{
4544 struct device_domain_info *info = dev_iommu_priv_get(dev);
4545
4546 dmar_remove_one_dev_info(dev);
4547 intel_pasid_free_table(dev);
4548 dev_iommu_priv_set(dev, NULL);
4549 kfree(info);
4550 set_dma_ops(dev, NULL);
4551}
4552
4553static void intel_iommu_probe_finalize(struct device *dev)
4554{
4555 set_dma_ops(dev, NULL);
4556 iommu_setup_dma_ops(dev, 0, U64_MAX);
4557}
4558
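/*
 * Report reserved regions for @device: RMRRs that target it, a 16MB
 * direct-mapped window for ISA bridges (the floppy workaround, if
 * configured), and the IOAPIC/MSI window.
 */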
4559static void intel_iommu_get_resv_regions(struct device *device,
4560 struct list_head *head)
4561{
4562 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4563 struct iommu_resv_region *reg;
4564 struct dmar_rmrr_unit *rmrr;
4565 struct device *i_dev;
4566 int i;
4567
4568 rcu_read_lock();
4569 for_each_rmrr_units(rmrr) {
4570 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4571 i, i_dev) {
4572 struct iommu_resv_region *resv;
4573 enum iommu_resv_type type;
4574 size_t length;
4575
4576 if (i_dev != device &&
4577 !is_downstream_to_pci_bridge(device, i_dev))
4578 continue;
4579
4580 length = rmrr->end_address - rmrr->base_address + 1;
4581
4582 type = device_rmrr_is_relaxable(device) ?
4583 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4584
4585 resv = iommu_alloc_resv_region(rmrr->base_address,
4586 length, prot, type,
4587 GFP_ATOMIC);
4588 if (!resv)
4589 break;
4590
4591 list_add_tail(&resv->list, head);
4592 }
4593 }
4594 rcu_read_unlock();
4595
4596#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4597 if (dev_is_pci(device)) {
4598 struct pci_dev *pdev = to_pci_dev(device);
4599
4600 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4601 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4602 IOMMU_RESV_DIRECT_RELAXABLE,
4603 GFP_KERNEL);
4604 if (reg)
4605				list_add_tail(&reg->list, head);
4606 }
4607 }
4608#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4609
4610 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4611 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4612 0, IOMMU_RESV_MSI, GFP_KERNEL);
4613 if (!reg)
4614 return;
4615	list_add_tail(&reg->list, head);
4616}
4617
4618static struct iommu_group *intel_iommu_device_group(struct device *dev)
4619{
4620 if (dev_is_pci(dev))
4621 return pci_device_group(dev);
4622 return generic_device_group(dev);
4623}
4624
4625static int intel_iommu_enable_sva(struct device *dev)
4626{
4627 struct device_domain_info *info = dev_iommu_priv_get(dev);
4628 struct intel_iommu *iommu;
4629 int ret;
4630
4631 if (!info || dmar_disabled)
4632 return -EINVAL;
4633
4634 iommu = info->iommu;
4635 if (!iommu)
4636 return -EINVAL;
4637
4638 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4639 return -ENODEV;
4640
4641 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4642 return -EINVAL;
4643
4644 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4645 if (!ret)
4646 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4647
4648 return ret;
4649}
4650
4651static int intel_iommu_disable_sva(struct device *dev)
4652{
4653 struct device_domain_info *info = dev_iommu_priv_get(dev);
4654 struct intel_iommu *iommu = info->iommu;
4655 int ret;
4656
4657 ret = iommu_unregister_device_fault_handler(dev);
4658 if (!ret)
4659 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4660
4661 return ret;
4662}
4663
4664static int intel_iommu_enable_iopf(struct device *dev)
4665{
4666 struct device_domain_info *info = dev_iommu_priv_get(dev);
4667
4668 if (info && info->pri_supported)
4669 return 0;
4670
4671 return -ENODEV;
4672}
4673
4674static int
4675intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4676{
4677 switch (feat) {
4678 case IOMMU_DEV_FEAT_IOPF:
4679 return intel_iommu_enable_iopf(dev);
4680
4681 case IOMMU_DEV_FEAT_SVA:
4682 return intel_iommu_enable_sva(dev);
4683
4684 default:
4685 return -ENODEV;
4686 }
4687}
4688
4689static int
4690intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4691{
4692 switch (feat) {
4693 case IOMMU_DEV_FEAT_IOPF:
4694 return 0;
4695
4696 case IOMMU_DEV_FEAT_SVA:
4697 return intel_iommu_disable_sva(dev);
4698
4699 default:
4700 return -ENODEV;
4701 }
4702}
4703
4704static bool intel_iommu_is_attach_deferred(struct device *dev)
4705{
4706 struct device_domain_info *info = dev_iommu_priv_get(dev);
4707
4708 return translation_pre_enabled(info->iommu) && !info->domain;
4709}
4710
4711/*
4712 * Check that the device does not live on an external facing PCI port that is
4713 * marked as untrusted. Such devices should not be able to apply quirks and
4714 * thus not be able to bypass the IOMMU restrictions.
4715 */
4716static bool risky_device(struct pci_dev *pdev)
4717{
4718 if (pdev->untrusted) {
4719 pci_info(pdev,
4720 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4721 pdev->vendor, pdev->device);
4722 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4723 return true;
4724 }
4725 return false;
4726}
4727
4728static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4729 unsigned long iova, size_t size)
4730{
4731 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4732 unsigned long pages = aligned_nrpages(iova, size);
4733 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4734 struct iommu_domain_info *info;
4735 unsigned long i;
4736
4737 xa_for_each(&dmar_domain->iommu_array, i, info)
4738 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4739}
4740
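/*
 * Detach whatever domain is attached to @pasid of @dev, performing any
 * domain-type specific cleanup, and tear down the PASID table entry.
 */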
4741static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4742{
4743 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4744 struct iommu_domain *domain;
4745
4746 /* Domain type specific cleanup: */
4747 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4748 if (domain) {
4749 switch (domain->type) {
4750 case IOMMU_DOMAIN_SVA:
4751 intel_svm_remove_dev_pasid(dev, pasid);
4752 break;
4753 default:
4754 /* should never reach here */
4755 WARN_ON(1);
4756 break;
4757 }
4758 }
4759
4760 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4761}
4762
4763const struct iommu_ops intel_iommu_ops = {
4764 .capable = intel_iommu_capable,
4765 .domain_alloc = intel_iommu_domain_alloc,
4766 .probe_device = intel_iommu_probe_device,
4767 .probe_finalize = intel_iommu_probe_finalize,
4768 .release_device = intel_iommu_release_device,
4769 .get_resv_regions = intel_iommu_get_resv_regions,
4770 .device_group = intel_iommu_device_group,
4771 .dev_enable_feat = intel_iommu_dev_enable_feat,
4772 .dev_disable_feat = intel_iommu_dev_disable_feat,
4773 .is_attach_deferred = intel_iommu_is_attach_deferred,
4774 .def_domain_type = device_def_domain_type,
4775 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4776 .pgsize_bitmap = SZ_4K,
4777#ifdef CONFIG_INTEL_IOMMU_SVM
4778 .page_response = intel_svm_page_response,
4779#endif
4780 .default_domain_ops = &(const struct iommu_domain_ops) {
4781 .attach_dev = intel_iommu_attach_device,
4782 .map_pages = intel_iommu_map_pages,
4783 .unmap_pages = intel_iommu_unmap_pages,
4784 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4785 .flush_iotlb_all = intel_flush_iotlb_all,
4786 .iotlb_sync = intel_iommu_tlb_sync,
4787 .iova_to_phys = intel_iommu_iova_to_phys,
4788 .free = intel_iommu_domain_free,
4789 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4790 }
4791};
4792
4793static void quirk_iommu_igfx(struct pci_dev *dev)
4794{
4795 if (risky_device(dev))
4796 return;
4797
4798 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4799 dmar_map_gfx = 0;
4800}
4801
4802/* G4x/GM45 integrated gfx dmar support is totally busted. */
4803DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4804DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4805DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4806DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4807DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4808DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4809DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4810
4811/* Broadwell igfx malfunctions with dmar */
4812DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4813DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4814DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4815DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4816DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4817DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4818DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4819DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4820DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4821DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4822DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4823DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4824DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4825DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4826DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4827DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4828DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4829DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4830DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4831DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4832DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4833DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4834DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4835DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4836
4837static void quirk_iommu_rwbf(struct pci_dev *dev)
4838{
4839 if (risky_device(dev))
4840 return;
4841
4842 /*
4843 * Mobile 4 Series Chipset neglects to set RWBF capability,
4844 * but needs it. Same seems to hold for the desktop versions.
4845 */
4846 pci_info(dev, "Forcing write-buffer flush capability\n");
4847 rwbf_quirk = 1;
4848}
4849
4850DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4851DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4852DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4853DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4854DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4855DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4856DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4857
4858#define GGC 0x52
4859#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4860#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4861#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4862#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4863#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4864#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4865#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4866#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4867
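/*
 * Calpella/Ironlake: if the BIOS left GGC_MEMORY_VT_ENABLED clear, no
 * shadow GTT was allocated and the graphics device cannot be
 * translated, so disable the graphics IOMMU; otherwise fall back to
 * strict IOTLB flushing so the GPU is idle before a flush.
 */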
4868static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4869{
4870 unsigned short ggc;
4871
4872 if (risky_device(dev))
4873 return;
4874
4875 if (pci_read_config_word(dev, GGC, &ggc))
4876 return;
4877
4878 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4879 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4880 dmar_map_gfx = 0;
4881 } else if (dmar_map_gfx) {
4882 /* we have to ensure the gfx device is idle before we flush */
4883 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4884 iommu_set_dma_strict();
4885 }
4886}
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4889DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4890DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4891
4892static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4893{
4894 unsigned short ver;
4895
4896 if (!IS_GFX_DEVICE(dev))
4897 return;
4898
4899 ver = (dev->device >> 8) & 0xff;
4900 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4901 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4902 ver != 0x9a && ver != 0xa7)
4903 return;
4904
4905 if (risky_device(dev))
4906 return;
4907
4908 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4909 iommu_skip_te_disable = 1;
4910}
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4912
4913/* On Tylersburg chipsets, some BIOSes have been known to enable the
4914 ISOCH DMAR unit for the Azalia sound device, but not give it any
4915 TLB entries, which causes it to deadlock. Check for that. We do
4916 this in a function called from init_dmars(), instead of in a PCI
4917 quirk, because we don't want to print the obnoxious "BIOS broken"
4918 message if VT-d is actually disabled.
4919*/
4920static void __init check_tylersburg_isoch(void)
4921{
4922 struct pci_dev *pdev;
4923 uint32_t vtisochctrl;
4924
4925 /* If there's no Azalia in the system anyway, forget it. */
4926 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4927 if (!pdev)
4928 return;
4929
4930 if (risky_device(pdev)) {
4931 pci_dev_put(pdev);
4932 return;
4933 }
4934
4935 pci_dev_put(pdev);
4936
4937 /* System Management Registers. Might be hidden, in which case
4938 we can't do the sanity check. But that's OK, because the
4939 known-broken BIOSes _don't_ actually hide it, so far. */
4940 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4941 if (!pdev)
4942 return;
4943
4944 if (risky_device(pdev)) {
4945 pci_dev_put(pdev);
4946 return;
4947 }
4948
4949 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4950 pci_dev_put(pdev);
4951 return;
4952 }
4953
4954 pci_dev_put(pdev);
4955
4956 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4957 if (vtisochctrl & 1)
4958 return;
4959
4960 /* Drop all bits other than the number of TLB entries */
4961 vtisochctrl &= 0x1c;
4962
4963 /* If we have the recommended number of TLB entries (16), fine. */
4964 if (vtisochctrl == 0x10)
4965 return;
4966
4967 /* Zero TLB entries? You get to ride the short bus to school. */
4968 if (!vtisochctrl) {
4969 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4970 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4971 dmi_get_system_info(DMI_BIOS_VENDOR),
4972 dmi_get_system_info(DMI_BIOS_VERSION),
4973 dmi_get_system_info(DMI_PRODUCT_VERSION));
4974 iommu_identity_mapping |= IDENTMAP_AZALIA;
4975 return;
4976 }
4977
4978 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4979 vtisochctrl);
4980}
4981
4982/*
4983 * Here we deal with a device TLB defect where a device may inadvertently issue ATS
4984 * invalidation completion before posted writes initiated with translated address
4985 * that utilized translations matching the invalidation address range, violating
4986 * the invalidation completion ordering.
4987 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4988 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4989 * under the control of the trusted/privileged host device driver must use this
4990 * quirk.
4991 * Device TLBs are invalidated under the following six conditions:
4992 * 1. Device driver does DMA API unmap IOVA
4993 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4994 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4995 * exit_mmap() due to crash
4996 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4997 * VM has to free pages that were unmapped
4998 * 5. Userspace driver unmaps a DMA buffer
4999 * 6. Cache invalidation in vSVA usage (upcoming)
5000 *
5001 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5002 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier callback to
5003 * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
5004 * The dTLB invalidation after PASID cache flush does not need this quirk.
5005 *
5006 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5007 */
5008void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5009 unsigned long address, unsigned long mask,
5010 u32 pasid, u16 qdep)
5011{
5012 u16 sid;
5013
5014 if (likely(!info->dtlb_extra_inval))
5015 return;
5016
5017 sid = PCI_DEVID(info->bus, info->devfn);
5018 if (pasid == PASID_RID2PASID) {
5019 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5020 qdep, address, mask);
5021 } else {
5022 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5023 pasid, qdep, address, mask);
5024 }
5025}
162static inline unsigned long page_to_dma_pfn(struct page *pg)
163{
164 return mm_to_dma_pfn(page_to_pfn(pg));
165}
166static inline unsigned long virt_to_dma_pfn(void *p)
167{
168 return page_to_dma_pfn(virt_to_page(p));
169}
170
171/* global iommu list, set NULL for ignored DMAR units */
172static struct intel_iommu **g_iommus;
173
174static void __init check_tylersburg_isoch(void);
175static int rwbf_quirk;
176
177/*
178 * set to 1 to panic kernel if can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
180 */
181static int force_on = 0;
182static int intel_iommu_tboot_noforce;
183static int no_platform_optin;
184
185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187/*
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
190 */
191static phys_addr_t root_entry_lctp(struct root_entry *re)
192{
193 if (!(re->lo & 1))
194 return 0;
195
196 return re->lo & VTD_PAGE_MASK;
197}
198
199/*
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
202 */
203static phys_addr_t root_entry_uctp(struct root_entry *re)
204{
205 if (!(re->hi & 1))
206 return 0;
207
208 return re->hi & VTD_PAGE_MASK;
209}
210
211static inline void context_clear_pasid_enable(struct context_entry *context)
212{
213 context->lo &= ~(1ULL << 11);
214}
215
216static inline bool context_pasid_enabled(struct context_entry *context)
217{
218 return !!(context->lo & (1ULL << 11));
219}
220
221static inline void context_set_copied(struct context_entry *context)
222{
223 context->hi |= (1ull << 3);
224}
225
226static inline bool context_copied(struct context_entry *context)
227{
228 return !!(context->hi & (1ULL << 3));
229}
230
231static inline bool __context_present(struct context_entry *context)
232{
233 return (context->lo & 1);
234}
235
236bool context_present(struct context_entry *context)
237{
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
241}
242
243static inline void context_set_present(struct context_entry *context)
244{
245 context->lo |= 1;
246}
247
248static inline void context_set_fault_enable(struct context_entry *context)
249{
250 context->lo &= (((u64)-1) << 2) | 1;
251}
252
253static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
255{
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
258}
259
260static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
262{
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
265}
266
267static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
269{
270 context->hi |= value & 7;
271}
272
273static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
275{
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
277}
278
279static inline int context_domain_id(struct context_entry *c)
280{
281 return((c->hi >> 8) & 0xffff);
282}
283
284static inline void context_clear_entry(struct context_entry *context)
285{
286 context->lo = 0;
287 context->hi = 0;
288}
289
290/*
291 * This domain is a statically identity mapping domain.
292 * 1. This domain creats a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu mapps to this domain if successful.
295 */
296static struct dmar_domain *si_domain;
297static int hw_pass_through = 1;
298
299#define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
302
303struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
310};
311
312struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
318};
319
320struct dmar_satc_unit {
321 struct list_head list; /* list of SATC units */
322 struct acpi_dmar_header *hdr; /* ACPI header */
323 struct dmar_dev_scope *devices; /* target devices */
324 struct intel_iommu *iommu; /* the corresponding iommu */
325 int devices_cnt; /* target device count */
326 u8 atc_required:1; /* ATS is required */
327};
328
329static LIST_HEAD(dmar_atsr_units);
330static LIST_HEAD(dmar_rmrr_units);
331static LIST_HEAD(dmar_satc_units);
332
333#define for_each_rmrr_units(rmrr) \
334 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335
336/* bitmap for indexing intel_iommus */
337static int g_num_of_iommus;
338
339static void domain_exit(struct dmar_domain *domain);
340static void domain_remove_dev_info(struct dmar_domain *domain);
341static void dmar_remove_one_dev_info(struct device *dev);
342static void __dmar_remove_one_dev_info(struct device_domain_info *info);
343static int intel_iommu_attach_device(struct iommu_domain *domain,
344 struct device *dev);
345static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
346 dma_addr_t iova);
347
348#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349int dmar_disabled = 0;
350#else
351int dmar_disabled = 1;
352#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
353
354#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
355int intel_iommu_sm = 1;
356#else
357int intel_iommu_sm;
358#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
359
360int intel_iommu_enabled = 0;
361EXPORT_SYMBOL_GPL(intel_iommu_enabled);
362
363static int dmar_map_gfx = 1;
364static int intel_iommu_strict;
365static int intel_iommu_superpage = 1;
366static int iommu_identity_mapping;
367static int iommu_skip_te_disable;
368
369#define IDENTMAP_GFX 2
370#define IDENTMAP_AZALIA 4
371
372int intel_iommu_gfx_mapped;
373EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
374
375#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
376struct device_domain_info *get_domain_info(struct device *dev)
377{
378 struct device_domain_info *info;
379
380 if (!dev)
381 return NULL;
382
383 info = dev_iommu_priv_get(dev);
384 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
385 return NULL;
386
387 return info;
388}
389
390DEFINE_SPINLOCK(device_domain_lock);
391static LIST_HEAD(device_domain_list);
392
393/*
394 * Iterate over elements in device_domain_list and call the specified
395 * callback @fn against each element.
396 */
397int for_each_device_domain(int (*fn)(struct device_domain_info *info,
398 void *data), void *data)
399{
400 int ret = 0;
401 unsigned long flags;
402 struct device_domain_info *info;
403
404 spin_lock_irqsave(&device_domain_lock, flags);
405 list_for_each_entry(info, &device_domain_list, global) {
406 ret = fn(info, data);
407 if (ret) {
408 spin_unlock_irqrestore(&device_domain_lock, flags);
409 return ret;
410 }
411 }
412 spin_unlock_irqrestore(&device_domain_lock, flags);
413
414 return 0;
415}
416
417const struct iommu_ops intel_iommu_ops;
418
419static bool translation_pre_enabled(struct intel_iommu *iommu)
420{
421 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
422}
423
424static void clear_translation_pre_enabled(struct intel_iommu *iommu)
425{
426 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
427}
428
429static void init_translation_status(struct intel_iommu *iommu)
430{
431 u32 gsts;
432
433 gsts = readl(iommu->reg + DMAR_GSTS_REG);
434 if (gsts & DMA_GSTS_TES)
435 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
436}
437
438static int __init intel_iommu_setup(char *str)
439{
440 if (!str)
441 return -EINVAL;
442 while (*str) {
443 if (!strncmp(str, "on", 2)) {
444 dmar_disabled = 0;
445 pr_info("IOMMU enabled\n");
446 } else if (!strncmp(str, "off", 3)) {
447 dmar_disabled = 1;
448 no_platform_optin = 1;
449 pr_info("IOMMU disabled\n");
450 } else if (!strncmp(str, "igfx_off", 8)) {
451 dmar_map_gfx = 0;
452 pr_info("Disable GFX device mapping\n");
453 } else if (!strncmp(str, "forcedac", 8)) {
454 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
455 iommu_dma_forcedac = true;
456 } else if (!strncmp(str, "strict", 6)) {
457 pr_info("Disable batched IOTLB flush\n");
458 intel_iommu_strict = 1;
459 } else if (!strncmp(str, "sp_off", 6)) {
460 pr_info("Disable supported super page\n");
461 intel_iommu_superpage = 0;
462 } else if (!strncmp(str, "sm_on", 5)) {
463 pr_info("Intel-IOMMU: scalable mode supported\n");
464 intel_iommu_sm = 1;
465 } else if (!strncmp(str, "tboot_noforce", 13)) {
466 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 intel_iommu_tboot_noforce = 1;
468 }
469
470 str += strcspn(str, ",");
471 while (*str == ',')
472 str++;
473 }
474 return 0;
475}
476__setup("intel_iommu=", intel_iommu_setup);
477
478static struct kmem_cache *iommu_domain_cache;
479static struct kmem_cache *iommu_devinfo_cache;
480
481static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482{
483 struct dmar_domain **domains;
484 int idx = did >> 8;
485
486 domains = iommu->domains[idx];
487 if (!domains)
488 return NULL;
489
490 return domains[did & 0xff];
491}
492
493static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
494 struct dmar_domain *domain)
495{
496 struct dmar_domain **domains;
497 int idx = did >> 8;
498
499 if (!iommu->domains[idx]) {
500 size_t size = 256 * sizeof(struct dmar_domain *);
501 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502 }
503
504 domains = iommu->domains[idx];
505 if (WARN_ON(!domains))
506 return;
507 else
508 domains[did & 0xff] = domain;
509}
510
511void *alloc_pgtable_page(int node)
512{
513 struct page *page;
514 void *vaddr = NULL;
515
516 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 if (page)
518 vaddr = page_address(page);
519 return vaddr;
520}
521
522void free_pgtable_page(void *vaddr)
523{
524 free_page((unsigned long)vaddr);
525}
526
527static inline void *alloc_domain_mem(void)
528{
529 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530}
531
532static void free_domain_mem(void *vaddr)
533{
534 kmem_cache_free(iommu_domain_cache, vaddr);
535}
536
537static inline void * alloc_devinfo_mem(void)
538{
539 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540}
541
542static inline void free_devinfo_mem(void *vaddr)
543{
544 kmem_cache_free(iommu_devinfo_cache, vaddr);
545}
546
547static inline int domain_type_is_si(struct dmar_domain *domain)
548{
549 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550}
551
552static inline bool domain_use_first_level(struct dmar_domain *domain)
553{
554 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555}
556
557static inline int domain_pfn_supported(struct dmar_domain *domain,
558 unsigned long pfn)
559{
560 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
561
562 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563}
564
565static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
566{
567 unsigned long sagaw;
568 int agaw;
569
570 sagaw = cap_sagaw(iommu->cap);
571 for (agaw = width_to_agaw(max_gaw);
572 agaw >= 0; agaw--) {
573 if (test_bit(agaw, &sagaw))
574 break;
575 }
576
577 return agaw;
578}
579
580/*
581 * Calculate max SAGAW for each iommu.
582 */
583int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
584{
585 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
586}
587
588/*
589 * calculate agaw for each iommu.
590 * "SAGAW" may be different across iommus, use a default agaw, and
591 * get a supported less agaw for iommus that don't support the default agaw.
592 */
593int iommu_calculate_agaw(struct intel_iommu *iommu)
594{
595 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596}
597
598/* This functionin only returns single iommu in a domain */
599struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
600{
601 int iommu_id;
602
603 /* si_domain and vm domain should not get here. */
604 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 return NULL;
606
607 for_each_domain_iommu(iommu_id, domain)
608 break;
609
610 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 return NULL;
612
613 return g_iommus[iommu_id];
614}
615
616static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
617{
618 return sm_supported(iommu) ?
619 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620}
621
622static void domain_update_iommu_coherency(struct dmar_domain *domain)
623{
624 struct dmar_drhd_unit *drhd;
625 struct intel_iommu *iommu;
626 bool found = false;
627 int i;
628
629 domain->iommu_coherency = true;
630
631 for_each_domain_iommu(i, domain) {
632 found = true;
633 if (!iommu_paging_structure_coherency(g_iommus[i])) {
634 domain->iommu_coherency = false;
635 break;
636 }
637 }
638 if (found)
639 return;
640
641 /* No hardware attached; use lowest common denominator */
642 rcu_read_lock();
643 for_each_active_iommu(iommu, drhd) {
644 if (!iommu_paging_structure_coherency(iommu)) {
645 domain->iommu_coherency = false;
646 break;
647 }
648 }
649 rcu_read_unlock();
650}
651
652static bool domain_update_iommu_snooping(struct intel_iommu *skip)
653{
654 struct dmar_drhd_unit *drhd;
655 struct intel_iommu *iommu;
656 bool ret = true;
657
658 rcu_read_lock();
659 for_each_active_iommu(iommu, drhd) {
660 if (iommu != skip) {
661 /*
662 * If the hardware is operating in the scalable mode,
663 * the snooping control is always supported since we
664 * always set PASID-table-entry.PGSNP bit if the domain
665 * is managed outside (UNMANAGED).
666 */
667 if (!sm_supported(iommu) &&
668 !ecap_sc_support(iommu->ecap)) {
669 ret = false;
670 break;
671 }
672 }
673 }
674 rcu_read_unlock();
675
676 return ret;
677}
678
679static int domain_update_iommu_superpage(struct dmar_domain *domain,
680 struct intel_iommu *skip)
681{
682 struct dmar_drhd_unit *drhd;
683 struct intel_iommu *iommu;
684 int mask = 0x3;
685
686 if (!intel_iommu_superpage)
687 return 0;
688
689 /* set iommu_superpage to the smallest common denominator */
690 rcu_read_lock();
691 for_each_active_iommu(iommu, drhd) {
692 if (iommu != skip) {
693 if (domain && domain_use_first_level(domain)) {
694 if (!cap_fl1gp_support(iommu->cap))
695 mask = 0x1;
696 } else {
697 mask &= cap_super_page_val(iommu->cap);
698 }
699
700 if (!mask)
701 break;
702 }
703 }
704 rcu_read_unlock();
705
706 return fls(mask);
707}
708
709static int domain_update_device_node(struct dmar_domain *domain)
710{
711 struct device_domain_info *info;
712 int nid = NUMA_NO_NODE;
713
714 assert_spin_locked(&device_domain_lock);
715
716 if (list_empty(&domain->devices))
717 return NUMA_NO_NODE;
718
719 list_for_each_entry(info, &domain->devices, link) {
720 if (!info->dev)
721 continue;
722
723 /*
724 * There could possibly be multiple device numa nodes as devices
725 * within the same domain may sit behind different IOMMUs. There
726 * isn't perfect answer in such situation, so we select first
727 * come first served policy.
728 */
729 nid = dev_to_node(info->dev);
730 if (nid != NUMA_NO_NODE)
731 break;
732 }
733
734 return nid;
735}
736
737static void domain_update_iotlb(struct dmar_domain *domain);
738
739/* Some capabilities may be different across iommus */
740static void domain_update_iommu_cap(struct dmar_domain *domain)
741{
742 domain_update_iommu_coherency(domain);
743 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
744 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
745
746 /*
747 * If RHSA is missing, we should default to the device numa domain
748 * as fall back.
749 */
750 if (domain->nid == NUMA_NO_NODE)
751 domain->nid = domain_update_device_node(domain);
752
753 /*
754 * First-level translation restricts the input-address to a
755 * canonical address (i.e., address bits 63:N have the same
756 * value as address bit [N-1], where N is 48-bits with 4-level
757 * paging and 57-bits with 5-level paging). Hence, skip bit
758 * [N-1].
759 */
760 if (domain_use_first_level(domain))
761 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
762 else
763 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
764
765 domain_update_iotlb(domain);
766}
767
768struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
769 u8 devfn, int alloc)
770{
771 struct root_entry *root = &iommu->root_entry[bus];
772 struct context_entry *context;
773 u64 *entry;
774
775 entry = &root->lo;
776 if (sm_supported(iommu)) {
777 if (devfn >= 0x80) {
778 devfn -= 0x80;
779 entry = &root->hi;
780 }
781 devfn *= 2;
782 }
783 if (*entry & 1)
784 context = phys_to_virt(*entry & VTD_PAGE_MASK);
785 else {
786 unsigned long phy_addr;
787 if (!alloc)
788 return NULL;
789
790 context = alloc_pgtable_page(iommu->node);
791 if (!context)
792 return NULL;
793
794 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795 phy_addr = virt_to_phys((void *)context);
796 *entry = phy_addr | 1;
797 __iommu_flush_cache(iommu, entry, sizeof(*entry));
798 }
799 return &context[devfn];
800}
801
802static bool attach_deferred(struct device *dev)
803{
804 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
805}
806
807/**
808 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
809 * sub-hierarchy of a candidate PCI-PCI bridge
810 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
811 * @bridge: the candidate PCI-PCI bridge
812 *
813 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
814 */
815static bool
816is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
817{
818 struct pci_dev *pdev, *pbridge;
819
820 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
821 return false;
822
823 pdev = to_pci_dev(dev);
824 pbridge = to_pci_dev(bridge);
825
826 if (pbridge->subordinate &&
827 pbridge->subordinate->number <= pdev->bus->number &&
828 pbridge->subordinate->busn_res.end >= pdev->bus->number)
829 return true;
830
831 return false;
832}
833
834static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
835{
836 struct dmar_drhd_unit *drhd;
837 u32 vtbar;
838 int rc;
839
840 /* We know that this device on this chipset has its own IOMMU.
841 * If we find it under a different IOMMU, then the BIOS is lying
842 * to us. Hope that the IOMMU for this device is actually
843 * disabled, and it needs no translation...
844 */
845 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
846 if (rc) {
847 /* "can't" happen */
848 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
849 return false;
850 }
851 vtbar &= 0xffff0000;
852
853 /* we know that the this iommu should be at offset 0xa000 from vtbar */
854 drhd = dmar_find_matched_drhd_unit(pdev);
855 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
856 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
857 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
858 return true;
859 }
860
861 return false;
862}
863
864static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
865{
866 if (!iommu || iommu->drhd->ignored)
867 return true;
868
869 if (dev_is_pci(dev)) {
870 struct pci_dev *pdev = to_pci_dev(dev);
871
872 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
873 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
874 quirk_ioat_snb_local_iommu(pdev))
875 return true;
876 }
877
878 return false;
879}
880
881struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
882{
883 struct dmar_drhd_unit *drhd = NULL;
884 struct pci_dev *pdev = NULL;
885 struct intel_iommu *iommu;
886 struct device *tmp;
887 u16 segment = 0;
888 int i;
889
890 if (!dev)
891 return NULL;
892
893 if (dev_is_pci(dev)) {
894 struct pci_dev *pf_pdev;
895
896 pdev = pci_real_dma_dev(to_pci_dev(dev));
897
898 /* VFs aren't listed in scope tables; we need to look up
899 * the PF instead to find the IOMMU. */
900 pf_pdev = pci_physfn(pdev);
901 dev = &pf_pdev->dev;
902 segment = pci_domain_nr(pdev->bus);
903 } else if (has_acpi_companion(dev))
904 dev = &ACPI_COMPANION(dev)->dev;
905
906 rcu_read_lock();
907 for_each_iommu(iommu, drhd) {
908 if (pdev && segment != drhd->segment)
909 continue;
910
911 for_each_active_dev_scope(drhd->devices,
912 drhd->devices_cnt, i, tmp) {
913 if (tmp == dev) {
914 /* For a VF use its original BDF# not that of the PF
915 * which we used for the IOMMU lookup. Strictly speaking
916 * we could do this for all PCI devices; we only need to
917 * get the BDF# from the scope table for ACPI matches. */
918 if (pdev && pdev->is_virtfn)
919 goto got_pdev;
920
921 if (bus && devfn) {
922 *bus = drhd->devices[i].bus;
923 *devfn = drhd->devices[i].devfn;
924 }
925 goto out;
926 }
927
928 if (is_downstream_to_pci_bridge(dev, tmp))
929 goto got_pdev;
930 }
931
932 if (pdev && drhd->include_all) {
933 got_pdev:
934 if (bus && devfn) {
935 *bus = pdev->bus->number;
936 *devfn = pdev->devfn;
937 }
938 goto out;
939 }
940 }
941 iommu = NULL;
942 out:
943 if (iommu_is_dummy(iommu, dev))
944 iommu = NULL;
945
946 rcu_read_unlock();
947
948 return iommu;
949}
950
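/* Flush CPU cache for page-table memory if the IOMMU is not coherent. */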
951static void domain_flush_cache(struct dmar_domain *domain,
952 void *addr, int size)
953{
954 if (!domain->iommu_coherency)
955 clflush_cache_range(addr, size);
956}
957
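/* Check whether a present context entry exists for (bus, devfn) on @iommu. */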
958static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
959{
960 struct context_entry *context;
961 int ret = 0;
962 unsigned long flags;
963
964 spin_lock_irqsave(&iommu->lock, flags);
965 context = iommu_context_addr(iommu, bus, devfn, 0);
966 if (context)
967 ret = context_present(context);
968 spin_unlock_irqrestore(&iommu->lock, flags);
969 return ret;
970}
971
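/*
 * Free all context tables referenced from the root table, then the root
 * table itself. In scalable mode each root entry may reference two tables.
 */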
972static void free_context_table(struct intel_iommu *iommu)
973{
974 int i;
975 unsigned long flags;
976 struct context_entry *context;
977
978 spin_lock_irqsave(&iommu->lock, flags);
979 if (!iommu->root_entry)
980 goto out;
982 for (i = 0; i < ROOT_ENTRY_NR; i++) {
983 context = iommu_context_addr(iommu, i, 0, 0);
984 if (context)
985 free_pgtable_page(context);
986
987 if (!sm_supported(iommu))
988 continue;
989
990 context = iommu_context_addr(iommu, i, 0x80, 0);
991 if (context)
992 free_pgtable_page(context);
993
994 }
995 free_pgtable_page(iommu->root_entry);
996 iommu->root_entry = NULL;
997out:
998 spin_unlock_irqrestore(&iommu->lock, flags);
999}
1000
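/*
 * Walk the page table to the PTE mapping @pfn at *target_level, allocating
 * intermediate page-table pages as needed. If *target_level is 0, stop at
 * the first superpage or non-present entry and report the level reached.
 * Returns NULL if @pfn is beyond the domain's reach or allocation fails.
 */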
1001static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1002 unsigned long pfn, int *target_level)
1003{
1004 struct dma_pte *parent, *pte;
1005 int level = agaw_to_level(domain->agaw);
1006 int offset;
1007
1008 BUG_ON(!domain->pgd);
1009
1010 if (!domain_pfn_supported(domain, pfn))
1011 /* Address beyond IOMMU's addressing capabilities. */
1012 return NULL;
1013
1014 parent = domain->pgd;
1015
1016 while (1) {
1017 void *tmp_page;
1018
1019 offset = pfn_level_offset(pfn, level);
1020 pte = &parent[offset];
1021 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1022 break;
1023 if (level == *target_level)
1024 break;
1025
1026 if (!dma_pte_present(pte)) {
1027 uint64_t pteval;
1028
1029 tmp_page = alloc_pgtable_page(domain->nid);
1030
1031 if (!tmp_page)
1032 return NULL;
1033
1034 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1035 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1036 if (domain_use_first_level(domain)) {
1037 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1038 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1039 pteval |= DMA_FL_PTE_ACCESS;
1040 }
1041 if (cmpxchg64(&pte->val, 0ULL, pteval))
1042 /* Someone else set it while we were thinking; use theirs. */
1043 free_pgtable_page(tmp_page);
1044 else
1045 domain_flush_cache(domain, pte, sizeof(*pte));
1046 }
1047 if (level == 1)
1048 break;
1049
1050 parent = phys_to_virt(dma_pte_addr(pte));
1051 level--;
1052 }
1053
1054 if (!*target_level)
1055 *target_level = level;
1056
1057 return pte;
1058}
1059
1060/* return the address's pte at a specific level */
1061static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1062 unsigned long pfn,
1063 int level, int *large_page)
1064{
1065 struct dma_pte *parent, *pte;
1066 int total = agaw_to_level(domain->agaw);
1067 int offset;
1068
1069 parent = domain->pgd;
1070 while (level <= total) {
1071 offset = pfn_level_offset(pfn, total);
1072 pte = &parent[offset];
1073 if (level == total)
1074 return pte;
1075
1076 if (!dma_pte_present(pte)) {
1077 *large_page = total;
1078 break;
1079 }
1080
1081 if (dma_pte_superpage(pte)) {
1082 *large_page = total;
1083 return pte;
1084 }
1085
1086 parent = phys_to_virt(dma_pte_addr(pte));
1087 total--;
1088 }
1089 return NULL;
1090}
1091
1092/* clear last level pte; a tlb flush should follow */
1093static void dma_pte_clear_range(struct dmar_domain *domain,
1094 unsigned long start_pfn,
1095 unsigned long last_pfn)
1096{
1097 unsigned int large_page;
1098 struct dma_pte *first_pte, *pte;
1099
1100 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1101 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1102 BUG_ON(start_pfn > last_pfn);
1103
1104 /* we don't need lock here; nobody else touches the iova range */
1105 do {
1106 large_page = 1;
1107 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1108 if (!pte) {
1109 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1110 continue;
1111 }
1112 do {
1113 dma_clear_pte(pte);
1114 start_pfn += lvl_to_nr_pages(large_page);
1115 pte++;
1116 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1117
1118 domain_flush_cache(domain, first_pte,
1119 (void *)pte - (void *)first_pte);
1120
1121 } while (start_pfn && start_pfn <= last_pfn);
1122}
1123
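/*
 * Helper for dma_pte_free_pagetable(): recursively free page-table pages
 * below @retain_level that are entirely covered by [start_pfn, last_pfn].
 */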
1124static void dma_pte_free_level(struct dmar_domain *domain, int level,
1125 int retain_level, struct dma_pte *pte,
1126 unsigned long pfn, unsigned long start_pfn,
1127 unsigned long last_pfn)
1128{
1129 pfn = max(start_pfn, pfn);
1130 pte = &pte[pfn_level_offset(pfn, level)];
1131
1132 do {
1133 unsigned long level_pfn;
1134 struct dma_pte *level_pte;
1135
1136 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1137 goto next;
1138
1139 level_pfn = pfn & level_mask(level);
1140 level_pte = phys_to_virt(dma_pte_addr(pte));
1141
1142 if (level > 2) {
1143 dma_pte_free_level(domain, level - 1, retain_level,
1144 level_pte, level_pfn, start_pfn,
1145 last_pfn);
1146 }
1147
1148 /*
1149 * Free the page table if we're below the level we want to
1150 * retain and the range covers the entire table.
1151 */
1152 if (level < retain_level && !(start_pfn > level_pfn ||
1153 last_pfn < level_pfn + level_size(level) - 1)) {
1154 dma_clear_pte(pte);
1155 domain_flush_cache(domain, pte, sizeof(*pte));
1156 free_pgtable_page(level_pte);
1157 }
1158next:
1159 pfn += level_size(level);
1160 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1161}
1162
1163/*
1164 * clear last level (leaf) ptes and free page table pages below the
1165 * level we wish to keep intact.
1166 */
1167static void dma_pte_free_pagetable(struct dmar_domain *domain,
1168 unsigned long start_pfn,
1169 unsigned long last_pfn,
1170 int retain_level)
1171{
1172 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174 BUG_ON(start_pfn > last_pfn);
1175
1176 dma_pte_clear_range(domain, start_pfn, last_pfn);
1177
1178 /* We don't need lock here; nobody else touches the iova range */
1179 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1180 domain->pgd, 0, start_pfn, last_pfn);
1181
1182 /* free pgd */
1183 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1184 free_pgtable_page(domain->pgd);
1185 domain->pgd = NULL;
1186 }
1187}
1188
1189/* When a page at a given level is being unlinked from its parent, we don't
1190 need to *modify* it at all. All we need to do is make a list of all the
1191 pages which can be freed just as soon as we've flushed the IOTLB and we
1192 know the hardware page-walk will no longer touch them.
1193 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1194 be freed. */
1195static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1196 int level, struct dma_pte *pte,
1197 struct page *freelist)
1198{
1199 struct page *pg;
1200
1201 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1202 pg->freelist = freelist;
1203 freelist = pg;
1204
1205 if (level == 1)
1206 return freelist;
1207
1208 pte = page_address(pg);
1209 do {
1210 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1211 freelist = dma_pte_list_pagetables(domain, level - 1,
1212 pte, freelist);
1213 pte++;
1214 } while (!first_pte_in_page(pte));
1215
1216 return freelist;
1217}
1218
1219static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1220 struct dma_pte *pte, unsigned long pfn,
1221 unsigned long start_pfn,
1222 unsigned long last_pfn,
1223 struct page *freelist)
1224{
1225 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226
1227 pfn = max(start_pfn, pfn);
1228 pte = &pte[pfn_level_offset(pfn, level)];
1229
1230 do {
1231 unsigned long level_pfn;
1232
1233 if (!dma_pte_present(pte))
1234 goto next;
1235
1236 level_pfn = pfn & level_mask(level);
1237
1238 /* If range covers entire pagetable, free it */
1239 if (start_pfn <= level_pfn &&
1240 last_pfn >= level_pfn + level_size(level) - 1) {
1241 /* These subordinate page tables are going away entirely. Don't
1242 bother to clear them; we're just going to *free* them. */
1243 if (level > 1 && !dma_pte_superpage(pte))
1244 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1245
1246 dma_clear_pte(pte);
1247 if (!first_pte)
1248 first_pte = pte;
1249 last_pte = pte;
1250 } else if (level > 1) {
1251 /* Recurse down into a level that isn't *entirely* obsolete */
1252 freelist = dma_pte_clear_level(domain, level - 1,
1253 phys_to_virt(dma_pte_addr(pte)),
1254 level_pfn, start_pfn, last_pfn,
1255 freelist);
1256 }
1257next:
1258 pfn += level_size(level);
1259 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1260
1261 if (first_pte)
1262 domain_flush_cache(domain, first_pte,
1263 (void *)++last_pte - (void *)first_pte);
1264
1265 return freelist;
1266}
1267
1268/* We can't just free the pages because the IOMMU may still be walking
1269 the page tables, and may have cached the intermediate levels. The
1270 pages can only be freed after the IOTLB flush has been done. */
1271static struct page *domain_unmap(struct dmar_domain *domain,
1272 unsigned long start_pfn,
1273 unsigned long last_pfn,
1274 struct page *freelist)
1275{
1276 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1277 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1278 BUG_ON(start_pfn > last_pfn);
1279
1280 /* we don't need lock here; nobody else touches the iova range */
1281 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1282 domain->pgd, 0, start_pfn, last_pfn,
1283 freelist);
1284
1285 /* free pgd */
1286 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1287 struct page *pgd_page = virt_to_page(domain->pgd);
1288 pgd_page->freelist = freelist;
1289 freelist = pgd_page;
1290
1291 domain->pgd = NULL;
1292 }
1293
1294 return freelist;
1295}
1296
1297static void dma_free_pagelist(struct page *freelist)
1298{
1299 struct page *pg;
1300
1301 while ((pg = freelist)) {
1302 freelist = pg->freelist;
1303 free_pgtable_page(page_address(pg));
1304 }
1305}
1306
1307/* iommu handling */
1308static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1309{
1310 struct root_entry *root;
1311 unsigned long flags;
1312
1313 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1314 if (!root) {
1315 pr_err("Allocating root entry for %s failed\n",
1316 iommu->name);
1317 return -ENOMEM;
1318 }
1319
1320 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1321
1322 spin_lock_irqsave(&iommu->lock, flags);
1323 iommu->root_entry = root;
1324 spin_unlock_irqrestore(&iommu->lock, flags);
1325
1326 return 0;
1327}
1328
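/*
 * Program the root table address (with the scalable-mode flag if supported),
 * issue a Set Root Table Pointer operation, then globally invalidate the
 * context cache, the PASID cache (in scalable mode) and the IOTLB.
 */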
1329static void iommu_set_root_entry(struct intel_iommu *iommu)
1330{
1331 u64 addr;
1332 u32 sts;
1333 unsigned long flag;
1334
1335 addr = virt_to_phys(iommu->root_entry);
1336 if (sm_supported(iommu))
1337 addr |= DMA_RTADDR_SMT;
1338
1339 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1341
1342 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1343
1344 /* Make sure hardware complete it */
1345 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346 readl, (sts & DMA_GSTS_RTPS), sts);
1347
1348 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349
1350 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1351 if (sm_supported(iommu))
1352 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1353 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1354}
1355
1356void iommu_flush_write_buffer(struct intel_iommu *iommu)
1357{
1358 u32 val;
1359 unsigned long flag;
1360
1361 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1362 return;
1363
1364 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1365 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1366
1367 /* Make sure hardware complete it */
1368 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1369 readl, (!(val & DMA_GSTS_WBFS)), val);
1370
1371 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1372}
1373
1374/* return value determines if we need a write buffer flush */
1375static void __iommu_flush_context(struct intel_iommu *iommu,
1376 u16 did, u16 source_id, u8 function_mask,
1377 u64 type)
1378{
1379 u64 val = 0;
1380 unsigned long flag;
1381
1382 switch (type) {
1383 case DMA_CCMD_GLOBAL_INVL:
1384 val = DMA_CCMD_GLOBAL_INVL;
1385 break;
1386 case DMA_CCMD_DOMAIN_INVL:
1387 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1388 break;
1389 case DMA_CCMD_DEVICE_INVL:
1390 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1391 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1392 break;
1393 default:
1394 BUG();
1395 }
1396 val |= DMA_CCMD_ICC;
1397
1398 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1399 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1400
1401 /* Make sure hardware complete it */
1402 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1403 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1404
1405 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1406}
1407
1408/* return value determines if we need a write buffer flush */
1409static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1410 u64 addr, unsigned int size_order, u64 type)
1411{
1412 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1413 u64 val = 0, val_iva = 0;
1414 unsigned long flag;
1415
1416 switch (type) {
1417 case DMA_TLB_GLOBAL_FLUSH:
1418 /* global flush doesn't need to set IVA_REG */
1419 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1420 break;
1421 case DMA_TLB_DSI_FLUSH:
1422 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1423 break;
1424 case DMA_TLB_PSI_FLUSH:
1425 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1426 /* IH bit is passed in as part of address */
1427 val_iva = size_order | addr;
1428 break;
1429 default:
1430 BUG();
1431 }
1432 /* Note: set drain read/write */
1433#if 0
1434 /*
1435 * This is probably meant to be extra safe. Looks like we can
1436 * ignore it without any impact.
1437 */
1438 if (cap_read_drain(iommu->cap))
1439 val |= DMA_TLB_READ_DRAIN;
1440#endif
1441 if (cap_write_drain(iommu->cap))
1442 val |= DMA_TLB_WRITE_DRAIN;
1443
1444 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1445 /* Note: Only uses first TLB reg currently */
1446 if (val_iva)
1447 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1448 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1449
1450 /* Make sure hardware complete it */
1451 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1452 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1453
1454 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1455
1456 /* check IOTLB invalidation granularity */
1457 if (DMA_TLB_IAIG(val) == 0)
1458 pr_err("Flush IOTLB failed\n");
1459 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1460 pr_debug("TLB flush request %Lx, actual %Lx\n",
1461 (unsigned long long)DMA_TLB_IIRG(type),
1462 (unsigned long long)DMA_TLB_IAIG(val));
1463}
1464
1465static struct device_domain_info *
1466iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1467 u8 bus, u8 devfn)
1468{
1469 struct device_domain_info *info;
1470
1471 assert_spin_locked(&device_domain_lock);
1472
1473 if (!iommu->qi)
1474 return NULL;
1475
1476 list_for_each_entry(info, &domain->devices, link)
1477 if (info->iommu == iommu && info->bus == bus &&
1478 info->devfn == devfn) {
1479 if (info->ats_supported && info->dev)
1480 return info;
1481 break;
1482 }
1483
1484 return NULL;
1485}
1486
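/*
 * Recompute domain->has_iotlb_device: true if any device (or subdevice)
 * attached to the domain has ATS enabled.
 */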
1487static void domain_update_iotlb(struct dmar_domain *domain)
1488{
1489 struct device_domain_info *info;
1490 bool has_iotlb_device = false;
1491
1492 assert_spin_locked(&device_domain_lock);
1493
1494 list_for_each_entry(info, &domain->devices, link)
1495 if (info->ats_enabled) {
1496 has_iotlb_device = true;
1497 break;
1498 }
1499
1500 if (!has_iotlb_device) {
1501 struct subdev_domain_info *sinfo;
1502
1503 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1504 info = get_domain_info(sinfo->pdev);
1505 if (info && info->ats_enabled) {
1506 has_iotlb_device = true;
1507 break;
1508 }
1509 }
1510 }
1511
1512 domain->has_iotlb_device = has_iotlb_device;
1513}
1514
1515static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1516{
1517 struct pci_dev *pdev;
1518
1519 assert_spin_locked(&device_domain_lock);
1520
1521 if (!info || !dev_is_pci(info->dev))
1522 return;
1523
1524 pdev = to_pci_dev(info->dev);
1525 /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1526 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1527 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1528 * reserved, which should be set to 0.
1529 */
1530 if (!ecap_dit(info->iommu->ecap))
1531 info->pfsid = 0;
1532 else {
1533 struct pci_dev *pf_pdev;
1534
1535 /* pdev will be returned if device is not a vf */
1536 pf_pdev = pci_physfn(pdev);
1537 info->pfsid = pci_dev_id(pf_pdev);
1538 }
1539
1540#ifdef CONFIG_INTEL_IOMMU_SVM
1541 /* The PCIe spec, in its wisdom, declares that the behaviour of
1542 the device is undefined if you enable PASID support after ATS
1543 support. So always enable PASID support on devices which
1544 have it, even if we can't yet know if we're ever going to
1545 use it. */
1546 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1547 info->pasid_enabled = 1;
1548
1549 if (info->pri_supported &&
1550 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1551 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1552 info->pri_enabled = 1;
1553#endif
1554 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1555 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1556 info->ats_enabled = 1;
1557 domain_update_iotlb(info->domain);
1558 info->ats_qdep = pci_ats_queue_depth(pdev);
1559 }
1560}
1561
1562static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1563{
1564 struct pci_dev *pdev;
1565
1566 assert_spin_locked(&device_domain_lock);
1567
1568 if (!dev_is_pci(info->dev))
1569 return;
1570
1571 pdev = to_pci_dev(info->dev);
1572
1573 if (info->ats_enabled) {
1574 pci_disable_ats(pdev);
1575 info->ats_enabled = 0;
1576 domain_update_iotlb(info->domain);
1577 }
1578#ifdef CONFIG_INTEL_IOMMU_SVM
1579 if (info->pri_enabled) {
1580 pci_disable_pri(pdev);
1581 info->pri_enabled = 0;
1582 }
1583 if (info->pasid_enabled) {
1584 pci_disable_pasid(pdev);
1585 info->pasid_enabled = 0;
1586 }
1587#endif
1588}
1589
1590static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1591 u64 addr, unsigned int mask)
1592{
1593 u16 sid, qdep;
1594
1595 if (!info || !info->ats_enabled)
1596 return;
1597
1598 sid = info->bus << 8 | info->devfn;
1599 qdep = info->ats_qdep;
1600 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1601 qdep, addr, mask);
1602}
1603
1604static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1605 u64 addr, unsigned mask)
1606{
1607 unsigned long flags;
1608 struct device_domain_info *info;
1609 struct subdev_domain_info *sinfo;
1610
1611 if (!domain->has_iotlb_device)
1612 return;
1613
1614 spin_lock_irqsave(&device_domain_lock, flags);
1615 list_for_each_entry(info, &domain->devices, link)
1616 __iommu_flush_dev_iotlb(info, addr, mask);
1617
1618 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1619 info = get_domain_info(sinfo->pdev);
1620 __iommu_flush_dev_iotlb(info, addr, mask);
1621 }
1622 spin_unlock_irqrestore(&device_domain_lock, flags);
1623}
1624
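/*
 * Flush PASID-granular IOTLB entries for a first-level domain: the default
 * PASID (if set) and, when devices are attached, the RID2PASID entry used
 * for requests without PASID.
 */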
1625static void domain_flush_piotlb(struct intel_iommu *iommu,
1626 struct dmar_domain *domain,
1627 u64 addr, unsigned long npages, bool ih)
1628{
1629 u16 did = domain->iommu_did[iommu->seq_id];
1630
1631 if (domain->default_pasid)
1632 qi_flush_piotlb(iommu, did, domain->default_pasid,
1633 addr, npages, ih);
1634
1635 if (!list_empty(&domain->devices))
1636 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1637}
1638
1639static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1640 struct dmar_domain *domain,
1641 unsigned long pfn, unsigned int pages,
1642 int ih, int map)
1643{
1644 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1645 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1646 u16 did = domain->iommu_did[iommu->seq_id];
1647
1648 BUG_ON(pages == 0);
1649
1650 if (ih)
1651 ih = 1 << 6;
1652
1653 if (domain_use_first_level(domain)) {
1654 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1655 } else {
1656 /*
1657 * Fallback to domain selective flush if no PSI support or
1658 * the size is too big. PSI requires page size to be 2 ^ x,
1659 * and the base address is naturally aligned to the size.
1660 */
1661 if (!cap_pgsel_inv(iommu->cap) ||
1662 mask > cap_max_amask_val(iommu->cap))
1663 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1664 DMA_TLB_DSI_FLUSH);
1665 else
1666 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1667 DMA_TLB_PSI_FLUSH);
1668 }
1669
1670 /*
1671 * In caching mode, changes of pages from non-present to present require
1672 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1673 */
1674 if (!cap_caching_mode(iommu->cap) || !map)
1675 iommu_flush_dev_iotlb(domain, addr, mask);
1676}
1677
1678/* Notification for newly created mappings */
1679static inline void __mapping_notify_one(struct intel_iommu *iommu,
1680 struct dmar_domain *domain,
1681 unsigned long pfn, unsigned int pages)
1682{
1683 /*
1684 * It's a non-present to present mapping. Only flush if caching mode
1685 * and second level.
1686 */
1687 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1688 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1689 else
1690 iommu_flush_write_buffer(iommu);
1691}
1692
1693static void intel_flush_iotlb_all(struct iommu_domain *domain)
1694{
1695 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1696 int idx;
1697
1698 for_each_domain_iommu(idx, dmar_domain) {
1699 struct intel_iommu *iommu = g_iommus[idx];
1700 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1701
1702 if (domain_use_first_level(dmar_domain))
1703 domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1704 else
1705 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1706 DMA_TLB_DSI_FLUSH);
1707
1708 if (!cap_caching_mode(iommu->cap))
1709 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1710 0, MAX_AGAW_PFN_WIDTH);
1711 }
1712}
1713
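/*
 * Disable the protected low/high memory regions, if implemented, so the
 * hardware no longer blocks DMA to those ranges.
 */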
1714static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1715{
1716 u32 pmen;
1717 unsigned long flags;
1718
1719 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1720 return;
1721
1722 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1723 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1724 pmen &= ~DMA_PMEN_EPM;
1725 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1726
1727 /* wait for the protected region status bit to clear */
1728 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1729 readl, !(pmen & DMA_PMEN_PRS), pmen);
1730
1731 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1732}
1733
1734static void iommu_enable_translation(struct intel_iommu *iommu)
1735{
1736 u32 sts;
1737 unsigned long flags;
1738
1739 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1740 iommu->gcmd |= DMA_GCMD_TE;
1741 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1742
1743 /* Make sure hardware complete it */
1744 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1745 readl, (sts & DMA_GSTS_TES), sts);
1746
1747 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1748}
1749
1750static void iommu_disable_translation(struct intel_iommu *iommu)
1751{
1752 u32 sts;
1753 unsigned long flag;
1754
1755 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1756 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1757 return;
1758
1759 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1760 iommu->gcmd &= ~DMA_GCMD_TE;
1761 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1762
1763 /* Make sure hardware complete it */
1764 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1765 readl, (!(sts & DMA_GSTS_TES)), sts);
1766
1767 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1768}
1769
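/*
 * Allocate the per-IOMMU domain-id bitmap and the two-level domain lookup
 * array, and reserve the ids that must never be handed out (domain-id 0,
 * plus FLPT_DEFAULT_DID in scalable mode).
 */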
1770static int iommu_init_domains(struct intel_iommu *iommu)
1771{
1772 u32 ndomains, nlongs;
1773 size_t size;
1774
1775 ndomains = cap_ndoms(iommu->cap);
1776 pr_debug("%s: Number of Domains supported <%d>\n",
1777 iommu->name, ndomains);
1778 nlongs = BITS_TO_LONGS(ndomains);
1779
1780 spin_lock_init(&iommu->lock);
1781
1782 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1783 if (!iommu->domain_ids) {
1784 pr_err("%s: Allocating domain id array failed\n",
1785 iommu->name);
1786 return -ENOMEM;
1787 }
1788
1789 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1790 iommu->domains = kzalloc(size, GFP_KERNEL);
1791
1792 if (iommu->domains) {
1793 size = 256 * sizeof(struct dmar_domain *);
1794 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1795 }
1796
1797 if (!iommu->domains || !iommu->domains[0]) {
1798 pr_err("%s: Allocating domain array failed\n",
1799 iommu->name);
1800 kfree(iommu->domain_ids);
1801 kfree(iommu->domains);
1802 iommu->domain_ids = NULL;
1803 iommu->domains = NULL;
1804 return -ENOMEM;
1805 }
1806
1807 /*
1808 * If Caching mode is set, then invalid translations are tagged
1809 * with domain-id 0, hence we need to pre-allocate it. We also
1810 * use domain-id 0 as a marker for non-allocated domain-id, so
1811 * make sure it is not used for a real domain.
1812 */
1813 set_bit(0, iommu->domain_ids);
1814
1815 /*
1816 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1817 * entry for first-level or pass-through translation modes should
1818 * be programmed with a domain id different from those used for
1819 * second-level or nested translation. We reserve a domain id for
1820 * this purpose.
1821 */
1822 if (sm_supported(iommu))
1823 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1824
1825 return 0;
1826}
1827
1828static void disable_dmar_iommu(struct intel_iommu *iommu)
1829{
1830 struct device_domain_info *info, *tmp;
1831 unsigned long flags;
1832
1833 if (!iommu->domains || !iommu->domain_ids)
1834 return;
1835
1836 spin_lock_irqsave(&device_domain_lock, flags);
1837 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1838 if (info->iommu != iommu)
1839 continue;
1840
1841 if (!info->dev || !info->domain)
1842 continue;
1843
1844 __dmar_remove_one_dev_info(info);
1845 }
1846 spin_unlock_irqrestore(&device_domain_lock, flags);
1847
1848 if (iommu->gcmd & DMA_GCMD_TE)
1849 iommu_disable_translation(iommu);
1850}
1851
1852static void free_dmar_iommu(struct intel_iommu *iommu)
1853{
1854 if ((iommu->domains) && (iommu->domain_ids)) {
1855 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1856 int i;
1857
1858 for (i = 0; i < elems; i++)
1859 kfree(iommu->domains[i]);
1860 kfree(iommu->domains);
1861 kfree(iommu->domain_ids);
1862 iommu->domains = NULL;
1863 iommu->domain_ids = NULL;
1864 }
1865
1866 g_iommus[iommu->seq_id] = NULL;
1867
1868 /* free context mapping */
1869 free_context_table(iommu);
1870
1871#ifdef CONFIG_INTEL_IOMMU_SVM
1872 if (pasid_supported(iommu)) {
1873 if (ecap_prs(iommu->ecap))
1874 intel_svm_finish_prq(iommu);
1875 }
1876 if (vccap_pasid(iommu->vccap))
1877 ioasid_unregister_allocator(&iommu->pasid_allocator);
1878
1879#endif
1880}
1881
1882/*
1883 * Check and return whether first level is used by default for
1884 * DMA translation.
1885 */
1886static bool first_level_by_default(void)
1887{
1888 return scalable_mode_support() && intel_cap_flts_sanity();
1889}
1890
1891static struct dmar_domain *alloc_domain(int flags)
1892{
1893 struct dmar_domain *domain;
1894
1895 domain = alloc_domain_mem();
1896 if (!domain)
1897 return NULL;
1898
1899 memset(domain, 0, sizeof(*domain));
1900 domain->nid = NUMA_NO_NODE;
1901 domain->flags = flags;
1902 if (first_level_by_default())
1903 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1904 domain->has_iotlb_device = false;
1905 INIT_LIST_HEAD(&domain->devices);
1906 INIT_LIST_HEAD(&domain->subdevices);
1907
1908 return domain;
1909}
1910
1911/* Must be called with iommu->lock */
1912static int domain_attach_iommu(struct dmar_domain *domain,
1913 struct intel_iommu *iommu)
1914{
1915 unsigned long ndomains;
1916 int num;
1917
1918 assert_spin_locked(&device_domain_lock);
1919 assert_spin_locked(&iommu->lock);
1920
1921 domain->iommu_refcnt[iommu->seq_id] += 1;
1922 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1923 ndomains = cap_ndoms(iommu->cap);
1924 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1925
1926 if (num >= ndomains) {
1927 pr_err("%s: No free domain ids\n", iommu->name);
1928 domain->iommu_refcnt[iommu->seq_id] -= 1;
1929 return -ENOSPC;
1930 }
1931
1932 set_bit(num, iommu->domain_ids);
1933 set_iommu_domain(iommu, num, domain);
1934
1935 domain->iommu_did[iommu->seq_id] = num;
1936 domain->nid = iommu->node;
1937
1938 domain_update_iommu_cap(domain);
1939 }
1940
1941 return 0;
1942}
1943
1944static void domain_detach_iommu(struct dmar_domain *domain,
1945 struct intel_iommu *iommu)
1946{
1947 int num;
1948
1949 assert_spin_locked(&device_domain_lock);
1950 assert_spin_locked(&iommu->lock);
1951
1952 domain->iommu_refcnt[iommu->seq_id] -= 1;
1953 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1954 num = domain->iommu_did[iommu->seq_id];
1955 clear_bit(num, iommu->domain_ids);
1956 set_iommu_domain(iommu, num, NULL);
1957
1958 domain_update_iommu_cap(domain);
1959 domain->iommu_did[iommu->seq_id] = 0;
1960 }
1961}
1962
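/*
 * Round a guest address width up to the next adjusted guest address width
 * supported by the page-table layout (12 plus a multiple of 9 bits), capped
 * at 64. For example, 48 stays 48, while 50 rounds up to 57.
 */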
1963static inline int guestwidth_to_adjustwidth(int gaw)
1964{
1965 int agaw;
1966 int r = (gaw - 12) % 9;
1967
1968 if (r == 0)
1969 agaw = gaw;
1970 else
1971 agaw = gaw + 9 - r;
1972 if (agaw > 64)
1973 agaw = 64;
1974 return agaw;
1975}
1976
1977static void domain_exit(struct dmar_domain *domain)
1978{
1979
1980 /* Remove associated devices and clear attached or cached domains */
1981 domain_remove_dev_info(domain);
1982
1983 /* destroy iovas */
1984 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1985 iommu_put_dma_cookie(&domain->domain);
1986
1987 if (domain->pgd) {
1988 struct page *freelist;
1989
1990 freelist = domain_unmap(domain, 0,
1991 DOMAIN_MAX_PFN(domain->gaw), NULL);
1992 dma_free_pagelist(freelist);
1993 }
1994
1995 free_domain_mem(domain);
1996}
1997
1998/*
1999 * Get the PASID directory size for scalable mode context entry.
2000 * Value of X in the PDTS field of a scalable mode context entry
2001 * indicates PASID directory with 2^(X + 7) entries.
2002 */
2003static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2004{
2005 int pds, max_pde;
2006
2007 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2008 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2009 if (pds < 7)
2010 return 0;
2011
2012 return pds - 7;
2013}
2014
2015/*
2016 * Set the RID_PASID field of a scalable mode context entry. The
2017 * IOMMU hardware will use the PASID value set in this field for
2018 * DMA translations of DMA requests without PASID.
2019 */
2020static inline void
2021context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2022{
2023 context->hi |= pasid & ((1 << 20) - 1);
2024}
2025
2026/*
2027 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2028 * entry.
2029 */
2030static inline void context_set_sm_dte(struct context_entry *context)
2031{
2032 context->lo |= (1 << 2);
2033}
2034
2035/*
2036 * Set the PRE(Page Request Enable) field of a scalable mode context
2037 * entry.
2038 */
2039static inline void context_set_sm_pre(struct context_entry *context)
2040{
2041 context->lo |= (1 << 4);
2042}
2043
2044/* Convert value to context PASID directory size field coding. */
2045#define context_pdts(pds) (((pds) & 0x7) << 9)
2046
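/*
 * Install the context entry for (bus, devfn) on @iommu so the device is
 * translated by @domain. In scalable mode the entry points to the PASID
 * directory in @table; in legacy mode it points to the domain's page table
 * or is configured for pass-through.
 */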
2047static int domain_context_mapping_one(struct dmar_domain *domain,
2048 struct intel_iommu *iommu,
2049 struct pasid_table *table,
2050 u8 bus, u8 devfn)
2051{
2052 u16 did = domain->iommu_did[iommu->seq_id];
2053 int translation = CONTEXT_TT_MULTI_LEVEL;
2054 struct device_domain_info *info = NULL;
2055 struct context_entry *context;
2056 unsigned long flags;
2057 int ret;
2058
2059 WARN_ON(did == 0);
2060
2061 if (hw_pass_through && domain_type_is_si(domain))
2062 translation = CONTEXT_TT_PASS_THROUGH;
2063
2064 pr_debug("Set context mapping for %02x:%02x.%d\n",
2065 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2066
2067 BUG_ON(!domain->pgd);
2068
2069 spin_lock_irqsave(&device_domain_lock, flags);
2070 spin_lock(&iommu->lock);
2071
2072 ret = -ENOMEM;
2073 context = iommu_context_addr(iommu, bus, devfn, 1);
2074 if (!context)
2075 goto out_unlock;
2076
2077 ret = 0;
2078 if (context_present(context))
2079 goto out_unlock;
2080
2081 /*
2082 * For kdump cases, old valid entries may be cached due to the
2083 * in-flight DMA and copied pgtable, but there is no unmapping
2084 * behaviour for them, thus we need an explicit cache flush for
2085 * the newly-mapped device. For kdump, at this point, the device
2086 * is supposed to finish reset at its driver probe stage, so no
2087 * in-flight DMA will exist, and we don't need to worry about it
2088 * hereafter.
2089 */
2090 if (context_copied(context)) {
2091 u16 did_old = context_domain_id(context);
2092
2093 if (did_old < cap_ndoms(iommu->cap)) {
2094 iommu->flush.flush_context(iommu, did_old,
2095 (((u16)bus) << 8) | devfn,
2096 DMA_CCMD_MASK_NOBIT,
2097 DMA_CCMD_DEVICE_INVL);
2098 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2099 DMA_TLB_DSI_FLUSH);
2100 }
2101 }
2102
2103 context_clear_entry(context);
2104
2105 if (sm_supported(iommu)) {
2106 unsigned long pds;
2107
2108 WARN_ON(!table);
2109
2110 /* Setup the PASID DIR pointer: */
2111 pds = context_get_sm_pds(table);
2112 context->lo = (u64)virt_to_phys(table->table) |
2113 context_pdts(pds);
2114
2115 /* Setup the RID_PASID field: */
2116 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2117
2118 /*
2119 * Setup the Device-TLB enable bit and Page request
2120 * Enable bit:
2121 */
2122 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2123 if (info && info->ats_supported)
2124 context_set_sm_dte(context);
2125 if (info && info->pri_supported)
2126 context_set_sm_pre(context);
2127 } else {
2128 struct dma_pte *pgd = domain->pgd;
2129 int agaw;
2130
2131 context_set_domain_id(context, did);
2132
2133 if (translation != CONTEXT_TT_PASS_THROUGH) {
2134 /*
2135 * Skip top levels of page tables for iommu which has
2136 * less agaw than default. Unnecessary for PT mode.
2137 */
2138 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2139 ret = -ENOMEM;
2140 pgd = phys_to_virt(dma_pte_addr(pgd));
2141 if (!dma_pte_present(pgd))
2142 goto out_unlock;
2143 }
2144
2145 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146 if (info && info->ats_supported)
2147 translation = CONTEXT_TT_DEV_IOTLB;
2148 else
2149 translation = CONTEXT_TT_MULTI_LEVEL;
2150
2151 context_set_address_root(context, virt_to_phys(pgd));
2152 context_set_address_width(context, agaw);
2153 } else {
2154 /*
2155 * In pass through mode, AW must be programmed to
2156 * indicate the largest AGAW value supported by
2157 * hardware. And ASR is ignored by hardware.
2158 */
2159 context_set_address_width(context, iommu->msagaw);
2160 }
2161
2162 context_set_translation_type(context, translation);
2163 }
2164
2165 context_set_fault_enable(context);
2166 context_set_present(context);
2167 if (!ecap_coherent(iommu->ecap))
2168 clflush_cache_range(context, sizeof(*context));
2169
2170 /*
2171 * It's a non-present to present mapping. If hardware doesn't cache
2172 * non-present entries we only need to flush the write-buffer. If it
2173 * _does_ cache non-present entries, then it does so in the special
2174 * domain #0, which we have to flush:
2175 */
2176 if (cap_caching_mode(iommu->cap)) {
2177 iommu->flush.flush_context(iommu, 0,
2178 (((u16)bus) << 8) | devfn,
2179 DMA_CCMD_MASK_NOBIT,
2180 DMA_CCMD_DEVICE_INVL);
2181 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2182 } else {
2183 iommu_flush_write_buffer(iommu);
2184 }
2185 iommu_enable_dev_iotlb(info);
2186
2187 ret = 0;
2188
2189out_unlock:
2190 spin_unlock(&iommu->lock);
2191 spin_unlock_irqrestore(&device_domain_lock, flags);
2192
2193 return ret;
2194}
2195
2196struct domain_context_mapping_data {
2197 struct dmar_domain *domain;
2198 struct intel_iommu *iommu;
2199 struct pasid_table *table;
2200};
2201
2202static int domain_context_mapping_cb(struct pci_dev *pdev,
2203 u16 alias, void *opaque)
2204{
2205 struct domain_context_mapping_data *data = opaque;
2206
2207 return domain_context_mapping_one(data->domain, data->iommu,
2208 data->table, PCI_BUS_NUM(alias),
2209 alias & 0xff);
2210}
2211
2212static int
2213domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2214{
2215 struct domain_context_mapping_data data;
2216 struct pasid_table *table;
2217 struct intel_iommu *iommu;
2218 u8 bus, devfn;
2219
2220 iommu = device_to_iommu(dev, &bus, &devfn);
2221 if (!iommu)
2222 return -ENODEV;
2223
2224 table = intel_pasid_get_table(dev);
2225
2226 if (!dev_is_pci(dev))
2227 return domain_context_mapping_one(domain, iommu, table,
2228 bus, devfn);
2229
2230 data.domain = domain;
2231 data.iommu = iommu;
2232 data.table = table;
2233
2234 return pci_for_each_dma_alias(to_pci_dev(dev),
2235 &domain_context_mapping_cb, &data);
2236}
2237
2238static int domain_context_mapped_cb(struct pci_dev *pdev,
2239 u16 alias, void *opaque)
2240{
2241 struct intel_iommu *iommu = opaque;
2242
2243 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2244}
2245
2246static int domain_context_mapped(struct device *dev)
2247{
2248 struct intel_iommu *iommu;
2249 u8 bus, devfn;
2250
2251 iommu = device_to_iommu(dev, &bus, &devfn);
2252 if (!iommu)
2253 return -ENODEV;
2254
2255 if (!dev_is_pci(dev))
2256 return device_context_mapped(iommu, bus, devfn);
2257
2258 return !pci_for_each_dma_alias(to_pci_dev(dev),
2259 domain_context_mapped_cb, iommu);
2260}
2261
2262/* Returns the number of VT-d pages, but aligned to the MM page size */
2263static inline unsigned long aligned_nrpages(unsigned long host_addr,
2264 size_t size)
2265{
2266 host_addr &= ~PAGE_MASK;
2267 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2268}
2269
2270/* Return largest possible superpage level for a given mapping */
2271static inline int hardware_largepage_caps(struct dmar_domain *domain,
2272 unsigned long iov_pfn,
2273 unsigned long phy_pfn,
2274 unsigned long pages)
2275{
2276 int support, level = 1;
2277 unsigned long pfnmerge;
2278
2279 support = domain->iommu_superpage;
2280
2281 /* To use a large page, the virtual *and* physical addresses
2282 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2283 of them will mean we have to use smaller pages. So just
2284 merge them and check both at once. */
2285 pfnmerge = iov_pfn | phy_pfn;
2286
2287 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2288 pages >>= VTD_STRIDE_SHIFT;
2289 if (!pages)
2290 break;
2291 pfnmerge >>= VTD_STRIDE_SHIFT;
2292 level++;
2293 support--;
2294 }
2295 return level;
2296}
2297
2298/*
2299 * Ensure that old small page tables are removed to make room for superpage(s).
2300 * We're going to add new large pages, so make sure we don't remove their parent
2301 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2302 */
2303static void switch_to_super_page(struct dmar_domain *domain,
2304 unsigned long start_pfn,
2305 unsigned long end_pfn, int level)
2306{
2307 unsigned long lvl_pages = lvl_to_nr_pages(level);
2308 struct dma_pte *pte = NULL;
2309 int i;
2310
2311 while (start_pfn <= end_pfn) {
2312 if (!pte)
2313 pte = pfn_to_dma_pte(domain, start_pfn, &level);
2314
2315 if (dma_pte_present(pte)) {
2316 dma_pte_free_pagetable(domain, start_pfn,
2317 start_pfn + lvl_pages - 1,
2318 level + 1);
2319
2320 for_each_domain_iommu(i, domain)
2321 iommu_flush_iotlb_psi(g_iommus[i], domain,
2322 start_pfn, lvl_pages,
2323 0, 0);
2324 }
2325
2326 pte++;
2327 start_pfn += lvl_pages;
2328 if (first_pte_in_page(pte))
2329 pte = NULL;
2330 }
2331}
2332
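/*
 * Map @nr_pages pages starting at @iov_pfn to the physical range starting
 * at @phys_pfn with the given protection bits, using superpages whenever
 * alignment, remaining size and hardware support allow. Leaf PTE cache
 * flushing is left to the iotlb_sync_map() callback.
 */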
2333static int
2334__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2335 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2336{
2337 unsigned int largepage_lvl = 0;
2338 unsigned long lvl_pages = 0;
2339 struct dma_pte *pte = NULL;
2340 phys_addr_t pteval;
2341 u64 attr;
2342
2343 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2344
2345 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2346 return -EINVAL;
2347
2348 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2349 attr |= DMA_FL_PTE_PRESENT;
2350 if (domain_use_first_level(domain)) {
2351 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2352
2353 if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2354 attr |= DMA_FL_PTE_ACCESS;
2355 if (prot & DMA_PTE_WRITE)
2356 attr |= DMA_FL_PTE_DIRTY;
2357 }
2358 }
2359
2360 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2361
2362 while (nr_pages > 0) {
2363 uint64_t tmp;
2364
2365 if (!pte) {
2366 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2367 phys_pfn, nr_pages);
2368
2369 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2370 if (!pte)
2371 return -ENOMEM;
2372 /* It is a large page */
2373 if (largepage_lvl > 1) {
2374 unsigned long end_pfn;
2375
2376 pteval |= DMA_PTE_LARGE_PAGE;
2377 end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
2378 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2379 } else {
2380 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2381 }
2382
2383 }
2384 /* We don't need lock here, nobody else
2385 * touches the iova range
2386 */
2387 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2388 if (tmp) {
2389 static int dumps = 5;
2390 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2391 iov_pfn, tmp, (unsigned long long)pteval);
2392 if (dumps) {
2393 dumps--;
2394 debug_dma_dump_mappings(NULL);
2395 }
2396 WARN_ON(1);
2397 }
2398
2399 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2400
2401 BUG_ON(nr_pages < lvl_pages);
2402
2403 nr_pages -= lvl_pages;
2404 iov_pfn += lvl_pages;
2405 phys_pfn += lvl_pages;
2406 pteval += lvl_pages * VTD_PAGE_SIZE;
2407
2408 /* If the next PTE would be the first in a new page, then we
2409 * need to flush the cache on the entries we've just written.
2410 * And then we'll need to recalculate 'pte', so clear it and
2411 * let it get set again in the if (!pte) block above.
2412 *
2413 * If we're done (!nr_pages) we need to flush the cache too.
2414 *
2415 * Also if we've been setting superpages, we may need to
2416 * recalculate 'pte' and switch back to smaller pages for the
2417 * end of the mapping, if the trailing size is not enough to
2418 * use another superpage (i.e. nr_pages < lvl_pages).
2419 *
2420 * We leave clflush for the leaf pte changes to iotlb_sync_map()
2421 * callback.
2422 */
2423 pte++;
2424 if (!nr_pages || first_pte_in_page(pte) ||
2425 (largepage_lvl > 1 && nr_pages < lvl_pages))
2426 pte = NULL;
2427 }
2428
2429 return 0;
2430}
2431
2432static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2433{
2434 struct intel_iommu *iommu = info->iommu;
2435 struct context_entry *context;
2436 unsigned long flags;
2437 u16 did_old;
2438
2439 if (!iommu)
2440 return;
2441
2442 spin_lock_irqsave(&iommu->lock, flags);
2443 context = iommu_context_addr(iommu, bus, devfn, 0);
2444 if (!context) {
2445 spin_unlock_irqrestore(&iommu->lock, flags);
2446 return;
2447 }
2448
2449 if (sm_supported(iommu)) {
2450 if (hw_pass_through && domain_type_is_si(info->domain))
2451 did_old = FLPT_DEFAULT_DID;
2452 else
2453 did_old = info->domain->iommu_did[iommu->seq_id];
2454 } else {
2455 did_old = context_domain_id(context);
2456 }
2457
2458 context_clear_entry(context);
2459 __iommu_flush_cache(iommu, context, sizeof(*context));
2460 spin_unlock_irqrestore(&iommu->lock, flags);
2461 iommu->flush.flush_context(iommu,
2462 did_old,
2463 (((u16)bus) << 8) | devfn,
2464 DMA_CCMD_MASK_NOBIT,
2465 DMA_CCMD_DEVICE_INVL);
2466
2467 if (sm_supported(iommu))
2468 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2469
2470 iommu->flush.flush_iotlb(iommu,
2471 did_old,
2472 0,
2473 0,
2474 DMA_TLB_DSI_FLUSH);
2475
2476 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2477}
2478
2479static inline void unlink_domain_info(struct device_domain_info *info)
2480{
2481 assert_spin_locked(&device_domain_lock);
2482 list_del(&info->link);
2483 list_del(&info->global);
2484 if (info->dev)
2485 dev_iommu_priv_set(info->dev, NULL);
2486}
2487
2488static void domain_remove_dev_info(struct dmar_domain *domain)
2489{
2490 struct device_domain_info *info, *tmp;
2491 unsigned long flags;
2492
2493 spin_lock_irqsave(&device_domain_lock, flags);
2494 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2495 __dmar_remove_one_dev_info(info);
2496 spin_unlock_irqrestore(&device_domain_lock, flags);
2497}
2498
2499struct dmar_domain *find_domain(struct device *dev)
2500{
2501 struct device_domain_info *info;
2502
2503 if (unlikely(!dev || !dev->iommu))
2504 return NULL;
2505
2506 if (unlikely(attach_deferred(dev)))
2507 return NULL;
2508
2509 /* No lock here, assumes no domain exit in normal case */
2510 info = get_domain_info(dev);
2511 if (likely(info))
2512 return info->domain;
2513
2514 return NULL;
2515}
2516
2517static inline struct device_domain_info *
2518dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2519{
2520 struct device_domain_info *info;
2521
2522 list_for_each_entry(info, &device_domain_list, global)
2523 if (info->segment == segment && info->bus == bus &&
2524 info->devfn == devfn)
2525 return info;
2526
2527 return NULL;
2528}
2529
2530static int domain_setup_first_level(struct intel_iommu *iommu,
2531 struct dmar_domain *domain,
2532 struct device *dev,
2533 u32 pasid)
2534{
2535 struct dma_pte *pgd = domain->pgd;
2536 int agaw, level;
2537 int flags = 0;
2538
2539 /*
2540 * Skip top levels of page tables for iommu which has
2541 * less agaw than default. Unnecessary for PT mode.
2542 */
2543 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2544 pgd = phys_to_virt(dma_pte_addr(pgd));
2545 if (!dma_pte_present(pgd))
2546 return -ENOMEM;
2547 }
2548
2549 level = agaw_to_level(agaw);
2550 if (level != 4 && level != 5)
2551 return -EINVAL;
2552
2553 if (pasid != PASID_RID2PASID)
2554 flags |= PASID_FLAG_SUPERVISOR_MODE;
2555 if (level == 5)
2556 flags |= PASID_FLAG_FL5LP;
2557
2558 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2559 flags |= PASID_FLAG_PAGE_SNOOP;
2560
2561 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2562 domain->iommu_did[iommu->seq_id],
2563 flags);
2564}
2565
2566static bool dev_is_real_dma_subdevice(struct device *dev)
2567{
2568 return dev && dev_is_pci(dev) &&
2569 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2570}
2571
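/*
 * Allocate and register a device_domain_info for @dev and attach it to
 * @domain on @iommu, setting up the PASID table/RID2PASID entry (scalable
 * mode) and the context mapping. Returns the domain actually used (an
 * existing one may be found via a DMA alias), or NULL on failure.
 */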
2572static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2573 int bus, int devfn,
2574 struct device *dev,
2575 struct dmar_domain *domain)
2576{
2577 struct dmar_domain *found = NULL;
2578 struct device_domain_info *info;
2579 unsigned long flags;
2580 int ret;
2581
2582 info = alloc_devinfo_mem();
2583 if (!info)
2584 return NULL;
2585
2586 if (!dev_is_real_dma_subdevice(dev)) {
2587 info->bus = bus;
2588 info->devfn = devfn;
2589 info->segment = iommu->segment;
2590 } else {
2591 struct pci_dev *pdev = to_pci_dev(dev);
2592
2593 info->bus = pdev->bus->number;
2594 info->devfn = pdev->devfn;
2595 info->segment = pci_domain_nr(pdev->bus);
2596 }
2597
2598 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2599 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2600 info->ats_qdep = 0;
2601 info->dev = dev;
2602 info->domain = domain;
2603 info->iommu = iommu;
2604 info->pasid_table = NULL;
2605 info->auxd_enabled = 0;
2606 INIT_LIST_HEAD(&info->subdevices);
2607
2608 if (dev && dev_is_pci(dev)) {
2609 struct pci_dev *pdev = to_pci_dev(info->dev);
2610
2611 if (ecap_dev_iotlb_support(iommu->ecap) &&
2612 pci_ats_supported(pdev) &&
2613 dmar_find_matched_atsr_unit(pdev))
2614 info->ats_supported = 1;
2615
2616 if (sm_supported(iommu)) {
2617 if (pasid_supported(iommu)) {
2618 int features = pci_pasid_features(pdev);
2619 if (features >= 0)
2620 info->pasid_supported = features | 1;
2621 }
2622
2623 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2624 pci_pri_supported(pdev))
2625 info->pri_supported = 1;
2626 }
2627 }
2628
2629 spin_lock_irqsave(&device_domain_lock, flags);
2630 if (dev)
2631 found = find_domain(dev);
2632
2633 if (!found) {
2634 struct device_domain_info *info2;
2635 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2636 info->devfn);
2637 if (info2) {
2638 found = info2->domain;
2639 info2->dev = dev;
2640 }
2641 }
2642
2643 if (found) {
2644 spin_unlock_irqrestore(&device_domain_lock, flags);
2645 free_devinfo_mem(info);
2646 /* Caller must free the original domain */
2647 return found;
2648 }
2649
2650 spin_lock(&iommu->lock);
2651 ret = domain_attach_iommu(domain, iommu);
2652 spin_unlock(&iommu->lock);
2653
2654 if (ret) {
2655 spin_unlock_irqrestore(&device_domain_lock, flags);
2656 free_devinfo_mem(info);
2657 return NULL;
2658 }
2659
2660 list_add(&info->link, &domain->devices);
2661 list_add(&info->global, &device_domain_list);
2662 if (dev)
2663 dev_iommu_priv_set(dev, info);
2664 spin_unlock_irqrestore(&device_domain_lock, flags);
2665
2666 /* PASID table is mandatory for a PCI device in scalable mode. */
2667 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2668 ret = intel_pasid_alloc_table(dev);
2669 if (ret) {
2670 dev_err(dev, "PASID table allocation failed\n");
2671 dmar_remove_one_dev_info(dev);
2672 return NULL;
2673 }
2674
2675 /* Setup the PASID entry for requests without PASID: */
2676 spin_lock_irqsave(&iommu->lock, flags);
2677 if (hw_pass_through && domain_type_is_si(domain))
2678 ret = intel_pasid_setup_pass_through(iommu, domain,
2679 dev, PASID_RID2PASID);
2680 else if (domain_use_first_level(domain))
2681 ret = domain_setup_first_level(iommu, domain, dev,
2682 PASID_RID2PASID);
2683 else
2684 ret = intel_pasid_setup_second_level(iommu, domain,
2685 dev, PASID_RID2PASID);
2686 spin_unlock_irqrestore(&iommu->lock, flags);
2687 if (ret) {
2688 dev_err(dev, "Setup RID2PASID failed\n");
2689 dmar_remove_one_dev_info(dev);
2690 return NULL;
2691 }
2692 }
2693
2694 if (dev && domain_context_mapping(domain, dev)) {
2695 dev_err(dev, "Domain context map failed\n");
2696 dmar_remove_one_dev_info(dev);
2697 return NULL;
2698 }
2699
2700 return domain;
2701}
2702
2703static int iommu_domain_identity_map(struct dmar_domain *domain,
2704 unsigned long first_vpfn,
2705 unsigned long last_vpfn)
2706{
2707 /*
2708 * The RMRR range might overlap with the physical memory range,
2709 * so clear it first.
2710 */
2711 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2712
2713 return __domain_mapping(domain, first_vpfn,
2714 first_vpfn, last_vpfn - first_vpfn + 1,
2715 DMA_PTE_READ|DMA_PTE_WRITE);
2716}
2717
2718static int md_domain_init(struct dmar_domain *domain, int guest_width);
2719
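/*
 * Set up the static identity (si_domain) domain. Unless hardware
 * pass-through is used, identity-map all usable physical memory and all
 * RMRR ranges into it.
 */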
2720static int __init si_domain_init(int hw)
2721{
2722 struct dmar_rmrr_unit *rmrr;
2723 struct device *dev;
2724 int i, nid, ret;
2725
2726 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2727 if (!si_domain)
2728 return -EFAULT;
2729
2730 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2731 domain_exit(si_domain);
2732 return -EFAULT;
2733 }
2734
2735 if (hw)
2736 return 0;
2737
2738 for_each_online_node(nid) {
2739 unsigned long start_pfn, end_pfn;
2740 int i;
2741
2742 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2743 ret = iommu_domain_identity_map(si_domain,
2744 mm_to_dma_pfn(start_pfn),
2745 mm_to_dma_pfn(end_pfn));
2746 if (ret)
2747 return ret;
2748 }
2749 }
2750
2751 /*
2752 * Identity map the RMRRs so that devices with RMRRs can also use
2753 * the si_domain.
2754 */
2755 for_each_rmrr_units(rmrr) {
2756 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2757 i, dev) {
2758 unsigned long long start = rmrr->base_address;
2759 unsigned long long end = rmrr->end_address;
2760
2761 if (WARN_ON(end < start ||
2762 end >> agaw_to_width(si_domain->agaw)))
2763 continue;
2764
2765 ret = iommu_domain_identity_map(si_domain,
2766 mm_to_dma_pfn(start >> PAGE_SHIFT),
2767 mm_to_dma_pfn(end >> PAGE_SHIFT));
2768 if (ret)
2769 return ret;
2770 }
2771 }
2772
2773 return 0;
2774}
2775
2776static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2777{
2778 struct dmar_domain *ndomain;
2779 struct intel_iommu *iommu;
2780 u8 bus, devfn;
2781
2782 iommu = device_to_iommu(dev, &bus, &devfn);
2783 if (!iommu)
2784 return -ENODEV;
2785
2786 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2787 if (ndomain != domain)
2788 return -EBUSY;
2789
2790 return 0;
2791}
2792
2793static bool device_has_rmrr(struct device *dev)
2794{
2795 struct dmar_rmrr_unit *rmrr;
2796 struct device *tmp;
2797 int i;
2798
2799 rcu_read_lock();
2800 for_each_rmrr_units(rmrr) {
2801 /*
2802 * Return TRUE if this RMRR contains the device that
2803 * is passed in.
2804 */
2805 for_each_active_dev_scope(rmrr->devices,
2806 rmrr->devices_cnt, i, tmp)
2807 if (tmp == dev ||
2808 is_downstream_to_pci_bridge(dev, tmp)) {
2809 rcu_read_unlock();
2810 return true;
2811 }
2812 }
2813 rcu_read_unlock();
2814 return false;
2815}
2816
2817/**
2818 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2819 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2820 * @dev: device handle
2821 *
2822 * We assume that PCI USB devices with RMRRs have them largely
2823 * for historical reasons and that the RMRR space is not actively used post
2824 * boot. This exclusion may change if vendors begin to abuse it.
2825 *
2826 * The same exception is made for graphics devices, with the requirement that
2827 * any use of the RMRR regions will be torn down before assigning the device
2828 * to a guest.
2829 *
2830 * Return: true if the RMRR is relaxable, false otherwise
2831 */
2832static bool device_rmrr_is_relaxable(struct device *dev)
2833{
2834 struct pci_dev *pdev;
2835
2836 if (!dev_is_pci(dev))
2837 return false;
2838
2839 pdev = to_pci_dev(dev);
2840 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2841 return true;
2842 else
2843 return false;
2844}
2845
2846/*
2847 * There are a couple cases where we need to restrict the functionality of
2848 * devices associated with RMRRs. The first is when evaluating a device for
2849 * identity mapping because problems exist when devices are moved in and out
2850 * of domains and their respective RMRR information is lost. This means that
2851 * a device with associated RMRRs will never be in a "passthrough" domain.
2852 * The second is use of the device through the IOMMU API. This interface
2853 * expects to have full control of the IOVA space for the device. We cannot
2854 * satisfy both the requirement that RMRR access is maintained and have an
2855 * unencumbered IOVA space. We also have no ability to quiesce the device's
2856 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2857 * We therefore prevent devices associated with an RMRR from participating in
2858 * the IOMMU API, which eliminates them from device assignment.
2859 *
2860 * In both cases, devices with relaxable RMRRs are not affected by this
2861 * restriction. See the device_rmrr_is_relaxable comment.
2862 */
2863static bool device_is_rmrr_locked(struct device *dev)
2864{
2865 if (!device_has_rmrr(dev))
2866 return false;
2867
2868 if (device_rmrr_is_relaxable(dev))
2869 return false;
2870
2871 return true;
2872}
2873
2874/*
2875 * Return the required default domain type for a specific device.
2876 *
2877 * @dev: the device to query
2879 *
2880 * Returns:
2881 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2882 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2883 * - 0: both identity and dynamic domains work for this device
2884 */
2885static int device_def_domain_type(struct device *dev)
2886{
2887 if (dev_is_pci(dev)) {
2888 struct pci_dev *pdev = to_pci_dev(dev);
2889
2890 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2891 return IOMMU_DOMAIN_IDENTITY;
2892
2893 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2894 return IOMMU_DOMAIN_IDENTITY;
2895 }
2896
2897 return 0;
2898}
2899
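/*
 * Select the invalidation method for this IOMMU: prefer queued
 * invalidation (QI) and fall back to register-based invalidation if QI
 * cannot be enabled. If we did not set up QI ourselves, pending faults
 * are cleared and any QI enabled before OS handover is disabled first.
 */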
2900static void intel_iommu_init_qi(struct intel_iommu *iommu)
2901{
2902 /*
2903	 * Start from a sane IOMMU hardware state.
2904	 * If queued invalidation was already initialized by us
2905	 * (for example, while enabling interrupt remapping), then
2906	 * things are already rolling from a sane state.
2907 */
2908 if (!iommu->qi) {
2909 /*
2910 * Clear any previous faults.
2911 */
2912 dmar_fault(-1, iommu);
2913 /*
2914 * Disable queued invalidation if supported and already enabled
2915 * before OS handover.
2916 */
2917 dmar_disable_qi(iommu);
2918 }
2919
2920 if (dmar_enable_qi(iommu)) {
2921 /*
2922		 * Queued invalidation could not be enabled; fall back to register-based invalidation.
2923 */
2924 iommu->flush.flush_context = __iommu_flush_context;
2925 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2926 pr_info("%s: Using Register based invalidation\n",
2927 iommu->name);
2928 } else {
2929 iommu->flush.flush_context = qi_flush_context;
2930 iommu->flush.flush_iotlb = qi_flush_iotlb;
2931 pr_info("%s: Using Queued invalidation\n", iommu->name);
2932 }
2933}
2934
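/*
 * Copy one bus's context table(s) from the kernel that was previously
 * running (kdump case). With extended root entries each bus has two
 * context tables (lower and upper device-function halves), which is why
 * tbl_idx is bus * 2 in that case. Domain IDs found in copied entries
 * are reserved in iommu->domain_ids so that they are not reused.
 */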
2935static int copy_context_table(struct intel_iommu *iommu,
2936 struct root_entry *old_re,
2937 struct context_entry **tbl,
2938 int bus, bool ext)
2939{
2940 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2941 struct context_entry *new_ce = NULL, ce;
2942 struct context_entry *old_ce = NULL;
2943 struct root_entry re;
2944 phys_addr_t old_ce_phys;
2945
2946 tbl_idx = ext ? bus * 2 : bus;
2947 memcpy(&re, old_re, sizeof(re));
2948
2949 for (devfn = 0; devfn < 256; devfn++) {
2950 /* First calculate the correct index */
2951 idx = (ext ? devfn * 2 : devfn) % 256;
2952
2953 if (idx == 0) {
2954 /* First save what we may have and clean up */
2955 if (new_ce) {
2956 tbl[tbl_idx] = new_ce;
2957 __iommu_flush_cache(iommu, new_ce,
2958 VTD_PAGE_SIZE);
2959 pos = 1;
2960 }
2961
2962 if (old_ce)
2963 memunmap(old_ce);
2964
2965 ret = 0;
2966 if (devfn < 0x80)
2967 old_ce_phys = root_entry_lctp(&re);
2968 else
2969 old_ce_phys = root_entry_uctp(&re);
2970
2971 if (!old_ce_phys) {
2972 if (ext && devfn == 0) {
2973 /* No LCTP, try UCTP */
2974 devfn = 0x7f;
2975 continue;
2976 } else {
2977 goto out;
2978 }
2979 }
2980
2981 ret = -ENOMEM;
2982 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2983 MEMREMAP_WB);
2984 if (!old_ce)
2985 goto out;
2986
2987 new_ce = alloc_pgtable_page(iommu->node);
2988 if (!new_ce)
2989 goto out_unmap;
2990
2991 ret = 0;
2992 }
2993
2994 /* Now copy the context entry */
2995 memcpy(&ce, old_ce + idx, sizeof(ce));
2996
2997 if (!__context_present(&ce))
2998 continue;
2999
3000 did = context_domain_id(&ce);
3001 if (did >= 0 && did < cap_ndoms(iommu->cap))
3002 set_bit(did, iommu->domain_ids);
3003
3004 /*
3005 * We need a marker for copied context entries. This
3006 * marker needs to work for the old format as well as
3007 * for extended context entries.
3008 *
3009 * Bit 67 of the context entry is used. In the old
3010 * format this bit is available to software, in the
3011 * extended format it is the PGE bit, but PGE is ignored
3012 * by HW if PASIDs are disabled (and thus still
3013 * available).
3014 *
3015 * So disable PASIDs first and then mark the entry
3016 * copied. This means that we don't copy PASID
3017 * translations from the old kernel, but this is fine as
3018 * faults there are not fatal.
3019 */
3020 context_clear_pasid_enable(&ce);
3021 context_set_copied(&ce);
3022
3023 new_ce[idx] = ce;
3024 }
3025
3026 tbl[tbl_idx + pos] = new_ce;
3027
3028 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3029
3030out_unmap:
3031 memunmap(old_ce);
3032
3033out:
3034 return ret;
3035}
3036
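/*
 * Copy the complete translation setup left behind by the previous
 * kernel: map its root table, copy the context tables for all 256
 * buses, then install the copies into this kernel's root_entry table
 * under iommu->lock. Bails out if the old and new root table formats
 * (RTT/extended) differ, since changing RTT requires disabling
 * translation.
 */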
3037static int copy_translation_tables(struct intel_iommu *iommu)
3038{
3039 struct context_entry **ctxt_tbls;
3040 struct root_entry *old_rt;
3041 phys_addr_t old_rt_phys;
3042 int ctxt_table_entries;
3043 unsigned long flags;
3044 u64 rtaddr_reg;
3045 int bus, ret;
3046 bool new_ext, ext;
3047
3048 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3049 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3050 new_ext = !!ecap_ecs(iommu->ecap);
3051
3052 /*
3053 * The RTT bit can only be changed when translation is disabled,
3054	 * but disabling translation would open a window for data
3055 * corruption. So bail out and don't copy anything if we would
3056 * have to change the bit.
3057 */
3058 if (new_ext != ext)
3059 return -EINVAL;
3060
3061 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3062 if (!old_rt_phys)
3063 return -EINVAL;
3064
3065 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3066 if (!old_rt)
3067 return -ENOMEM;
3068
3069 /* This is too big for the stack - allocate it from slab */
3070 ctxt_table_entries = ext ? 512 : 256;
3071 ret = -ENOMEM;
3072 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3073 if (!ctxt_tbls)
3074 goto out_unmap;
3075
3076 for (bus = 0; bus < 256; bus++) {
3077 ret = copy_context_table(iommu, &old_rt[bus],
3078 ctxt_tbls, bus, ext);
3079 if (ret) {
3080 pr_err("%s: Failed to copy context table for bus %d\n",
3081 iommu->name, bus);
3082 continue;
3083 }
3084 }
3085
3086 spin_lock_irqsave(&iommu->lock, flags);
3087
3088 /* Context tables are copied, now write them to the root_entry table */
3089 for (bus = 0; bus < 256; bus++) {
3090 int idx = ext ? bus * 2 : bus;
3091 u64 val;
3092
3093 if (ctxt_tbls[idx]) {
3094 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3095 iommu->root_entry[bus].lo = val;
3096 }
3097
3098 if (!ext || !ctxt_tbls[idx + 1])
3099 continue;
3100
3101 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3102 iommu->root_entry[bus].hi = val;
3103 }
3104
3105 spin_unlock_irqrestore(&iommu->lock, flags);
3106
3107 kfree(ctxt_tbls);
3108
3109 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3110
3111 ret = 0;
3112
3113out_unmap:
3114 memunmap(old_rt);
3115
3116 return ret;
3117}
3118
3119#ifdef CONFIG_INTEL_IOMMU_SVM
3120static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3121{
3122 struct intel_iommu *iommu = data;
3123 ioasid_t ioasid;
3124
3125 if (!iommu)
3126 return INVALID_IOASID;
3127 /*
3128	 * The VT-d virtual command interface always uses the full 20-bit
3129	 * PASID range. The host can partition the guest PASID range based
3130	 * on policy, but that is out of the guest's control.
3131 */
3132 if (min < PASID_MIN || max > intel_pasid_max_id)
3133 return INVALID_IOASID;
3134
3135 if (vcmd_alloc_pasid(iommu, &ioasid))
3136 return INVALID_IOASID;
3137
3138 return ioasid;
3139}
3140
3141static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3142{
3143 struct intel_iommu *iommu = data;
3144
3145 if (!iommu)
3146 return;
3147 /*
3148	 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
3149 * We can only free the PASID when all the devices are unbound.
3150 */
3151 if (ioasid_find(NULL, ioasid, NULL)) {
3152 pr_alert("Cannot free active IOASID %d\n", ioasid);
3153 return;
3154 }
3155 vcmd_free_pasid(iommu, ioasid);
3156}
3157
3158static void register_pasid_allocator(struct intel_iommu *iommu)
3159{
3160 /*
3161	 * If we are running in the host, there is no need for a custom
3162	 * allocator because PASIDs are allocated system-wide by the host.
3163 */
3164 if (!cap_caching_mode(iommu->cap))
3165 return;
3166
3167 if (!sm_supported(iommu)) {
3168 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3169 return;
3170 }
3171
3172 /*
3173	 * Register a custom PASID allocator if we are running in a guest;
3174	 * guest PASIDs must be obtained via the virtual command interface.
3175 * There can be multiple vIOMMUs in each guest but only one allocator
3176 * is active. All vIOMMU allocators will eventually be calling the same
3177 * host allocator.
3178 */
3179 if (!vccap_pasid(iommu->vccap))
3180 return;
3181
3182 pr_info("Register custom PASID allocator\n");
3183 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3184 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3185 iommu->pasid_allocator.pdata = (void *)iommu;
3186 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3187 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3188 /*
3189 * Disable scalable mode on this IOMMU if there
3190 * is no custom allocator. Mixing SM capable vIOMMU
3191		 * and non-SM vIOMMU is not supported.
3192 */
3193 intel_iommu_sm = 0;
3194 }
3195}
3196#endif
3197
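/*
 * Boot-time initialization of all DMAR units: allocate the global iommu
 * array, set up queued invalidation, domain IDs and root entries for
 * each IOMMU, copy translation tables from a previous kernel when
 * translation was pre-enabled (kdump), create the si_domain, and
 * finally enable the page request queue and fault interrupt where
 * supported.
 */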
3198static int __init init_dmars(void)
3199{
3200 struct dmar_drhd_unit *drhd;
3201 struct intel_iommu *iommu;
3202 int ret;
3203
3204 /*
3205 * for each drhd
3206 * allocate root
3207 * initialize and program root entry to not present
3208 * endfor
3209 */
3210 for_each_drhd_unit(drhd) {
3211 /*
3212		 * No lock is needed: this is only incremented in the
3213		 * single-threaded kernel __init code path; all other
3214		 * accesses are read-only.
3215 */
3216 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3217 g_num_of_iommus++;
3218 continue;
3219 }
3220 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3221 }
3222
3223 /* Preallocate enough resources for IOMMU hot-addition */
3224 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3225 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3226
3227 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3228 GFP_KERNEL);
3229 if (!g_iommus) {
3230 pr_err("Allocating global iommu array failed\n");
3231 ret = -ENOMEM;
3232 goto error;
3233 }
3234
3235 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3236 if (ret)
3237 goto free_iommu;
3238
3239 for_each_iommu(iommu, drhd) {
3240 if (drhd->ignored) {
3241 iommu_disable_translation(iommu);
3242 continue;
3243 }
3244
3245 /*
3246		 * Find the max PASID size of all IOMMUs in the system.
3247 * We need to ensure the system pasid table is no bigger
3248 * than the smallest supported.
3249 */
3250 if (pasid_supported(iommu)) {
3251 u32 temp = 2 << ecap_pss(iommu->ecap);
3252
3253 intel_pasid_max_id = min_t(u32, temp,
3254 intel_pasid_max_id);
3255 }
3256
3257 g_iommus[iommu->seq_id] = iommu;
3258
3259 intel_iommu_init_qi(iommu);
3260
3261 ret = iommu_init_domains(iommu);
3262 if (ret)
3263 goto free_iommu;
3264
3265 init_translation_status(iommu);
3266
3267 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3268 iommu_disable_translation(iommu);
3269 clear_translation_pre_enabled(iommu);
3270 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3271 iommu->name);
3272 }
3273
3274 /*
3275 * TBD:
3276		 * we could share the same root & context tables
3277		 * among all IOMMUs; split this out later.
3278 */
3279 ret = iommu_alloc_root_entry(iommu);
3280 if (ret)
3281 goto free_iommu;
3282
3283 if (translation_pre_enabled(iommu)) {
3284 pr_info("Translation already enabled - trying to copy translation structures\n");
3285
3286 ret = copy_translation_tables(iommu);
3287 if (ret) {
3288 /*
3289 * We found the IOMMU with translation
3290 * enabled - but failed to copy over the
3291 * old root-entry table. Try to proceed
3292 * by disabling translation now and
3293 * allocating a clean root-entry table.
3294 * This might cause DMAR faults, but
3295 * probably the dump will still succeed.
3296 */
3297 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3298 iommu->name);
3299 iommu_disable_translation(iommu);
3300 clear_translation_pre_enabled(iommu);
3301 } else {
3302 pr_info("Copied translation tables from previous kernel for %s\n",
3303 iommu->name);
3304 }
3305 }
3306
3307 if (!ecap_pass_through(iommu->ecap))
3308 hw_pass_through = 0;
3309 intel_svm_check(iommu);
3310 }
3311
3312 /*
3313 * Now that qi is enabled on all iommus, set the root entry and flush
3314 * caches. This is required on some Intel X58 chipsets, otherwise the
3315 * flush_context function will loop forever and the boot hangs.
3316 */
3317 for_each_active_iommu(iommu, drhd) {
3318 iommu_flush_write_buffer(iommu);
3319#ifdef CONFIG_INTEL_IOMMU_SVM
3320 register_pasid_allocator(iommu);
3321#endif
3322 iommu_set_root_entry(iommu);
3323 }
3324
3325#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3326 dmar_map_gfx = 0;
3327#endif
3328
3329 if (!dmar_map_gfx)
3330 iommu_identity_mapping |= IDENTMAP_GFX;
3331
3332 check_tylersburg_isoch();
3333
3334 ret = si_domain_init(hw_pass_through);
3335 if (ret)
3336 goto free_iommu;
3337
3338 /*
3339 * for each drhd
3340 * enable fault log
3341 * global invalidate context cache
3342 * global invalidate iotlb
3343 * enable translation
3344 */
3345 for_each_iommu(iommu, drhd) {
3346 if (drhd->ignored) {
3347 /*
3348 * we always have to disable PMRs or DMA may fail on
3349 * this device
3350 */
3351 if (force_on)
3352 iommu_disable_protect_mem_regions(iommu);
3353 continue;
3354 }
3355
3356 iommu_flush_write_buffer(iommu);
3357
3358#ifdef CONFIG_INTEL_IOMMU_SVM
3359 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3360 /*
3361			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3362			 * could cause a lock race, so drop the lock around it.
3363 */
3364 up_write(&dmar_global_lock);
3365 ret = intel_svm_enable_prq(iommu);
3366 down_write(&dmar_global_lock);
3367 if (ret)
3368 goto free_iommu;
3369 }
3370#endif
3371 ret = dmar_set_interrupt(iommu);
3372 if (ret)
3373 goto free_iommu;
3374 }
3375
3376 return 0;
3377
3378free_iommu:
3379 for_each_active_iommu(iommu, drhd) {
3380 disable_dmar_iommu(iommu);
3381 free_dmar_iommu(iommu);
3382 }
3383
3384 kfree(g_iommus);
3385
3386error:
3387 return ret;
3388}
3389
3390static inline int iommu_domain_cache_init(void)
3391{
3392 int ret = 0;
3393
3394 iommu_domain_cache = kmem_cache_create("iommu_domain",
3395 sizeof(struct dmar_domain),
3396 0,
3397 SLAB_HWCACHE_ALIGN,
3399 NULL);
3400 if (!iommu_domain_cache) {
3401 pr_err("Couldn't create iommu_domain cache\n");
3402 ret = -ENOMEM;
3403 }
3404
3405 return ret;
3406}
3407
3408static inline int iommu_devinfo_cache_init(void)
3409{
3410 int ret = 0;
3411
3412 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3413 sizeof(struct device_domain_info),
3414 0,
3415 SLAB_HWCACHE_ALIGN,
3416 NULL);
3417 if (!iommu_devinfo_cache) {
3418 pr_err("Couldn't create devinfo cache\n");
3419 ret = -ENOMEM;
3420 }
3421
3422 return ret;
3423}
3424
3425static int __init iommu_init_mempool(void)
3426{
3427 int ret;
3428 ret = iova_cache_get();
3429 if (ret)
3430 return ret;
3431
3432 ret = iommu_domain_cache_init();
3433 if (ret)
3434 goto domain_error;
3435
3436 ret = iommu_devinfo_cache_init();
3437 if (!ret)
3438 return ret;
3439
3440 kmem_cache_destroy(iommu_domain_cache);
3441domain_error:
3442 iova_cache_put();
3443
3444 return -ENOMEM;
3445}
3446
3447static void __init iommu_exit_mempool(void)
3448{
3449 kmem_cache_destroy(iommu_devinfo_cache);
3450 kmem_cache_destroy(iommu_domain_cache);
3451 iova_cache_put();
3452}
3453
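/*
 * Mark DMAR units that can be skipped: units whose device scope is
 * empty are ignored outright, and units that cover only graphics
 * devices are flagged as gfx_dedicated and ignored when dmar_map_gfx
 * is clear.
 */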
3454static void __init init_no_remapping_devices(void)
3455{
3456 struct dmar_drhd_unit *drhd;
3457 struct device *dev;
3458 int i;
3459
3460 for_each_drhd_unit(drhd) {
3461 if (!drhd->include_all) {
3462 for_each_active_dev_scope(drhd->devices,
3463 drhd->devices_cnt, i, dev)
3464 break;
3465 /* ignore DMAR unit if no devices exist */
3466 if (i == drhd->devices_cnt)
3467 drhd->ignored = 1;
3468 }
3469 }
3470
3471 for_each_active_drhd_unit(drhd) {
3472 if (drhd->include_all)
3473 continue;
3474
3475 for_each_active_dev_scope(drhd->devices,
3476 drhd->devices_cnt, i, dev)
3477 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3478 break;
3479 if (i < drhd->devices_cnt)
3480 continue;
3481
3482 /* This IOMMU has *only* gfx devices. Either bypass it or
3483		   set the gfx_dedicated flag, as appropriate */
3484 drhd->gfx_dedicated = 1;
3485 if (!dmar_map_gfx)
3486 drhd->ignored = 1;
3487 }
3488}
3489
3490#ifdef CONFIG_SUSPEND
3491static int init_iommu_hw(void)
3492{
3493 struct dmar_drhd_unit *drhd;
3494 struct intel_iommu *iommu = NULL;
3495
3496 for_each_active_iommu(iommu, drhd)
3497 if (iommu->qi)
3498 dmar_reenable_qi(iommu);
3499
3500 for_each_iommu(iommu, drhd) {
3501 if (drhd->ignored) {
3502 /*
3503 * we always have to disable PMRs or DMA may fail on
3504 * this device
3505 */
3506 if (force_on)
3507 iommu_disable_protect_mem_regions(iommu);
3508 continue;
3509 }
3510
3511 iommu_flush_write_buffer(iommu);
3512 iommu_set_root_entry(iommu);
3513 iommu_enable_translation(iommu);
3514 iommu_disable_protect_mem_regions(iommu);
3515 }
3516
3517 return 0;
3518}
3519
3520static void iommu_flush_all(void)
3521{
3522 struct dmar_drhd_unit *drhd;
3523 struct intel_iommu *iommu;
3524
3525 for_each_active_iommu(iommu, drhd) {
3526 iommu->flush.flush_context(iommu, 0, 0, 0,
3527 DMA_CCMD_GLOBAL_INVL);
3528 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3529 DMA_TLB_GLOBAL_FLUSH);
3530 }
3531}
3532
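/*
 * On suspend: flush all caches, disable translation and save the
 * fault-event control/data/address registers of each active IOMMU so
 * that iommu_resume() can restore them after re-enabling the hardware.
 */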
3533static int iommu_suspend(void)
3534{
3535 struct dmar_drhd_unit *drhd;
3536 struct intel_iommu *iommu = NULL;
3537 unsigned long flag;
3538
3539 for_each_active_iommu(iommu, drhd) {
3540 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3541 GFP_KERNEL);
3542 if (!iommu->iommu_state)
3543 goto nomem;
3544 }
3545
3546 iommu_flush_all();
3547
3548 for_each_active_iommu(iommu, drhd) {
3549 iommu_disable_translation(iommu);
3550
3551 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3552
3553 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3554 readl(iommu->reg + DMAR_FECTL_REG);
3555 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3556 readl(iommu->reg + DMAR_FEDATA_REG);
3557 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3558 readl(iommu->reg + DMAR_FEADDR_REG);
3559 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3560 readl(iommu->reg + DMAR_FEUADDR_REG);
3561
3562 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3563 }
3564 return 0;
3565
3566nomem:
3567 for_each_active_iommu(iommu, drhd)
3568 kfree(iommu->iommu_state);
3569
3570 return -ENOMEM;
3571}
3572
3573static void iommu_resume(void)
3574{
3575 struct dmar_drhd_unit *drhd;
3576 struct intel_iommu *iommu = NULL;
3577 unsigned long flag;
3578
3579 if (init_iommu_hw()) {
3580 if (force_on)
3581 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3582 else
3583 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3584 return;
3585 }
3586
3587 for_each_active_iommu(iommu, drhd) {
3588
3589 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3590
3591 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3592 iommu->reg + DMAR_FECTL_REG);
3593 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3594 iommu->reg + DMAR_FEDATA_REG);
3595 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3596 iommu->reg + DMAR_FEADDR_REG);
3597 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3598 iommu->reg + DMAR_FEUADDR_REG);
3599
3600 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3601 }
3602
3603 for_each_active_iommu(iommu, drhd)
3604 kfree(iommu->iommu_state);
3605}
3606
3607static struct syscore_ops iommu_syscore_ops = {
3608 .resume = iommu_resume,
3609 .suspend = iommu_suspend,
3610};
3611
3612static void __init init_iommu_pm_ops(void)
3613{
3614 register_syscore_ops(&iommu_syscore_ops);
3615}
3616
3617#else
3618static inline void init_iommu_pm_ops(void) {}
3619#endif /* CONFIG_SUSPEND */
3620
3621static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3622{
3623 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3624 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3625 rmrr->end_address <= rmrr->base_address ||
3626 arch_rmrr_sanity_check(rmrr))
3627 return -EINVAL;
3628
3629 return 0;
3630}
3631
3632int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3633{
3634 struct acpi_dmar_reserved_memory *rmrr;
3635 struct dmar_rmrr_unit *rmrru;
3636
3637 rmrr = (struct acpi_dmar_reserved_memory *)header;
3638 if (rmrr_sanity_check(rmrr)) {
3639 pr_warn(FW_BUG
3640 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3641 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3642 rmrr->base_address, rmrr->end_address,
3643 dmi_get_system_info(DMI_BIOS_VENDOR),
3644 dmi_get_system_info(DMI_BIOS_VERSION),
3645 dmi_get_system_info(DMI_PRODUCT_VERSION));
3646 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3647 }
3648
3649 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3650 if (!rmrru)
3651 goto out;
3652
3653 rmrru->hdr = header;
3654
3655 rmrru->base_address = rmrr->base_address;
3656 rmrru->end_address = rmrr->end_address;
3657
3658 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3659 ((void *)rmrr) + rmrr->header.length,
3660 &rmrru->devices_cnt);
3661 if (rmrru->devices_cnt && rmrru->devices == NULL)
3662 goto free_rmrru;
3663
3664 list_add(&rmrru->list, &dmar_rmrr_units);
3665
3666 return 0;
3667free_rmrru:
3668 kfree(rmrru);
3669out:
3670 return -ENOMEM;
3671}
3672
3673static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3674{
3675 struct dmar_atsr_unit *atsru;
3676 struct acpi_dmar_atsr *tmp;
3677
3678 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3679 dmar_rcu_check()) {
3680 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3681 if (atsr->segment != tmp->segment)
3682 continue;
3683 if (atsr->header.length != tmp->header.length)
3684 continue;
3685 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3686 return atsru;
3687 }
3688
3689 return NULL;
3690}
3691
3692int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3693{
3694 struct acpi_dmar_atsr *atsr;
3695 struct dmar_atsr_unit *atsru;
3696
3697 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3698 return 0;
3699
3700 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3701 atsru = dmar_find_atsr(atsr);
3702 if (atsru)
3703 return 0;
3704
3705 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3706 if (!atsru)
3707 return -ENOMEM;
3708
3709 /*
3710 * If memory is allocated from slab by ACPI _DSM method, we need to
3711 * copy the memory content because the memory buffer will be freed
3712 * on return.
3713 */
3714 atsru->hdr = (void *)(atsru + 1);
3715 memcpy(atsru->hdr, hdr, hdr->length);
3716 atsru->include_all = atsr->flags & 0x1;
3717 if (!atsru->include_all) {
3718 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3719 (void *)atsr + atsr->header.length,
3720 &atsru->devices_cnt);
3721 if (atsru->devices_cnt && atsru->devices == NULL) {
3722 kfree(atsru);
3723 return -ENOMEM;
3724 }
3725 }
3726
3727 list_add_rcu(&atsru->list, &dmar_atsr_units);
3728
3729 return 0;
3730}
3731
3732static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3733{
3734 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3735 kfree(atsru);
3736}
3737
3738int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3739{
3740 struct acpi_dmar_atsr *atsr;
3741 struct dmar_atsr_unit *atsru;
3742
3743 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3744 atsru = dmar_find_atsr(atsr);
3745 if (atsru) {
3746 list_del_rcu(&atsru->list);
3747 synchronize_rcu();
3748 intel_iommu_free_atsr(atsru);
3749 }
3750
3751 return 0;
3752}
3753
3754int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3755{
3756 int i;
3757 struct device *dev;
3758 struct acpi_dmar_atsr *atsr;
3759 struct dmar_atsr_unit *atsru;
3760
3761 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3762 atsru = dmar_find_atsr(atsr);
3763 if (!atsru)
3764 return 0;
3765
3766 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3767 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3768 i, dev)
3769 return -EBUSY;
3770 }
3771
3772 return 0;
3773}
3774
3775static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3776{
3777 struct dmar_satc_unit *satcu;
3778 struct acpi_dmar_satc *tmp;
3779
3780 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3781 dmar_rcu_check()) {
3782 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3783 if (satc->segment != tmp->segment)
3784 continue;
3785 if (satc->header.length != tmp->header.length)
3786 continue;
3787 if (memcmp(satc, tmp, satc->header.length) == 0)
3788 return satcu;
3789 }
3790
3791 return NULL;
3792}
3793
3794int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3795{
3796 struct acpi_dmar_satc *satc;
3797 struct dmar_satc_unit *satcu;
3798
3799 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3800 return 0;
3801
3802 satc = container_of(hdr, struct acpi_dmar_satc, header);
3803 satcu = dmar_find_satc(satc);
3804 if (satcu)
3805 return 0;
3806
3807 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3808 if (!satcu)
3809 return -ENOMEM;
3810
3811 satcu->hdr = (void *)(satcu + 1);
3812 memcpy(satcu->hdr, hdr, hdr->length);
3813 satcu->atc_required = satc->flags & 0x1;
3814 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3815 (void *)satc + satc->header.length,
3816 &satcu->devices_cnt);
3817 if (satcu->devices_cnt && !satcu->devices) {
3818 kfree(satcu);
3819 return -ENOMEM;
3820 }
3821 list_add_rcu(&satcu->list, &dmar_satc_units);
3822
3823 return 0;
3824}
3825
3826static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3827{
3828 int sp, ret;
3829 struct intel_iommu *iommu = dmaru->iommu;
3830
3831 if (g_iommus[iommu->seq_id])
3832 return 0;
3833
3834 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3835 if (ret)
3836 goto out;
3837
3838 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3839 pr_warn("%s: Doesn't support hardware pass through.\n",
3840 iommu->name);
3841 return -ENXIO;
3842 }
3843 if (!ecap_sc_support(iommu->ecap) &&
3844 domain_update_iommu_snooping(iommu)) {
3845 pr_warn("%s: Doesn't support snooping.\n",
3846 iommu->name);
3847 return -ENXIO;
3848 }
3849 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3850 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3851 pr_warn("%s: Doesn't support large page.\n",
3852 iommu->name);
3853 return -ENXIO;
3854 }
3855
3856 /*
3857 * Disable translation if already enabled prior to OS handover.
3858 */
3859 if (iommu->gcmd & DMA_GCMD_TE)
3860 iommu_disable_translation(iommu);
3861
3862 g_iommus[iommu->seq_id] = iommu;
3863 ret = iommu_init_domains(iommu);
3864 if (ret == 0)
3865 ret = iommu_alloc_root_entry(iommu);
3866 if (ret)
3867 goto out;
3868
3869 intel_svm_check(iommu);
3870
3871 if (dmaru->ignored) {
3872 /*
3873 * we always have to disable PMRs or DMA may fail on this device
3874 */
3875 if (force_on)
3876 iommu_disable_protect_mem_regions(iommu);
3877 return 0;
3878 }
3879
3880 intel_iommu_init_qi(iommu);
3881 iommu_flush_write_buffer(iommu);
3882
3883#ifdef CONFIG_INTEL_IOMMU_SVM
3884 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3885 ret = intel_svm_enable_prq(iommu);
3886 if (ret)
3887 goto disable_iommu;
3888 }
3889#endif
3890 ret = dmar_set_interrupt(iommu);
3891 if (ret)
3892 goto disable_iommu;
3893
3894 iommu_set_root_entry(iommu);
3895 iommu_enable_translation(iommu);
3896
3897 iommu_disable_protect_mem_regions(iommu);
3898 return 0;
3899
3900disable_iommu:
3901 disable_dmar_iommu(iommu);
3902out:
3903 free_dmar_iommu(iommu);
3904 return ret;
3905}
3906
3907int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3908{
3909 int ret = 0;
3910 struct intel_iommu *iommu = dmaru->iommu;
3911
3912 if (!intel_iommu_enabled)
3913 return 0;
3914 if (iommu == NULL)
3915 return -EINVAL;
3916
3917 if (insert) {
3918 ret = intel_iommu_add(dmaru);
3919 } else {
3920 disable_dmar_iommu(iommu);
3921 free_dmar_iommu(iommu);
3922 }
3923
3924 return ret;
3925}
3926
3927static void intel_iommu_free_dmars(void)
3928{
3929 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3930 struct dmar_atsr_unit *atsru, *atsr_n;
3931 struct dmar_satc_unit *satcu, *satc_n;
3932
3933 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3934 list_del(&rmrru->list);
3935 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3936 kfree(rmrru);
3937 }
3938
3939 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3940 list_del(&atsru->list);
3941 intel_iommu_free_atsr(atsru);
3942 }
3943 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3944 list_del(&satcu->list);
3945 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3946 kfree(satcu);
3947 }
3948}
3949
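/*
 * Decide whether ATS may be used for the device: walk up to the PCIe
 * root port and allow ATS only if the device is integrated (no bridge)
 * or its root port is listed in an ATSR unit (or the ATSR is
 * include_all).
 */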
3950int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3951{
3952 int i, ret = 1;
3953 struct pci_bus *bus;
3954 struct pci_dev *bridge = NULL;
3955 struct device *tmp;
3956 struct acpi_dmar_atsr *atsr;
3957 struct dmar_atsr_unit *atsru;
3958
3959 dev = pci_physfn(dev);
3960 for (bus = dev->bus; bus; bus = bus->parent) {
3961 bridge = bus->self;
3962 /* If it's an integrated device, allow ATS */
3963 if (!bridge)
3964 return 1;
3965 /* Connected via non-PCIe: no ATS */
3966 if (!pci_is_pcie(bridge) ||
3967 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3968 return 0;
3969 /* If we found the root port, look it up in the ATSR */
3970 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3971 break;
3972 }
3973
3974 rcu_read_lock();
3975 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3976 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3977 if (atsr->segment != pci_domain_nr(dev->bus))
3978 continue;
3979
3980 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3981 if (tmp == &bridge->dev)
3982 goto out;
3983
3984 if (atsru->include_all)
3985 goto out;
3986 }
3987 ret = 0;
3988out:
3989 rcu_read_unlock();
3990
3991 return ret;
3992}
3993
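/*
 * PCI hotplug notification: keep the cached RMRR, ATSR and SATC device
 * scopes in sync by inserting the device on BUS_NOTIFY_ADD_DEVICE and
 * removing it on BUS_NOTIFY_REMOVED_DEVICE.
 */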
3994int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3995{
3996 int ret;
3997 struct dmar_rmrr_unit *rmrru;
3998 struct dmar_atsr_unit *atsru;
3999 struct dmar_satc_unit *satcu;
4000 struct acpi_dmar_atsr *atsr;
4001 struct acpi_dmar_reserved_memory *rmrr;
4002 struct acpi_dmar_satc *satc;
4003
4004 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4005 return 0;
4006
4007 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4008 rmrr = container_of(rmrru->hdr,
4009 struct acpi_dmar_reserved_memory, header);
4010 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4011 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4012 ((void *)rmrr) + rmrr->header.length,
4013 rmrr->segment, rmrru->devices,
4014 rmrru->devices_cnt);
4015 if (ret < 0)
4016 return ret;
4017 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4018 dmar_remove_dev_scope(info, rmrr->segment,
4019 rmrru->devices, rmrru->devices_cnt);
4020 }
4021 }
4022
4023 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4024 if (atsru->include_all)
4025 continue;
4026
4027 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4028 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4029 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4030 (void *)atsr + atsr->header.length,
4031 atsr->segment, atsru->devices,
4032 atsru->devices_cnt);
4033 if (ret > 0)
4034 break;
4035 else if (ret < 0)
4036 return ret;
4037 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4038 if (dmar_remove_dev_scope(info, atsr->segment,
4039 atsru->devices, atsru->devices_cnt))
4040 break;
4041 }
4042 }
4043 list_for_each_entry(satcu, &dmar_satc_units, list) {
4044 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4045 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4046 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4047 (void *)satc + satc->header.length,
4048 satc->segment, satcu->devices,
4049 satcu->devices_cnt);
4050 if (ret > 0)
4051 break;
4052 else if (ret < 0)
4053 return ret;
4054 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4055 if (dmar_remove_dev_scope(info, satc->segment,
4056 satcu->devices, satcu->devices_cnt))
4057 break;
4058 }
4059 }
4060
4061 return 0;
4062}
4063
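/*
 * Memory hotplug handler for the static identity domain: identity-map
 * a range when it is about to come online, and unmap it (flushing the
 * IOTLB on every active IOMMU) when it goes offline or onlining is
 * cancelled.
 */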
4064static int intel_iommu_memory_notifier(struct notifier_block *nb,
4065 unsigned long val, void *v)
4066{
4067 struct memory_notify *mhp = v;
4068 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4069 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4070 mhp->nr_pages - 1);
4071
4072 switch (val) {
4073 case MEM_GOING_ONLINE:
4074 if (iommu_domain_identity_map(si_domain,
4075 start_vpfn, last_vpfn)) {
4076 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4077 start_vpfn, last_vpfn);
4078 return NOTIFY_BAD;
4079 }
4080 break;
4081
4082 case MEM_OFFLINE:
4083 case MEM_CANCEL_ONLINE:
4084 {
4085 struct dmar_drhd_unit *drhd;
4086 struct intel_iommu *iommu;
4087 struct page *freelist;
4088
4089 freelist = domain_unmap(si_domain,
4090 start_vpfn, last_vpfn,
4091 NULL);
4092
4093 rcu_read_lock();
4094 for_each_active_iommu(iommu, drhd)
4095 iommu_flush_iotlb_psi(iommu, si_domain,
4096 start_vpfn, mhp->nr_pages,
4097 !freelist, 0);
4098 rcu_read_unlock();
4099 dma_free_pagelist(freelist);
4100 }
4101 break;
4102 }
4103
4104 return NOTIFY_OK;
4105}
4106
4107static struct notifier_block intel_iommu_memory_nb = {
4108 .notifier_call = intel_iommu_memory_notifier,
4109 .priority = 0
4110};
4111
4112static void intel_disable_iommus(void)
4113{
4114 struct intel_iommu *iommu = NULL;
4115 struct dmar_drhd_unit *drhd;
4116
4117 for_each_iommu(iommu, drhd)
4118 iommu_disable_translation(iommu);
4119}
4120
4121void intel_iommu_shutdown(void)
4122{
4123 struct dmar_drhd_unit *drhd;
4124 struct intel_iommu *iommu = NULL;
4125
4126 if (no_iommu || dmar_disabled)
4127 return;
4128
4129 down_write(&dmar_global_lock);
4130
4131 /* Disable PMRs explicitly here. */
4132 for_each_iommu(iommu, drhd)
4133 iommu_disable_protect_mem_regions(iommu);
4134
4135 /* Make sure the IOMMUs are switched off */
4136 intel_disable_iommus();
4137
4138 up_write(&dmar_global_lock);
4139}
4140
4141static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4142{
4143 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4144
4145 return container_of(iommu_dev, struct intel_iommu, iommu);
4146}
4147
4148static ssize_t version_show(struct device *dev,
4149 struct device_attribute *attr, char *buf)
4150{
4151 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4152 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4153 return sprintf(buf, "%d:%d\n",
4154 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4155}
4156static DEVICE_ATTR_RO(version);
4157
4158static ssize_t address_show(struct device *dev,
4159 struct device_attribute *attr, char *buf)
4160{
4161 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4162 return sprintf(buf, "%llx\n", iommu->reg_phys);
4163}
4164static DEVICE_ATTR_RO(address);
4165
4166static ssize_t cap_show(struct device *dev,
4167 struct device_attribute *attr, char *buf)
4168{
4169 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4170 return sprintf(buf, "%llx\n", iommu->cap);
4171}
4172static DEVICE_ATTR_RO(cap);
4173
4174static ssize_t ecap_show(struct device *dev,
4175 struct device_attribute *attr, char *buf)
4176{
4177 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4178 return sprintf(buf, "%llx\n", iommu->ecap);
4179}
4180static DEVICE_ATTR_RO(ecap);
4181
4182static ssize_t domains_supported_show(struct device *dev,
4183 struct device_attribute *attr, char *buf)
4184{
4185 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4186 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4187}
4188static DEVICE_ATTR_RO(domains_supported);
4189
4190static ssize_t domains_used_show(struct device *dev,
4191 struct device_attribute *attr, char *buf)
4192{
4193 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4194 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4195 cap_ndoms(iommu->cap)));
4196}
4197static DEVICE_ATTR_RO(domains_used);
4198
4199static struct attribute *intel_iommu_attrs[] = {
4200 &dev_attr_version.attr,
4201 &dev_attr_address.attr,
4202 &dev_attr_cap.attr,
4203 &dev_attr_ecap.attr,
4204 &dev_attr_domains_supported.attr,
4205 &dev_attr_domains_used.attr,
4206 NULL,
4207};
4208
4209static struct attribute_group intel_iommu_group = {
4210 .name = "intel-iommu",
4211 .attrs = intel_iommu_attrs,
4212};
4213
4214const struct attribute_group *intel_iommu_groups[] = {
4215 &intel_iommu_group,
4216 NULL,
4217};
4218
4219static inline bool has_external_pci(void)
4220{
4221 struct pci_dev *pdev = NULL;
4222
4223 for_each_pci_dev(pdev)
4224 if (pdev->external_facing)
4225 return true;
4226
4227 return false;
4228}
4229
4230static int __init platform_optin_force_iommu(void)
4231{
4232 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4233 return 0;
4234
4235 if (no_iommu || dmar_disabled)
4236 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4237
4238 /*
4239 * If Intel-IOMMU is disabled by default, we will apply identity
4240 * map for all devices except those marked as being untrusted.
4241 */
4242 if (dmar_disabled)
4243 iommu_set_default_passthrough(false);
4244
4245 dmar_disabled = 0;
4246 no_iommu = 0;
4247
4248 return 1;
4249}
4250
4251static int __init probe_acpi_namespace_devices(void)
4252{
4253 struct dmar_drhd_unit *drhd;
4254 /* To avoid a -Wunused-but-set-variable warning. */
4255 struct intel_iommu *iommu __maybe_unused;
4256 struct device *dev;
4257 int i, ret = 0;
4258
4259 for_each_active_iommu(iommu, drhd) {
4260 for_each_active_dev_scope(drhd->devices,
4261 drhd->devices_cnt, i, dev) {
4262 struct acpi_device_physical_node *pn;
4263 struct iommu_group *group;
4264 struct acpi_device *adev;
4265
4266 if (dev->bus != &acpi_bus_type)
4267 continue;
4268
4269 adev = to_acpi_device(dev);
4270 mutex_lock(&adev->physical_node_lock);
4271 list_for_each_entry(pn,
4272 &adev->physical_node_list, node) {
4273 group = iommu_group_get(pn->dev);
4274 if (group) {
4275 iommu_group_put(group);
4276 continue;
4277 }
4278
4279 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4280 ret = iommu_probe_device(pn->dev);
4281 if (ret)
4282 break;
4283 }
4284 mutex_unlock(&adev->physical_node_lock);
4285
4286 if (ret)
4287 return ret;
4288 }
4289 }
4290
4291 return 0;
4292}
4293
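/*
 * Top-level driver entry point: parse the DMAR table and device scopes,
 * bail out early (with translation and PMRs disabled) if the IOMMU is
 * not to be used, otherwise run init_dmars(), register each IOMMU with
 * sysfs and the IOMMU core, hook up the memory notifier, and finally
 * enable translation on every unit that was not already pre-enabled.
 */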
4294int __init intel_iommu_init(void)
4295{
4296 int ret = -ENODEV;
4297 struct dmar_drhd_unit *drhd;
4298 struct intel_iommu *iommu;
4299
4300 /*
4301 * Intel IOMMU is required for a TXT/tboot launch or platform
4302 * opt in, so enforce that.
4303 */
4304 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4305 platform_optin_force_iommu();
4306
4307 if (iommu_init_mempool()) {
4308 if (force_on)
4309 panic("tboot: Failed to initialize iommu memory\n");
4310 return -ENOMEM;
4311 }
4312
4313 down_write(&dmar_global_lock);
4314 if (dmar_table_init()) {
4315 if (force_on)
4316 panic("tboot: Failed to initialize DMAR table\n");
4317 goto out_free_dmar;
4318 }
4319
4320 if (dmar_dev_scope_init() < 0) {
4321 if (force_on)
4322 panic("tboot: Failed to initialize DMAR device scope\n");
4323 goto out_free_dmar;
4324 }
4325
4326 up_write(&dmar_global_lock);
4327
4328 /*
4329	 * The bus notifier takes the dmar_global_lock, so lockdep would
4330	 * complain if we registered it while holding the lock.
4331 */
4332 dmar_register_bus_notifier();
4333
4334 down_write(&dmar_global_lock);
4335
4336 if (!no_iommu)
4337 intel_iommu_debugfs_init();
4338
4339 if (no_iommu || dmar_disabled) {
4340 /*
4341 * We exit the function here to ensure IOMMU's remapping and
4342		 * mempool aren't set up, which means that the IOMMU's PMRs
4343 * won't be disabled via the call to init_dmars(). So disable
4344 * it explicitly here. The PMRs were setup by tboot prior to
4345 * calling SENTER, but the kernel is expected to reset/tear
4346 * down the PMRs.
4347 */
4348 if (intel_iommu_tboot_noforce) {
4349 for_each_iommu(iommu, drhd)
4350 iommu_disable_protect_mem_regions(iommu);
4351 }
4352
4353 /*
4354 * Make sure the IOMMUs are switched off, even when we
4355 * boot into a kexec kernel and the previous kernel left
4356 * them enabled
4357 */
4358 intel_disable_iommus();
4359 goto out_free_dmar;
4360 }
4361
4362 if (list_empty(&dmar_rmrr_units))
4363 pr_info("No RMRR found\n");
4364
4365 if (list_empty(&dmar_atsr_units))
4366 pr_info("No ATSR found\n");
4367
4368 if (list_empty(&dmar_satc_units))
4369 pr_info("No SATC found\n");
4370
4371 if (dmar_map_gfx)
4372 intel_iommu_gfx_mapped = 1;
4373
4374 init_no_remapping_devices();
4375
4376 ret = init_dmars();
4377 if (ret) {
4378 if (force_on)
4379 panic("tboot: Failed to initialize DMARs\n");
4380 pr_err("Initialization failed\n");
4381 goto out_free_dmar;
4382 }
4383 up_write(&dmar_global_lock);
4384
4385 init_iommu_pm_ops();
4386
4387 down_read(&dmar_global_lock);
4388 for_each_active_iommu(iommu, drhd) {
4389 /*
4390 * The flush queue implementation does not perform
4391 * page-selective invalidations that are required for efficient
4392 * TLB flushes in virtual environments. The benefit of batching
4393 * is likely to be much lower than the overhead of synchronizing
4394 * the virtual and physical IOMMU page-tables.
4395 */
4396 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
4397			pr_warn("IOMMU batching is disabled due to virtualization\n");
4398 intel_iommu_strict = 1;
4399 }
4400 iommu_device_sysfs_add(&iommu->iommu, NULL,
4401 intel_iommu_groups,
4402 "%s", iommu->name);
4403 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4404 }
4405 up_read(&dmar_global_lock);
4406
4407 iommu_set_dma_strict(intel_iommu_strict);
4408 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4409 if (si_domain && !hw_pass_through)
4410 register_memory_notifier(&intel_iommu_memory_nb);
4411
4412 down_read(&dmar_global_lock);
4413 if (probe_acpi_namespace_devices())
4414 pr_warn("ACPI name space devices didn't probe correctly\n");
4415
4416 /* Finally, we enable the DMA remapping hardware. */
4417 for_each_iommu(iommu, drhd) {
4418 if (!drhd->ignored && !translation_pre_enabled(iommu))
4419 iommu_enable_translation(iommu);
4420
4421 iommu_disable_protect_mem_regions(iommu);
4422 }
4423 up_read(&dmar_global_lock);
4424
4425 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4426
4427 intel_iommu_enabled = 1;
4428
4429 return 0;
4430
4431out_free_dmar:
4432 intel_iommu_free_dmars();
4433 up_write(&dmar_global_lock);
4434 iommu_exit_mempool();
4435 return ret;
4436}
4437
4438static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4439{
4440 struct device_domain_info *info = opaque;
4441
4442 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4443 return 0;
4444}
4445
4446/*
4447 * NB - intel-iommu lacks any sort of reference counting for the users of
4448 * dependent devices. If multiple endpoints have intersecting dependent
4449 * devices, unbinding the driver from any one of them will possibly leave
4450 * the others unable to operate.
4451 */
4452static void domain_context_clear(struct device_domain_info *info)
4453{
4454 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4455 return;
4456
4457 pci_for_each_dma_alias(to_pci_dev(info->dev),
4458 &domain_context_clear_one_cb, info);
4459}
4460
4461static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4462{
4463 struct dmar_domain *domain;
4464 struct intel_iommu *iommu;
4465 unsigned long flags;
4466
4467 assert_spin_locked(&device_domain_lock);
4468
4469 if (WARN_ON(!info))
4470 return;
4471
4472 iommu = info->iommu;
4473 domain = info->domain;
4474
4475 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4476 if (dev_is_pci(info->dev) && sm_supported(iommu))
4477 intel_pasid_tear_down_entry(iommu, info->dev,
4478 PASID_RID2PASID, false);
4479
4480 iommu_disable_dev_iotlb(info);
4481 domain_context_clear(info);
4482 intel_pasid_free_table(info->dev);
4483 }
4484
4485 unlink_domain_info(info);
4486
4487 spin_lock_irqsave(&iommu->lock, flags);
4488 domain_detach_iommu(domain, iommu);
4489 spin_unlock_irqrestore(&iommu->lock, flags);
4490
4491 free_devinfo_mem(info);
4492}
4493
4494static void dmar_remove_one_dev_info(struct device *dev)
4495{
4496 struct device_domain_info *info;
4497 unsigned long flags;
4498
4499 spin_lock_irqsave(&device_domain_lock, flags);
4500 info = get_domain_info(dev);
4501 if (info)
4502 __dmar_remove_one_dev_info(info);
4503 spin_unlock_irqrestore(&device_domain_lock, flags);
4504}
4505
4506static int md_domain_init(struct dmar_domain *domain, int guest_width)
4507{
4508 int adjust_width;
4509
4510 /* calculate AGAW */
4511 domain->gaw = guest_width;
4512 adjust_width = guestwidth_to_adjustwidth(guest_width);
4513 domain->agaw = width_to_agaw(adjust_width);
4514
4515 domain->iommu_coherency = false;
4516 domain->iommu_snooping = false;
4517 domain->iommu_superpage = 0;
4518 domain->max_addr = 0;
4519
4520 /* always allocate the top pgd */
4521 domain->pgd = alloc_pgtable_page(domain->nid);
4522 if (!domain->pgd)
4523 return -ENOMEM;
4524 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4525 return 0;
4526}
4527
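/*
 * IOMMU core callback for domain allocation: DMA and unmanaged domains
 * get a freshly initialized dmar_domain (with a DMA cookie for the DMA
 * case), while identity requests share the single global si_domain.
 */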
4528static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4529{
4530 struct dmar_domain *dmar_domain;
4531 struct iommu_domain *domain;
4532
4533 switch (type) {
4534 case IOMMU_DOMAIN_DMA:
4535 case IOMMU_DOMAIN_UNMANAGED:
4536 dmar_domain = alloc_domain(0);
4537 if (!dmar_domain) {
4538 pr_err("Can't allocate dmar_domain\n");
4539 return NULL;
4540 }
4541 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4542 pr_err("Domain initialization failed\n");
4543 domain_exit(dmar_domain);
4544 return NULL;
4545 }
4546
4547 if (type == IOMMU_DOMAIN_DMA &&
4548 iommu_get_dma_cookie(&dmar_domain->domain))
4549 return NULL;
4550
4551 domain = &dmar_domain->domain;
4552 domain->geometry.aperture_start = 0;
4553 domain->geometry.aperture_end =
4554 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4555 domain->geometry.force_aperture = true;
4556
4557 return domain;
4558 case IOMMU_DOMAIN_IDENTITY:
4559 return &si_domain->domain;
4560 default:
4561 return NULL;
4562 }
4563
4564 return NULL;
4565}
4566
4567static void intel_iommu_domain_free(struct iommu_domain *domain)
4568{
4569 if (domain != &si_domain->domain)
4570 domain_exit(to_dmar_domain(domain));
4571}
4572
4573/*
4574 * Check whether a @domain could be attached to the @dev through the
4575 * aux-domain attach/detach APIs.
4576 */
4577static inline bool
4578is_aux_domain(struct device *dev, struct iommu_domain *domain)
4579{
4580 struct device_domain_info *info = get_domain_info(dev);
4581
4582 return info && info->auxd_enabled &&
4583 domain->type == IOMMU_DOMAIN_UNMANAGED;
4584}
4585
4586static inline struct subdev_domain_info *
4587lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4588{
4589 struct subdev_domain_info *sinfo;
4590
4591 if (!list_empty(&domain->subdevices)) {
4592 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4593 if (sinfo->pdev == dev)
4594 return sinfo;
4595 }
4596 }
4597
4598 return NULL;
4599}
4600
4601static int auxiliary_link_device(struct dmar_domain *domain,
4602 struct device *dev)
4603{
4604 struct device_domain_info *info = get_domain_info(dev);
4605 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4606
4607 assert_spin_locked(&device_domain_lock);
4608 if (WARN_ON(!info))
4609 return -EINVAL;
4610
4611 if (!sinfo) {
4612 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4613 if (!sinfo)
4614 return -ENOMEM;
4615 sinfo->domain = domain;
4616 sinfo->pdev = dev;
4617 list_add(&sinfo->link_phys, &info->subdevices);
4618 list_add(&sinfo->link_domain, &domain->subdevices);
4619 }
4620
4621 return ++sinfo->users;
4622}
4623
4624static int auxiliary_unlink_device(struct dmar_domain *domain,
4625 struct device *dev)
4626{
4627 struct device_domain_info *info = get_domain_info(dev);
4628 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4629 int ret;
4630
4631 assert_spin_locked(&device_domain_lock);
4632 if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4633 return -EINVAL;
4634
4635 ret = --sinfo->users;
4636 if (!ret) {
4637 list_del(&sinfo->link_phys);
4638 list_del(&sinfo->link_domain);
4639 kfree(sinfo);
4640 }
4641
4642 return ret;
4643}
4644
4645static int aux_domain_add_dev(struct dmar_domain *domain,
4646 struct device *dev)
4647{
4648 int ret;
4649 unsigned long flags;
4650 struct intel_iommu *iommu;
4651
4652 iommu = device_to_iommu(dev, NULL, NULL);
4653 if (!iommu)
4654 return -ENODEV;
4655
4656 if (domain->default_pasid <= 0) {
4657 u32 pasid;
4658
4659 /* No private data needed for the default pasid */
4660 pasid = ioasid_alloc(NULL, PASID_MIN,
4661 pci_max_pasids(to_pci_dev(dev)) - 1,
4662 NULL);
4663 if (pasid == INVALID_IOASID) {
4664 pr_err("Can't allocate default pasid\n");
4665 return -ENODEV;
4666 }
4667 domain->default_pasid = pasid;
4668 }
4669
4670 spin_lock_irqsave(&device_domain_lock, flags);
4671 ret = auxiliary_link_device(domain, dev);
4672 if (ret <= 0)
4673 goto link_failed;
4674
4675 /*
4676 * Subdevices from the same physical device can be attached to the
4677 * same domain. For such cases, only the first subdevice attachment
4678 * needs to go through the full steps in this function. So if ret >
4679 * 1, just goto out.
4680 */
4681 if (ret > 1)
4682 goto out;
4683
4684 /*
4685 * iommu->lock must be held to attach domain to iommu and setup the
4686 * pasid entry for second level translation.
4687 */
4688 spin_lock(&iommu->lock);
4689 ret = domain_attach_iommu(domain, iommu);
4690 if (ret)
4691 goto attach_failed;
4692
4693 /* Setup the PASID entry for mediated devices: */
4694 if (domain_use_first_level(domain))
4695 ret = domain_setup_first_level(iommu, domain, dev,
4696 domain->default_pasid);
4697 else
4698 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4699 domain->default_pasid);
4700 if (ret)
4701 goto table_failed;
4702
4703 spin_unlock(&iommu->lock);
4704out:
4705 spin_unlock_irqrestore(&device_domain_lock, flags);
4706
4707 return 0;
4708
4709table_failed:
4710 domain_detach_iommu(domain, iommu);
4711attach_failed:
4712 spin_unlock(&iommu->lock);
4713 auxiliary_unlink_device(domain, dev);
4714link_failed:
4715 spin_unlock_irqrestore(&device_domain_lock, flags);
4716 if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4717 ioasid_put(domain->default_pasid);
4718
4719 return ret;
4720}
4721
4722static void aux_domain_remove_dev(struct dmar_domain *domain,
4723 struct device *dev)
4724{
4725 struct device_domain_info *info;
4726 struct intel_iommu *iommu;
4727 unsigned long flags;
4728
4729 if (!is_aux_domain(dev, &domain->domain))
4730 return;
4731
4732 spin_lock_irqsave(&device_domain_lock, flags);
4733 info = get_domain_info(dev);
4734 iommu = info->iommu;
4735
4736 if (!auxiliary_unlink_device(domain, dev)) {
4737 spin_lock(&iommu->lock);
4738 intel_pasid_tear_down_entry(iommu, dev,
4739 domain->default_pasid, false);
4740 domain_detach_iommu(domain, iommu);
4741 spin_unlock(&iommu->lock);
4742 }
4743
4744 spin_unlock_irqrestore(&device_domain_lock, flags);
4745
4746 if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4747 ioasid_put(domain->default_pasid);
4748}
4749
4750static int prepare_domain_attach_device(struct iommu_domain *domain,
4751 struct device *dev)
4752{
4753 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4754 struct intel_iommu *iommu;
4755 int addr_width;
4756
4757 iommu = device_to_iommu(dev, NULL, NULL);
4758 if (!iommu)
4759 return -ENODEV;
4760
4761 if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4762 !ecap_nest(iommu->ecap)) {
4763		dev_err(dev, "%s: iommu does not support nested translation\n",
4764 iommu->name);
4765 return -EINVAL;
4766 }
4767
4768 /* check if this iommu agaw is sufficient for max mapped address */
4769 addr_width = agaw_to_width(iommu->agaw);
4770 if (addr_width > cap_mgaw(iommu->cap))
4771 addr_width = cap_mgaw(iommu->cap);
4772
4773 if (dmar_domain->max_addr > (1LL << addr_width)) {
4774 dev_err(dev, "%s: iommu width (%d) is not "
4775 "sufficient for the mapped address (%llx)\n",
4776 __func__, addr_width, dmar_domain->max_addr);
4777 return -EFAULT;
4778 }
4779 dmar_domain->gaw = addr_width;
4780
4781 /*
4782	 * Knock out extra page-table levels if the domain's AGAW exceeds what this IOMMU supports
4783 */
4784 while (iommu->agaw < dmar_domain->agaw) {
4785 struct dma_pte *pte;
4786
4787 pte = dmar_domain->pgd;
4788 if (dma_pte_present(pte)) {
4789 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4790 free_pgtable_page(pte);
4791 }
4792 dmar_domain->agaw--;
4793 }
4794
4795 return 0;
4796}
4797
4798static int intel_iommu_attach_device(struct iommu_domain *domain,
4799 struct device *dev)
4800{
4801 int ret;
4802
4803 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4804 device_is_rmrr_locked(dev)) {
4805 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4806 return -EPERM;
4807 }
4808
4809 if (is_aux_domain(dev, domain))
4810 return -EPERM;
4811
4812 /* normally dev is not mapped */
4813 if (unlikely(domain_context_mapped(dev))) {
4814 struct dmar_domain *old_domain;
4815
4816 old_domain = find_domain(dev);
4817 if (old_domain)
4818 dmar_remove_one_dev_info(dev);
4819 }
4820
4821 ret = prepare_domain_attach_device(domain, dev);
4822 if (ret)
4823 return ret;
4824
4825 return domain_add_dev_info(to_dmar_domain(domain), dev);
4826}
4827
4828static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4829 struct device *dev)
4830{
4831 int ret;
4832
4833 if (!is_aux_domain(dev, domain))
4834 return -EPERM;
4835
4836 ret = prepare_domain_attach_device(domain, dev);
4837 if (ret)
4838 return ret;
4839
4840 return aux_domain_add_dev(to_dmar_domain(domain), dev);
4841}
4842
4843static void intel_iommu_detach_device(struct iommu_domain *domain,
4844 struct device *dev)
4845{
4846 dmar_remove_one_dev_info(dev);
4847}
4848
4849static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4850 struct device *dev)
4851{
4852 aux_domain_remove_dev(to_dmar_domain(domain), dev);
4853}
4854
4855#ifdef CONFIG_INTEL_IOMMU_SVM
4856/*
4857 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4858 * VT-d granularity. Invalidation is typically included in the unmap operation
4859 * as a result of DMA or VFIO unmap. However, for assigned devices guest
4860 * owns the first level page tables. Invalidations of translation caches in the
4861 * guest are trapped and passed down to the host.
4862 *
4863 * vIOMMU in the guest will only expose first level page tables, therefore
4864 * we do not support IOTLB granularity for requests without PASID (second level).
4865 *
4866 * For example, to find the VT-d granularity encoding for IOTLB
4867 * type and page selective granularity within PASID:
4868 * X: indexed by iommu cache type
4869 * Y: indexed by enum iommu_inv_granularity
4870 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4871 */
4872
4873static const int
4874inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4875 /*
4876 * PASID based IOTLB invalidation: PASID selective (per PASID),
4877 * page selective (address granularity)
4878 */
4879 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4880 /* PASID based dev TLBs */
4881 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4882 /* PASID cache */
4883 {-EINVAL, -EINVAL, -EINVAL}
4884};
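/*
 * For instance, to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB,
 * IOMMU_INV_GRANU_ADDR) evaluates to QI_GRAN_PSI_PASID, while the
 * -EINVAL slots mark combinations that have no VT-d equivalent and are
 * rejected by the caller.
 */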
4885
4886static inline int to_vtd_granularity(int type, int granu)
4887{
4888 return inv_type_granu_table[type][granu];
4889}
4890
4891static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4892{
4893 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4894
4895	/* VT-d size is encoded as 2^size of 4K pages: 0 for 4K, 9 for 2MB, etc.
4896	 * The IOMMU cache invalidate API passes granu_size in bytes and the
4897	 * number of granules in contiguous memory.
4898 */
4899 return order_base_2(nr_pages);
4900}
4901
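/*
 * Passdown cache invalidation from a vIOMMU: translate the generic
 * iommu_cache_invalidate_info into VT-d queued invalidation requests.
 * Only nested (DOMAIN_FLAG_NESTING_MODE) domains are eligible, since the
 * guest owns the first-level page tables.  IOTLB requests additionally
 * flush the device IOTLB when ATS is enabled, so a guest may treat an
 * IOTLB flush as inclusive of the device TLB.
 */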
static int
intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
                           struct iommu_cache_invalidate_info *inv_info)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        struct device_domain_info *info;
        struct intel_iommu *iommu;
        unsigned long flags;
        int cache_type;
        u8 bus, devfn;
        u16 did, sid;
        int ret = 0;
        u64 size = 0;

        if (!inv_info || !dmar_domain)
                return -EINVAL;

        if (!dev || !dev_is_pci(dev))
                return -ENODEV;

        iommu = device_to_iommu(dev, &bus, &devfn);
        if (!iommu)
                return -ENODEV;

        if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
                return -EINVAL;

        spin_lock_irqsave(&device_domain_lock, flags);
        spin_lock(&iommu->lock);
        info = get_domain_info(dev);
        if (!info) {
                ret = -EINVAL;
                goto out_unlock;
        }
        did = dmar_domain->iommu_did[iommu->seq_id];
        sid = PCI_DEVID(bus, devfn);

        /* Size is only valid in address selective invalidation */
        if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
                size = to_vtd_size(inv_info->granu.addr_info.granule_size,
                                   inv_info->granu.addr_info.nb_granules);

        for_each_set_bit(cache_type,
                         (unsigned long *)&inv_info->cache,
                         IOMMU_CACHE_INV_TYPE_NR) {
                int granu = 0;
                u64 pasid = 0;
                u64 addr = 0;

                granu = to_vtd_granularity(cache_type, inv_info->granularity);
                if (granu == -EINVAL) {
                        pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
                                           cache_type, inv_info->granularity);
                        break;
                }

                /*
                 * PASID is stored in different locations based on the
                 * granularity.
                 */
                if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
                    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
                        pasid = inv_info->granu.pasid_info.pasid;
                else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
                         (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
                        pasid = inv_info->granu.addr_info.pasid;

                switch (BIT(cache_type)) {
                case IOMMU_CACHE_INV_TYPE_IOTLB:
                        /* HW will ignore LSB bits based on address mask */
                        if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
                            size &&
                            (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
                                pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
                                                   inv_info->granu.addr_info.addr, size);
                        }

                        /*
                         * If granu is PASID-selective, address is ignored.
                         * We use npages = -1 to indicate that.
                         */
                        qi_flush_piotlb(iommu, did, pasid,
                                        mm_to_dma_pfn(inv_info->granu.addr_info.addr),
                                        (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
                                        inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);

                        if (!info->ats_enabled)
                                break;
                        /*
                         * Always flush device IOTLB if ATS is enabled. vIOMMU
                         * in the guest may assume IOTLB flush is inclusive,
                         * which is more efficient.
                         */
                        fallthrough;
                case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
                        /*
                         * PASID based device TLB invalidation does not support
                         * IOMMU_INV_GRANU_PASID granularity but only supports
                         * IOMMU_INV_GRANU_ADDR.
                         * The equivalent here is to set the size to the entire
                         * 64-bit address range.  The user only provides PASID
                         * info without address info, so we set addr to 0.
                         */
                        if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
                                size = 64 - VTD_PAGE_SHIFT;
                                addr = 0;
                        } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
                                addr = inv_info->granu.addr_info.addr;
                        }

                        if (info->ats_enabled)
                                qi_flush_dev_iotlb_pasid(iommu, sid,
                                                         info->pfsid, pasid,
                                                         info->ats_qdep, addr,
                                                         size);
                        else
                                pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
                        break;
                default:
                        dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
                                            cache_type);
                        ret = -EINVAL;
                }
        }
out_unlock:
        spin_unlock(&iommu->lock);
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}
#endif

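/*
 * Map a physically contiguous range at @iova: translate IOMMU_READ/WRITE/
 * CACHE into DMA PTE bits, verify that the end of the mapping fits within
 * the domain's address width (gaw), then hand the page-aligned range to
 * __domain_mapping().
 */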
static int intel_iommu_map(struct iommu_domain *domain,
                           unsigned long iova, phys_addr_t hpa,
                           size_t size, int iommu_prot, gfp_t gfp)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        u64 max_addr;
        int prot = 0;

        if (iommu_prot & IOMMU_READ)
                prot |= DMA_PTE_READ;
        if (iommu_prot & IOMMU_WRITE)
                prot |= DMA_PTE_WRITE;
        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
                prot |= DMA_PTE_SNP;

        max_addr = iova + size;
        if (dmar_domain->max_addr < max_addr) {
                u64 end;

                /* check if minimum agaw is sufficient for mapped address */
                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
                if (end < max_addr) {
                        pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
                               __func__, dmar_domain->gaw, max_addr);
                        return -EFAULT;
                }
                dmar_domain->max_addr = max_addr;
        }
        /* Round up size to next multiple of PAGE_SIZE, if it and
           the low bits of hpa would take us onto the next page */
        size = aligned_nrpages(hpa, size);
        return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
                                hpa >> VTD_PAGE_SHIFT, size, prot);
}

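/*
 * Unmap at least @size bytes at @iova.  If the IOVA sits inside a
 * large-page mapping, the whole large page is torn down and the larger
 * size is returned.  Freed page-table pages are collected on the gather
 * freelist; the IOTLB flush itself is deferred to intel_iommu_tlb_sync().
 */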
static size_t intel_iommu_unmap(struct iommu_domain *domain,
                                unsigned long iova, size_t size,
                                struct iommu_iotlb_gather *gather)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        unsigned long start_pfn, last_pfn;
        int level = 0;

        /* Cope with horrid API which requires us to unmap more than the
           size argument if it happens to be a large-page mapping. */
        BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
                size = VTD_PAGE_SIZE << level_to_offset_bits(level);

        start_pfn = iova >> VTD_PAGE_SHIFT;
        last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

        gather->freelist = domain_unmap(dmar_domain, start_pfn,
                                        last_pfn, gather->freelist);

        if (dmar_domain->max_addr == iova + size)
                dmar_domain->max_addr = iova;

        iommu_iotlb_gather_add_page(domain, gather, iova, size);

        return size;
}

static void intel_iommu_tlb_sync(struct iommu_domain *domain,
                                 struct iommu_iotlb_gather *gather)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        unsigned long iova_pfn = IOVA_PFN(gather->start);
        size_t size = gather->end - gather->start;
        unsigned long start_pfn;
        unsigned long nrpages;
        int iommu_id;

        nrpages = aligned_nrpages(gather->start, size);
        start_pfn = mm_to_dma_pfn(iova_pfn);

        for_each_domain_iommu(iommu_id, dmar_domain)
                iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
                                      start_pfn, nrpages, !gather->freelist, 0);

        dma_free_pagelist(gather->freelist);
}

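/*
 * Translate an IOVA back to a physical address by walking the page table.
 * The in-page offset depends on the level of the leaf PTE: a level-1
 * (4KiB) leaf keeps the low 12 bits of the IOVA, a level-2 (2MiB)
 * superpage leaf keeps the low 21 bits, and so on, which is what the
 * BIT_MASK(level_to_offset_bits(level) + VTD_PAGE_SHIFT) arithmetic below
 * computes.
 */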
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
                                            dma_addr_t iova)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        struct dma_pte *pte;
        int level = 0;
        u64 phys = 0;

        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
        if (pte && dma_pte_present(pte))
                phys = dma_pte_addr(pte) +
                        (iova & (BIT_MASK(level_to_offset_bits(level) +
                                          VTD_PAGE_SHIFT) - 1));

        return phys;
}

static bool intel_iommu_capable(enum iommu_cap cap)
{
        if (cap == IOMMU_CAP_CACHE_COHERENCY)
                return domain_update_iommu_snooping(NULL);
        if (cap == IOMMU_CAP_INTR_REMAP)
                return irq_remapping_enabled == 1;

        return false;
}

static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
        struct intel_iommu *iommu;

        iommu = device_to_iommu(dev, NULL, NULL);
        if (!iommu)
                return ERR_PTR(-ENODEV);

        if (translation_pre_enabled(iommu))
                dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);

        return &iommu->iommu;
}

static void intel_iommu_release_device(struct device *dev)
{
        struct intel_iommu *iommu;

        iommu = device_to_iommu(dev, NULL, NULL);
        if (!iommu)
                return;

        dmar_remove_one_dev_info(dev);

        set_dma_ops(dev, NULL);
}

static void intel_iommu_probe_finalize(struct device *dev)
{
        struct iommu_domain *domain = iommu_get_domain_for_dev(dev);

        if (domain && domain->type == IOMMU_DOMAIN_DMA)
                iommu_setup_dma_ops(dev, 0, U64_MAX);
        else
                set_dma_ops(dev, NULL);
}

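/*
 * Report the reserved regions for a device: every RMRR that targets the
 * device (or a device behind the same PCI bridge) becomes a direct-mapped
 * region, an optional 0-16MiB relaxable region covers the legacy floppy
 * workaround for ISA bridges, and the IOAPIC MMIO window is reported as
 * an MSI region.
 */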
static void intel_iommu_get_resv_regions(struct device *device,
                                         struct list_head *head)
{
        int prot = DMA_PTE_READ | DMA_PTE_WRITE;
        struct iommu_resv_region *reg;
        struct dmar_rmrr_unit *rmrr;
        struct device *i_dev;
        int i;

        down_read(&dmar_global_lock);
        for_each_rmrr_units(rmrr) {
                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
                                          i, i_dev) {
                        struct iommu_resv_region *resv;
                        enum iommu_resv_type type;
                        size_t length;

                        if (i_dev != device &&
                            !is_downstream_to_pci_bridge(device, i_dev))
                                continue;

                        length = rmrr->end_address - rmrr->base_address + 1;

                        type = device_rmrr_is_relaxable(device) ?
                                IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

                        resv = iommu_alloc_resv_region(rmrr->base_address,
                                                       length, prot, type);
                        if (!resv)
                                break;

                        list_add_tail(&resv->list, head);
                }
        }
        up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
        if (dev_is_pci(device)) {
                struct pci_dev *pdev = to_pci_dev(device);

                if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
                        reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
                                                      IOMMU_RESV_DIRECT_RELAXABLE);
                        if (reg)
                                list_add_tail(&reg->list, head);
                }
        }
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

        reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
                                      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
                                      0, IOMMU_RESV_MSI);
        if (!reg)
                return;
        list_add_tail(&reg->list, head);
}

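/*
 * Enable PASID support for @dev: mark the device's context entry as
 * PASID enabled (CONTEXT_PASIDE), flush the context cache for that
 * device, and turn on the device-side capabilities via
 * iommu_enable_dev_iotlb() if PASID was not already enabled there.
 */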
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
        struct device_domain_info *info;
        struct context_entry *context;
        struct dmar_domain *domain;
        unsigned long flags;
        u64 ctx_lo;
        int ret;

        domain = find_domain(dev);
        if (!domain)
                return -EINVAL;

        spin_lock_irqsave(&device_domain_lock, flags);
        spin_lock(&iommu->lock);

        ret = -EINVAL;
        info = get_domain_info(dev);
        if (!info || !info->pasid_supported)
                goto out;

        context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
        if (WARN_ON(!context))
                goto out;

        ctx_lo = context[0].lo;

        if (!(ctx_lo & CONTEXT_PASIDE)) {
                ctx_lo |= CONTEXT_PASIDE;
                context[0].lo = ctx_lo;
                wmb();
                iommu->flush.flush_context(iommu,
                                           domain->iommu_did[iommu->seq_id],
                                           PCI_DEVID(info->bus, info->devfn),
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
        }

        /* Enable PASID support in the device, if it wasn't already */
        if (!info->pasid_enabled)
                iommu_enable_dev_iotlb(info);

        ret = 0;

 out:
        spin_unlock(&iommu->lock);
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}

static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
        if (dev_is_pci(dev))
                return pci_device_group(dev);
        return generic_device_group(dev);
}

static int intel_iommu_enable_auxd(struct device *dev)
{
        struct device_domain_info *info;
        struct intel_iommu *iommu;
        unsigned long flags;
        int ret;

        iommu = device_to_iommu(dev, NULL, NULL);
        if (!iommu || dmar_disabled)
                return -EINVAL;

        if (!sm_supported(iommu) || !pasid_supported(iommu))
                return -EINVAL;

        ret = intel_iommu_enable_pasid(iommu, dev);
        if (ret)
                return -ENODEV;

        spin_lock_irqsave(&device_domain_lock, flags);
        info = get_domain_info(dev);
        info->auxd_enabled = 1;
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return 0;
}

static int intel_iommu_disable_auxd(struct device *dev)
{
        struct device_domain_info *info;
        unsigned long flags;

        spin_lock_irqsave(&device_domain_lock, flags);
        info = get_domain_info(dev);
        if (!WARN_ON(!info))
                info->auxd_enabled = 0;
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return 0;
}

static int intel_iommu_enable_sva(struct device *dev)
{
        struct device_domain_info *info = get_domain_info(dev);
        struct intel_iommu *iommu;
        int ret;

        if (!info || dmar_disabled)
                return -EINVAL;

        iommu = info->iommu;
        if (!iommu)
                return -EINVAL;

        if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
                return -ENODEV;

        if (intel_iommu_enable_pasid(iommu, dev))
                return -ENODEV;

        if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
                return -EINVAL;

        ret = iopf_queue_add_device(iommu->iopf_queue, dev);
        if (!ret)
                ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);

        return ret;
}

static int intel_iommu_disable_sva(struct device *dev)
{
        struct device_domain_info *info = get_domain_info(dev);
        struct intel_iommu *iommu = info->iommu;
        int ret;

        ret = iommu_unregister_device_fault_handler(dev);
        if (!ret)
                ret = iopf_queue_remove_device(iommu->iopf_queue, dev);

        return ret;
}

/*
 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
 * specification so that system software and tools can detect endpoint
 * devices supporting Intel Scalable I/O Virtualization without a host
 * driver dependency.
 *
 * Returns the config space offset of the matching extended capability
 * structure, or 0 if the device does not support it.
 */
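/*
 * Layout note, per the PCIe DVSEC definition: the DVSEC Vendor ID sits in
 * the low 16 bits at offset 0x4 from the capability header and the DVSEC
 * ID in the low 16 bits at offset 0x8, which is what the two config reads
 * below pick up.  DVSEC ID 5 is the SIOV DVSEC defined by the Intel spec
 * referenced above.
 */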
static int siov_find_pci_dvsec(struct pci_dev *pdev)
{
        int pos;
        u16 vendor, id;

        pos = pci_find_next_ext_capability(pdev, 0, 0x23);
        while (pos) {
                pci_read_config_word(pdev, pos + 4, &vendor);
                pci_read_config_word(pdev, pos + 8, &id);
                if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
                        return pos;

                pos = pci_find_next_ext_capability(pdev, pos, 0x23);
        }

        return 0;
}

static bool
intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
{
        struct device_domain_info *info = get_domain_info(dev);

        if (feat == IOMMU_DEV_FEAT_AUX) {
                int ret;

                if (!dev_is_pci(dev) || dmar_disabled ||
                    !scalable_mode_support() || !pasid_mode_support())
                        return false;

                ret = pci_pasid_features(to_pci_dev(dev));
                if (ret < 0)
                        return false;

                return !!siov_find_pci_dvsec(to_pci_dev(dev));
        }

        if (feat == IOMMU_DEV_FEAT_IOPF)
                return info && info->pri_supported;

        if (feat == IOMMU_DEV_FEAT_SVA)
                return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
                        info->pasid_supported && info->pri_supported &&
                        info->ats_supported;

        return false;
}

static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
        switch (feat) {
        case IOMMU_DEV_FEAT_AUX:
                return intel_iommu_enable_auxd(dev);

        case IOMMU_DEV_FEAT_IOPF:
                return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;

        case IOMMU_DEV_FEAT_SVA:
                return intel_iommu_enable_sva(dev);

        default:
                return -ENODEV;
        }
}

static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
        switch (feat) {
        case IOMMU_DEV_FEAT_AUX:
                return intel_iommu_disable_auxd(dev);

        case IOMMU_DEV_FEAT_IOPF:
                return 0;

        case IOMMU_DEV_FEAT_SVA:
                return intel_iommu_disable_sva(dev);

        default:
                return -ENODEV;
        }
}

static bool
intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
{
        struct device_domain_info *info = get_domain_info(dev);

        if (feat == IOMMU_DEV_FEAT_AUX)
                return scalable_mode_support() && info && info->auxd_enabled;

        return false;
}

static int
intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        return dmar_domain->default_pasid > 0 ?
                        dmar_domain->default_pasid : -EINVAL;
}

static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
                                           struct device *dev)
{
        return attach_deferred(dev);
}

static int
intel_iommu_enable_nesting(struct iommu_domain *domain)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        unsigned long flags;
        int ret = -ENODEV;

        spin_lock_irqsave(&device_domain_lock, flags);
        if (list_empty(&dmar_domain->devices)) {
                dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
                dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
                ret = 0;
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}

/*
 * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
 */
static bool risky_device(struct pci_dev *pdev)
{
        if (pdev->untrusted) {
                pci_info(pdev,
                         "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
                         pdev->vendor, pdev->device);
                pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
                return true;
        }
        return false;
}

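/*
 * When the IOMMU's page-table walks are not cache coherent, newly written
 * PTEs must be flushed out of the CPU caches before the hardware can see
 * them.  Walk the mapped range and batch the cache flush per page-table
 * page (or per large-page leaf) via domain_flush_cache().
 */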
static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
                             unsigned long clf_pages)
{
        struct dma_pte *first_pte = NULL, *pte = NULL;
        unsigned long lvl_pages = 0;
        int level = 0;

        while (clf_pages > 0) {
                if (!pte) {
                        level = 0;
                        pte = pfn_to_dma_pte(domain, clf_pfn, &level);
                        if (WARN_ON(!pte))
                                return;
                        first_pte = pte;
                        lvl_pages = lvl_to_nr_pages(level);
                }

                if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
                        return;

                clf_pages -= lvl_pages;
                clf_pfn += lvl_pages;
                pte++;

                if (!clf_pages || first_pte_in_page(pte) ||
                    (level > 1 && clf_pages < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }
        }
}

static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
                                       unsigned long iova, size_t size)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        unsigned long pages = aligned_nrpages(iova, size);
        unsigned long pfn = iova >> VTD_PAGE_SHIFT;
        struct intel_iommu *iommu;
        int iommu_id;

        if (!dmar_domain->iommu_coherency)
                clflush_sync_map(dmar_domain, pfn, pages);

        for_each_domain_iommu(iommu_id, dmar_domain) {
                iommu = g_iommus[iommu_id];
                __mapping_notify_one(iommu, dmar_domain, pfn, pages);
        }
}

const struct iommu_ops intel_iommu_ops = {
        .capable                = intel_iommu_capable,
        .domain_alloc           = intel_iommu_domain_alloc,
        .domain_free            = intel_iommu_domain_free,
        .enable_nesting         = intel_iommu_enable_nesting,
        .attach_dev             = intel_iommu_attach_device,
        .detach_dev             = intel_iommu_detach_device,
        .aux_attach_dev         = intel_iommu_aux_attach_device,
        .aux_detach_dev         = intel_iommu_aux_detach_device,
        .aux_get_pasid          = intel_iommu_aux_get_pasid,
        .map                    = intel_iommu_map,
        .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
        .unmap                  = intel_iommu_unmap,
        .flush_iotlb_all        = intel_flush_iotlb_all,
        .iotlb_sync             = intel_iommu_tlb_sync,
        .iova_to_phys           = intel_iommu_iova_to_phys,
        .probe_device           = intel_iommu_probe_device,
        .probe_finalize         = intel_iommu_probe_finalize,
        .release_device         = intel_iommu_release_device,
        .get_resv_regions       = intel_iommu_get_resv_regions,
        .put_resv_regions       = generic_iommu_put_resv_regions,
        .device_group           = intel_iommu_device_group,
        .dev_has_feat           = intel_iommu_dev_has_feat,
        .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
        .dev_enable_feat        = intel_iommu_dev_enable_feat,
        .dev_disable_feat       = intel_iommu_dev_disable_feat,
        .is_attach_deferred     = intel_iommu_is_attach_deferred,
        .def_domain_type        = device_def_domain_type,
        .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
        .cache_invalidate       = intel_iommu_sva_invalidate,
        .sva_bind_gpasid        = intel_svm_bind_gpasid,
        .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
        .sva_bind               = intel_svm_bind,
        .sva_unbind             = intel_svm_unbind,
        .sva_get_pasid          = intel_svm_get_pasid,
        .page_response          = intel_svm_page_response,
#endif
};

static void quirk_iommu_igfx(struct pci_dev *dev)
{
        if (risky_device(dev))
                return;

        pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
        dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
        if (risky_device(dev))
                return;

        /*
         * Mobile 4 Series Chipset neglects to set RWBF capability,
         * but needs it. Same seems to hold for the desktop versions.
         */
        pci_info(dev, "Forcing write-buffer flush capability\n");
        rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)

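/*
 * The quirk below reads the GGC (graphics control) config register at
 * offset 0x52 on these Ironlake (Calpella) era host bridges.  The
 * GGC_MEMORY_* encodings above describe how much memory the BIOS set
 * aside for the GTT and whether a VT-d shadow GTT was allocated (the
 * *_VT variants).  Without a shadow GTT the IGD cannot be translated
 * safely, so graphics is left untranslated; with one, batched IOTLB
 * flushing is disabled instead.
 */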
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
        unsigned short ggc;

        if (risky_device(dev))
                return;

        if (pci_read_config_word(dev, GGC, &ggc))
                return;

        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
                pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
                dmar_map_gfx = 0;
        } else if (dmar_map_gfx) {
                /* we have to ensure the gfx device is idle before we flush */
                pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
                intel_iommu_strict = 1;
        }
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
        unsigned short ver;

        if (!IS_GFX_DEVICE(dev))
                return;

        ver = (dev->device >> 8) & 0xff;
        if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
            ver != 0x4e && ver != 0x8a && ver != 0x98 &&
            ver != 0x9a)
                return;

        if (risky_device(dev))
                return;

        pci_info(dev, "Skip IOMMU disabling for graphics\n");
        iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
        struct pci_dev *pdev;
        uint32_t vtisochctrl;

        /* If there's no Azalia in the system anyway, forget it. */
        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
        if (!pdev)
                return;

        if (risky_device(pdev)) {
                pci_dev_put(pdev);
                return;
        }

        pci_dev_put(pdev);

        /* System Management Registers. Might be hidden, in which case
           we can't do the sanity check. But that's OK, because the
           known-broken BIOSes _don't_ actually hide it, so far. */
        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
        if (!pdev)
                return;

        if (risky_device(pdev)) {
                pci_dev_put(pdev);
                return;
        }

        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
                pci_dev_put(pdev);
                return;
        }

        pci_dev_put(pdev);

        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
        if (vtisochctrl & 1)
                return;

        /* Drop all bits other than the number of TLB entries */
        vtisochctrl &= 0x1c;

        /* If we have the recommended number of TLB entries (16), fine. */
        if (vtisochctrl == 0x10)
                return;

        /* Zero TLB entries? You get to ride the short bus to school. */
        if (!vtisochctrl) {
                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
                     dmi_get_system_info(DMI_BIOS_VENDOR),
                     dmi_get_system_info(DMI_BIOS_VERSION),
                     dmi_get_system_info(DMI_PRODUCT_VERSION));
                iommu_identity_mapping |= IDENTMAP_AZALIA;
                return;
        }

        pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
                vtisochctrl);
}