v6.2
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/crash_dump.h>
  17#include <linux/dma-direct.h>
  18#include <linux/dmi.h>
  19#include <linux/intel-svm.h>
  20#include <linux/memory.h>
  21#include <linux/pci.h>
  22#include <linux/pci-ats.h>
  23#include <linux/spinlock.h>
  24#include <linux/syscore_ops.h>
  25#include <linux/tboot.h>
  26
  27#include "iommu.h"
  28#include "../dma-iommu.h"
  29#include "../irq_remapping.h"
  30#include "../iommu-sva.h"
  31#include "pasid.h"
  32#include "cap_audit.h"
  33
  34#define ROOT_SIZE		VTD_PAGE_SIZE
  35#define CONTEXT_SIZE		VTD_PAGE_SIZE
  36
  37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  41
  42#define IOAPIC_RANGE_START	(0xfee00000)
  43#define IOAPIC_RANGE_END	(0xfeefffff)
  44#define IOVA_START_ADDR		(0x1000)
  45
  46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  47
  48#define MAX_AGAW_WIDTH 64
  49#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  50
  51#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
  52#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
  53
  54/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  55   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  56#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
  57				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  58#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  59
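/*
 * For example, with gaw == DEFAULT_DOMAIN_ADDRESS_WIDTH (57) and 4KiB VT-d
 * pages, __DOMAIN_MAX_PFN(57) is 2^45 - 1 and DOMAIN_MAX_ADDR(57) is just
 * under 128PiB; both fit comfortably in an unsigned long on 64-bit, which
 * is all the comment above requires.
 */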
  60/* IO virtual address start page frame number */
  61#define IOVA_START_PFN		(1)
  62
  63#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
  64
  65/* page table handling */
  66#define LEVEL_STRIDE		(9)
  67#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
  68
  69static inline int agaw_to_level(int agaw)
  70{
  71	return agaw + 2;
  72}
  73
  74static inline int agaw_to_width(int agaw)
  75{
  76	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
  77}
  78
  79static inline int width_to_agaw(int width)
  80{
  81	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
  82}
  83
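/*
 * A worked example of the AGAW helpers above (with LEVEL_STRIDE == 9 as
 * defined earlier in this file):
 *
 *	width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2
 *	agaw_to_level(2)  = 4	(4-level page table, 48-bit input address)
 *	agaw_to_width(2)  = 30 + 2 * 9 = 48
 *
 *	width_to_agaw(57) = 3, agaw_to_level(3) = 5 (5-level, 57-bit input)
 */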
  84static inline unsigned int level_to_offset_bits(int level)
  85{
  86	return (level - 1) * LEVEL_STRIDE;
  87}
  88
  89static inline int pfn_level_offset(u64 pfn, int level)
  90{
  91	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
  92}
  93
  94static inline u64 level_mask(int level)
  95{
  96	return -1ULL << level_to_offset_bits(level);
  97}
  98
  99static inline u64 level_size(int level)
 100{
 101	return 1ULL << level_to_offset_bits(level);
 102}
 103
 104static inline u64 align_to_level(u64 pfn, int level)
 105{
 106	return (pfn + level_size(level) - 1) & level_mask(level);
 107}
 108
 109static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 110{
 111	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 112}
 113
 114/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 115   are never going to work. */
 116static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 117{
 118	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 119}
 120static inline unsigned long page_to_dma_pfn(struct page *pg)
 121{
 122	return mm_to_dma_pfn(page_to_pfn(pg));
 123}
 124static inline unsigned long virt_to_dma_pfn(void *p)
 125{
 126	return page_to_dma_pfn(virt_to_page(p));
 127}
 128
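/*
 * A small worked example of the PFN conversions above: on x86, PAGE_SHIFT
 * and VTD_PAGE_SHIFT are both 12, so mm_to_dma_pfn() is the identity. On a
 * configuration with larger kernel pages, e.g. PAGE_SHIFT == 16, one MM
 * page covers 1 << (16 - 12) = 16 VT-d pages and the PFN scales by that
 * factor, which is why VT-d pages must not be larger than MM pages.
 */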
 129static void __init check_tylersburg_isoch(void);
 130static int rwbf_quirk;
 131
 132/*
 133 * set to 1 to panic kernel if can't successfully enable VT-d
 134 * (used when kernel is launched w/ TXT)
 135 */
 136static int force_on = 0;
 137static int intel_iommu_tboot_noforce;
 138static int no_platform_optin;
 139
 140#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 141
 142/*
 143 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 144 * if marked present.
 145 */
 146static phys_addr_t root_entry_lctp(struct root_entry *re)
 147{
 148	if (!(re->lo & 1))
 149		return 0;
 150
 151	return re->lo & VTD_PAGE_MASK;
 152}
 153
 154/*
 155 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 156 * if marked present.
 157 */
 158static phys_addr_t root_entry_uctp(struct root_entry *re)
 159{
 160	if (!(re->hi & 1))
 161		return 0;
 162
 163	return re->hi & VTD_PAGE_MASK;
 164}
 165
 166static inline void context_set_present(struct context_entry *context)
 167{
 168	context->lo |= 1;
 169}
 170
 171static inline void context_set_fault_enable(struct context_entry *context)
 172{
 173	context->lo &= (((u64)-1) << 2) | 1;
 174}
 175
 176static inline void context_set_translation_type(struct context_entry *context,
 177						unsigned long value)
 178{
 179	context->lo &= (((u64)-1) << 4) | 3;
 180	context->lo |= (value & 3) << 2;
 181}
 182
 183static inline void context_set_address_root(struct context_entry *context,
 184					    unsigned long value)
 185{
 186	context->lo &= ~VTD_PAGE_MASK;
 187	context->lo |= value & VTD_PAGE_MASK;
 188}
 189
 190static inline void context_set_address_width(struct context_entry *context,
 191					     unsigned long value)
 192{
 193	context->hi |= value & 7;
 194}
 195
 196static inline void context_set_domain_id(struct context_entry *context,
 197					 unsigned long value)
 198{
 199	context->hi |= (value & ((1 << 16) - 1)) << 8;
 200}
 201
 202static inline void context_set_pasid(struct context_entry *context)
 203{
 204	context->lo |= CONTEXT_PASIDE;
 205}
 206
 207static inline int context_domain_id(struct context_entry *c)
 208{
 209	return((c->hi >> 8) & 0xffff);
 210}
 211
 212static inline void context_clear_entry(struct context_entry *context)
 213{
 214	context->lo = 0;
 215	context->hi = 0;
 216}
 217
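/*
 * A hedged sketch (illustrative only, not part of the driver) of how the
 * legacy-mode helpers above compose into a complete context entry. The
 * function name and parameters here are made up for illustration;
 * domain_context_mapping_one() further below does the real work, with
 * locking, copied-entry handling and cache flushing.
 */
static inline void example_compose_context_entry(struct context_entry *ce,
						 u16 did, int agaw,
						 u64 pgd_phys)
{
	context_clear_entry(ce);			/* lo = hi = 0	     */
	context_set_domain_id(ce, did);			/* DID in hi[23:8]   */
	context_set_address_width(ce, agaw);		/* AW in hi[2:0]     */
	context_set_address_root(ce, pgd_phys);		/* SLPTPTR, lo[63:12]*/
	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(ce);			/* clear FPD bit     */
	context_set_present(ce);			/* finally, P bit    */
}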
 218static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
 219{
 220	if (!iommu->copied_tables)
 221		return false;
 222
 223	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
 224}
 225
 226static inline void
 227set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
 228{
 229	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
 230}
 231
 232static inline void
 233clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
 234{
 235	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
 236}
 237
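/*
 * The bit index used by the copied-table helpers above is simply the 16-bit
 * PCI source-id, i.e. (bus << 8) | devfn. For example, a device at
 * 0000:3a:02.1 has bus 0x3a and devfn PCI_DEVFN(2, 1) == 0x11, so its
 * "context copied" state lives at bit 0x3a11 of iommu->copied_tables.
 */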
 238/*
 239 * This domain is a static identity mapping domain.
 240 *	1. This domain creates a static 1:1 mapping to all usable memory.
 241 *	2. It maps to each iommu if successful.
 242 *	3. Each iommu maps to this domain if successful.
 243 */
 244static struct dmar_domain *si_domain;
 245static int hw_pass_through = 1;
 246
 247struct dmar_rmrr_unit {
 248	struct list_head list;		/* list of rmrr units	*/
 249	struct acpi_dmar_header *hdr;	/* ACPI header		*/
 250	u64	base_address;		/* reserved base address*/
 251	u64	end_address;		/* reserved end address */
 252	struct dmar_dev_scope *devices;	/* target devices */
 253	int	devices_cnt;		/* target device count */
 254};
 255
 256struct dmar_atsr_unit {
 257	struct list_head list;		/* list of ATSR units */
 258	struct acpi_dmar_header *hdr;	/* ACPI header */
 259	struct dmar_dev_scope *devices;	/* target devices */
 260	int devices_cnt;		/* target device count */
 261	u8 include_all:1;		/* include all ports */
 262};
 263
 264struct dmar_satc_unit {
 265	struct list_head list;		/* list of SATC units */
 266	struct acpi_dmar_header *hdr;	/* ACPI header */
 267	struct dmar_dev_scope *devices;	/* target devices */
 268	struct intel_iommu *iommu;	/* the corresponding iommu */
 269	int devices_cnt;		/* target device count */
 270	u8 atc_required:1;		/* ATS is required */
 271};
 272
 273static LIST_HEAD(dmar_atsr_units);
 274static LIST_HEAD(dmar_rmrr_units);
 275static LIST_HEAD(dmar_satc_units);
 276
 277#define for_each_rmrr_units(rmrr) \
 278	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 279
 280static void device_block_translation(struct device *dev);
 281static void intel_iommu_domain_free(struct iommu_domain *domain);
 282
 283int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
 284int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
 285
 286int intel_iommu_enabled = 0;
 287EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 288
 289static int dmar_map_gfx = 1;
 290static int intel_iommu_superpage = 1;
 291static int iommu_identity_mapping;
 
 292static int iommu_skip_te_disable;
 293
 294#define IDENTMAP_GFX		2
 295#define IDENTMAP_AZALIA		4
 296
 297const struct iommu_ops intel_iommu_ops;
 298
 299static bool translation_pre_enabled(struct intel_iommu *iommu)
 300{
 301	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 302}
 303
 304static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 305{
 306	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 307}
 308
 309static void init_translation_status(struct intel_iommu *iommu)
 310{
 311	u32 gsts;
 312
 313	gsts = readl(iommu->reg + DMAR_GSTS_REG);
 314	if (gsts & DMA_GSTS_TES)
 315		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 316}
 317
 318static int __init intel_iommu_setup(char *str)
 319{
 320	if (!str)
 321		return -EINVAL;
 322
 323	while (*str) {
 324		if (!strncmp(str, "on", 2)) {
 325			dmar_disabled = 0;
 326			pr_info("IOMMU enabled\n");
 327		} else if (!strncmp(str, "off", 3)) {
 328			dmar_disabled = 1;
 329			no_platform_optin = 1;
 330			pr_info("IOMMU disabled\n");
 331		} else if (!strncmp(str, "igfx_off", 8)) {
 332			dmar_map_gfx = 0;
 333			pr_info("Disable GFX device mapping\n");
 334		} else if (!strncmp(str, "forcedac", 8)) {
 335			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
 336			iommu_dma_forcedac = true;
 337		} else if (!strncmp(str, "strict", 6)) {
 338			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
 339			iommu_set_dma_strict();
 340		} else if (!strncmp(str, "sp_off", 6)) {
 341			pr_info("Disable supported super page\n");
 342			intel_iommu_superpage = 0;
 343		} else if (!strncmp(str, "sm_on", 5)) {
 344			pr_info("Enable scalable mode if hardware supports\n");
 345			intel_iommu_sm = 1;
 346		} else if (!strncmp(str, "sm_off", 6)) {
 347			pr_info("Scalable mode is disallowed\n");
 348			intel_iommu_sm = 0;
 349		} else if (!strncmp(str, "tboot_noforce", 13)) {
 350			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 351			intel_iommu_tboot_noforce = 1;
 352		} else {
 353			pr_notice("Unknown option - '%s'\n", str);
 
 354		}
 355
 356		str += strcspn(str, ",");
 357		while (*str == ',')
 358			str++;
 359	}
 360
 361	return 1;
 362}
 363__setup("intel_iommu=", intel_iommu_setup);
 364
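/*
 * The parser above takes a comma-separated option list, so a typical kernel
 * command line (illustrative values) could be:
 *
 *	intel_iommu=on,sm_on,igfx_off
 *
 * which enables the IOMMU, requests scalable mode if the hardware supports
 * it, and leaves the integrated graphics device unmapped.
 */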
 365void *alloc_pgtable_page(int node)
 366{
 367	struct page *page;
 368	void *vaddr = NULL;
 369
 370	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 371	if (page)
 372		vaddr = page_address(page);
 373	return vaddr;
 374}
 375
 376void free_pgtable_page(void *vaddr)
 377{
 378	free_page((unsigned long)vaddr);
 379}
 380
 381static inline int domain_type_is_si(struct dmar_domain *domain)
 382{
 383	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
 384}
 385
 386static inline int domain_pfn_supported(struct dmar_domain *domain,
 387				       unsigned long pfn)
 388{
 389	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 
 390
 391	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 392}
 393
 394/*
 395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 397 * the returned SAGAW.
 398 */
 399static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
 400{
 401	unsigned long fl_sagaw, sl_sagaw;
 
 402
 403	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
 404	sl_sagaw = cap_sagaw(iommu->cap);
 405
 406	/* Second level only. */
 407	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
 408		return sl_sagaw;
 
 409
 410	/* First level only. */
 411	if (!ecap_slts(iommu->ecap))
 412		return fl_sagaw;
 
 413
 414	return fl_sagaw & sl_sagaw;
 415}
 416
 417static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 418{
 419	unsigned long sagaw;
 420	int agaw;
 421
 422	sagaw = __iommu_calculate_sagaw(iommu);
 423	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
 
 424		if (test_bit(agaw, &sagaw))
 425			break;
 426	}
 427
 428	return agaw;
 429}
 430
 431/*
 432 * Calculate max SAGAW for each iommu.
 433 */
 434int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 435{
 436	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 437}
 438
 439/*
 440 * Calculate agaw for each iommu.
 441 * "SAGAW" may be different across iommus, so use the default agaw and
 442 * fall back to a smaller supported agaw for iommus that don't support it.
 443 */
 444int iommu_calculate_agaw(struct intel_iommu *iommu)
 445{
 446	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 447}
 448
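/*
 * A worked example of the AGAW selection above, using the SAGAW encoding
 * from section 11.4.2 of the VT-d spec: if the hardware reports a
 * second-level SAGAW of 0x4 (bit 2, 4-level only) and scalable mode is
 * off, __iommu_calculate_sagaw() returns 0x4. iommu_calculate_agaw() then
 * starts at width_to_agaw(57) == 3, finds bit 3 clear, steps down to
 * agaw 2 (48-bit, 4-level) and returns that.
 */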
 449static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 450{
 451	return sm_supported(iommu) ?
 452			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 453}
 454
 455static void domain_update_iommu_coherency(struct dmar_domain *domain)
 456{
 457	struct iommu_domain_info *info;
 458	struct dmar_drhd_unit *drhd;
 459	struct intel_iommu *iommu;
 460	bool found = false;
 461	unsigned long i;
 462
 463	domain->iommu_coherency = true;
 464	xa_for_each(&domain->iommu_array, i, info) {
 
 465		found = true;
 466		if (!iommu_paging_structure_coherency(info->iommu)) {
 467			domain->iommu_coherency = false;
 468			break;
 469		}
 470	}
 471	if (found)
 472		return;
 473
 474	/* No hardware attached; use lowest common denominator */
 475	rcu_read_lock();
 476	for_each_active_iommu(iommu, drhd) {
 477		if (!iommu_paging_structure_coherency(iommu)) {
 478			domain->iommu_coherency = false;
 479			break;
 480		}
 481	}
 482	rcu_read_unlock();
 483}
 484
 485static int domain_update_iommu_superpage(struct dmar_domain *domain,
 486					 struct intel_iommu *skip)
 487{
 488	struct dmar_drhd_unit *drhd;
 489	struct intel_iommu *iommu;
 490	int mask = 0x3;
 491
 492	if (!intel_iommu_superpage)
 493		return 0;
 
 494
 495	/* set iommu_superpage to the smallest common denominator */
 496	rcu_read_lock();
 497	for_each_active_iommu(iommu, drhd) {
 498		if (iommu != skip) {
 499			if (domain && domain->use_first_level) {
 500				if (!cap_fl1gp_support(iommu->cap))
 501					mask = 0x1;
 502			} else {
 503				mask &= cap_super_page_val(iommu->cap);
 504			}
 505
 506			if (!mask)
 507				break;
 508		}
 509	}
 510	rcu_read_unlock();
 511
 512	return fls(mask);
 513}
 514
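/*
 * The value returned above is fls() of the common superpage mask: a mask
 * of 0x3 (every IOMMU supports both 2MiB and 1GiB pages) yields 2, a mask
 * of 0x1 yields 1 (2MiB only), and 0 means no superpage support at all.
 * domain_super_pgsize_bitmap() below turns that number back into a page
 * size bitmap (SZ_2M, or SZ_2M | SZ_1G).
 */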
 515static int domain_update_device_node(struct dmar_domain *domain)
 516{
 517	struct device_domain_info *info;
 518	int nid = NUMA_NO_NODE;
 519	unsigned long flags;
 520
 521	spin_lock_irqsave(&domain->lock, flags);
 522	list_for_each_entry(info, &domain->devices, link) {
 523		/*
 524		 * There could possibly be multiple device numa nodes as devices
 525		 * within the same domain may sit behind different IOMMUs. There
 526		 * is no perfect answer in such a situation, so we select a first-
 527		 * come, first-served policy.
 528		 */
 529		nid = dev_to_node(info->dev);
 530		if (nid != NUMA_NO_NODE)
 531			break;
 532	}
 533	spin_unlock_irqrestore(&domain->lock, flags);
 534
 535	return nid;
 536}
 537
 538static void domain_update_iotlb(struct dmar_domain *domain);
 539
 540/* Return the super pagesize bitmap if supported. */
 541static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
 542{
 543	unsigned long bitmap = 0;
 544
 545	/*
 546	 * 1-level super page supports page size of 2MiB, 2-level super page
 547	 * supports page size of both 2MiB and 1GiB.
 548	 */
 549	if (domain->iommu_superpage == 1)
 550		bitmap |= SZ_2M;
 551	else if (domain->iommu_superpage == 2)
 552		bitmap |= SZ_2M | SZ_1G;
 553
 554	return bitmap;
 555}
 556
 557/* Some capabilities may be different across iommus */
 558static void domain_update_iommu_cap(struct dmar_domain *domain)
 559{
 560	domain_update_iommu_coherency(domain);
 
 561	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 562
 563	/*
 564	 * If RHSA is missing, we should default to the device numa domain
 565	 * as fall back.
 566	 */
 567	if (domain->nid == NUMA_NO_NODE)
 568		domain->nid = domain_update_device_node(domain);
 569
 570	/*
 571	 * First-level translation restricts the input-address to a
 572	 * canonical address (i.e., address bits 63:N have the same
 573	 * value as address bit [N-1], where N is 48-bits with 4-level
 574	 * paging and 57-bits with 5-level paging). Hence, skip bit
 575	 * [N-1].
 576	 */
 577	if (domain->use_first_level)
 578		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
 579	else
 580		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
 581
 582	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
 583	domain_update_iotlb(domain);
 584}
 585
 586struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 587					 u8 devfn, int alloc)
 588{
 589	struct root_entry *root = &iommu->root_entry[bus];
 590	struct context_entry *context;
 591	u64 *entry;
 592
 593	/*
 594	 * Unless the caller requests to allocate a new entry,
 595	 * returning a copied context entry makes no sense.
 596	 */
 597	if (!alloc && context_copied(iommu, bus, devfn))
 598		return NULL;
 599
 600	entry = &root->lo;
 601	if (sm_supported(iommu)) {
 602		if (devfn >= 0x80) {
 603			devfn -= 0x80;
 604			entry = &root->hi;
 605		}
 606		devfn *= 2;
 607	}
 608	if (*entry & 1)
 609		context = phys_to_virt(*entry & VTD_PAGE_MASK);
 610	else {
 611		unsigned long phy_addr;
 612		if (!alloc)
 613			return NULL;
 614
 615		context = alloc_pgtable_page(iommu->node);
 616		if (!context)
 617			return NULL;
 618
 619		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 620		phy_addr = virt_to_phys((void *)context);
 621		*entry = phy_addr | 1;
 622		__iommu_flush_cache(iommu, entry, sizeof(*entry));
 623	}
 624	return &context[devfn];
 625}
 626
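/*
 * A brief worked example of the devfn folding above: in scalable mode a
 * context entry is 256 bits rather than 128, so one 4KiB table covers only
 * devfn 0-127 and root->hi points to a second table for devfn >= 0x80.
 * For devfn 0x85 the code selects root->hi, rebases to 0x05 and doubles
 * it, so &context[0x0a] addresses the sixth 256-bit scalable-mode entry
 * (counted in 128-bit struct context_entry units).
 */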
 627/**
 628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 629 *				 sub-hierarchy of a candidate PCI-PCI bridge
 630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 631 * @bridge: the candidate PCI-PCI bridge
 632 *
 633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 634 */
 635static bool
 636is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 637{
 638	struct pci_dev *pdev, *pbridge;
 639
 640	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 641		return false;
 642
 643	pdev = to_pci_dev(dev);
 644	pbridge = to_pci_dev(bridge);
 645
 646	if (pbridge->subordinate &&
 647	    pbridge->subordinate->number <= pdev->bus->number &&
 648	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
 649		return true;
 650
 651	return false;
 652}
 653
 654static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 655{
 656	struct dmar_drhd_unit *drhd;
 657	u32 vtbar;
 658	int rc;
 659
 660	/* We know that this device on this chipset has its own IOMMU.
 661	 * If we find it under a different IOMMU, then the BIOS is lying
 662	 * to us. Hope that the IOMMU for this device is actually
 663	 * disabled, and it needs no translation...
 664	 */
 665	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 666	if (rc) {
 667		/* "can't" happen */
 668		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 669		return false;
 670	}
 671	vtbar &= 0xffff0000;
 672
 673	/* we know that this iommu should be at offset 0xa000 from vtbar */
 674	drhd = dmar_find_matched_drhd_unit(pdev);
 675	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 676		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 677		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 678		return true;
 679	}
 680
 681	return false;
 682}
 683
 684static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 685{
 686	if (!iommu || iommu->drhd->ignored)
 687		return true;
 688
 689	if (dev_is_pci(dev)) {
 690		struct pci_dev *pdev = to_pci_dev(dev);
 691
 692		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 693		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 694		    quirk_ioat_snb_local_iommu(pdev))
 695			return true;
 696	}
 697
 698	return false;
 699}
 700
 701struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 702{
 703	struct dmar_drhd_unit *drhd = NULL;
 704	struct pci_dev *pdev = NULL;
 705	struct intel_iommu *iommu;
 706	struct device *tmp;
 707	u16 segment = 0;
 708	int i;
 709
 710	if (!dev)
 711		return NULL;
 712
 713	if (dev_is_pci(dev)) {
 714		struct pci_dev *pf_pdev;
 715
 716		pdev = pci_real_dma_dev(to_pci_dev(dev));
 717
 718		/* VFs aren't listed in scope tables; we need to look up
 719		 * the PF instead to find the IOMMU. */
 720		pf_pdev = pci_physfn(pdev);
 721		dev = &pf_pdev->dev;
 722		segment = pci_domain_nr(pdev->bus);
 723	} else if (has_acpi_companion(dev))
 724		dev = &ACPI_COMPANION(dev)->dev;
 725
 726	rcu_read_lock();
 727	for_each_iommu(iommu, drhd) {
 728		if (pdev && segment != drhd->segment)
 729			continue;
 730
 731		for_each_active_dev_scope(drhd->devices,
 732					  drhd->devices_cnt, i, tmp) {
 733			if (tmp == dev) {
 734				/* For a VF use its original BDF# not that of the PF
 735				 * which we used for the IOMMU lookup. Strictly speaking
 736				 * we could do this for all PCI devices; we only need to
 737				 * get the BDF# from the scope table for ACPI matches. */
 738				if (pdev && pdev->is_virtfn)
 739					goto got_pdev;
 740
 741				if (bus && devfn) {
 742					*bus = drhd->devices[i].bus;
 743					*devfn = drhd->devices[i].devfn;
 744				}
 745				goto out;
 746			}
 747
 748			if (is_downstream_to_pci_bridge(dev, tmp))
 749				goto got_pdev;
 750		}
 751
 752		if (pdev && drhd->include_all) {
 753got_pdev:
 754			if (bus && devfn) {
 755				*bus = pdev->bus->number;
 756				*devfn = pdev->devfn;
 757			}
 758			goto out;
 759		}
 760	}
 761	iommu = NULL;
 762out:
 763	if (iommu_is_dummy(iommu, dev))
 764		iommu = NULL;
 765
 766	rcu_read_unlock();
 767
 768	return iommu;
 769}
 770
 771static void domain_flush_cache(struct dmar_domain *domain,
 772			       void *addr, int size)
 773{
 774	if (!domain->iommu_coherency)
 775		clflush_cache_range(addr, size);
 776}
 777
 778static void free_context_table(struct intel_iommu *iommu)
 779{
 780	struct context_entry *context;
 781	int i;
 
 782
 783	if (!iommu->root_entry)
 784		return;
 785
 786	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 787		context = iommu_context_addr(iommu, i, 0, 0);
 788		if (context)
 789			free_pgtable_page(context);
 790
 791		if (!sm_supported(iommu))
 792			continue;
 793
 794		context = iommu_context_addr(iommu, i, 0x80, 0);
 795		if (context)
 796			free_pgtable_page(context);
 797	}
 798
 
 799	free_pgtable_page(iommu->root_entry);
 800	iommu->root_entry = NULL;
 801}
 802
 803#ifdef CONFIG_DMAR_DEBUG
 804static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
 805			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
 806{
 807	struct dma_pte *pte;
 808	int offset;
 809
 810	while (1) {
 811		offset = pfn_level_offset(pfn, level);
 812		pte = &parent[offset];
 813		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
 814			pr_info("PTE not present at level %d\n", level);
 815			break;
 816		}
 817
 818		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
 819
 820		if (level == 1)
 821			break;
 822
 823		parent = phys_to_virt(dma_pte_addr(pte));
 824		level--;
 825	}
 826}
 827
 828void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
 829			  unsigned long long addr, u32 pasid)
 830{
 831	struct pasid_dir_entry *dir, *pde;
 832	struct pasid_entry *entries, *pte;
 833	struct context_entry *ctx_entry;
 834	struct root_entry *rt_entry;
 835	int i, dir_index, index, level;
 836	u8 devfn = source_id & 0xff;
 837	u8 bus = source_id >> 8;
 838	struct dma_pte *pgtable;
 839
 840	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
 841
 842	/* root entry dump */
 843	rt_entry = &iommu->root_entry[bus];
 844	if (!rt_entry) {
 845		pr_info("root table entry is not present\n");
 846		return;
 847	}
 848
 849	if (sm_supported(iommu))
 850		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
 851			rt_entry->hi, rt_entry->lo);
 852	else
 853		pr_info("root entry: 0x%016llx", rt_entry->lo);
 854
 855	/* context entry dump */
 856	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
 857	if (!ctx_entry) {
 858		pr_info("context table entry is not present\n");
 859		return;
 860	}
 861
 862	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
 863		ctx_entry->hi, ctx_entry->lo);
 864
 865	/* legacy mode does not require PASID entries */
 866	if (!sm_supported(iommu)) {
 867		level = agaw_to_level(ctx_entry->hi & 7);
 868		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 869		goto pgtable_walk;
 870	}
 871
 872	/* get the pointer to pasid directory entry */
 873	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 874	if (!dir) {
 875		pr_info("pasid directory entry is not present\n");
 876		return;
 877	}
 878	/* For request-without-pasid, get the pasid from context entry */
 879	if (intel_iommu_sm && pasid == INVALID_IOASID)
 880		pasid = PASID_RID2PASID;
 881
 882	dir_index = pasid >> PASID_PDE_SHIFT;
 883	pde = &dir[dir_index];
 884	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
 885
 886	/* get the pointer to the pasid table entry */
 887	entries = get_pasid_table_from_pde(pde);
 888	if (!entries) {
 889		pr_info("pasid table entry is not present\n");
 890		return;
 891	}
 892	index = pasid & PASID_PTE_MASK;
 893	pte = &entries[index];
 894	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
 895		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
 896
 897	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
 898		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
 899		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
 900	} else {
 901		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
 902		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
 903	}
 904
 905pgtable_walk:
 906	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
 907}
 908#endif
 909
 910static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 911				      unsigned long pfn, int *target_level)
 912{
 913	struct dma_pte *parent, *pte;
 914	int level = agaw_to_level(domain->agaw);
 915	int offset;
 916
 917	BUG_ON(!domain->pgd);
 918
 919	if (!domain_pfn_supported(domain, pfn))
 920		/* Address beyond IOMMU's addressing capabilities. */
 921		return NULL;
 922
 923	parent = domain->pgd;
 924
 925	while (1) {
 926		void *tmp_page;
 927
 928		offset = pfn_level_offset(pfn, level);
 929		pte = &parent[offset];
 930		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 931			break;
 932		if (level == *target_level)
 933			break;
 934
 935		if (!dma_pte_present(pte)) {
 936			uint64_t pteval;
 937
 938			tmp_page = alloc_pgtable_page(domain->nid);
 939
 940			if (!tmp_page)
 941				return NULL;
 942
 943			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 944			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 945			if (domain->use_first_level)
 946				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
 947
 948			if (cmpxchg64(&pte->val, 0ULL, pteval))
 949				/* Someone else set it while we were thinking; use theirs. */
 950				free_pgtable_page(tmp_page);
 951			else
 952				domain_flush_cache(domain, pte, sizeof(*pte));
 953		}
 954		if (level == 1)
 955			break;
 956
 957		parent = phys_to_virt(dma_pte_addr(pte));
 958		level--;
 959	}
 960
 961	if (!*target_level)
 962		*target_level = level;
 963
 964	return pte;
 965}
 966
 967/* return address's pte at specific level */
 968static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 969					 unsigned long pfn,
 970					 int level, int *large_page)
 971{
 972	struct dma_pte *parent, *pte;
 973	int total = agaw_to_level(domain->agaw);
 974	int offset;
 975
 976	parent = domain->pgd;
 977	while (level <= total) {
 978		offset = pfn_level_offset(pfn, total);
 979		pte = &parent[offset];
 980		if (level == total)
 981			return pte;
 982
 983		if (!dma_pte_present(pte)) {
 984			*large_page = total;
 985			break;
 986		}
 987
 988		if (dma_pte_superpage(pte)) {
 989			*large_page = total;
 990			return pte;
 991		}
 992
 993		parent = phys_to_virt(dma_pte_addr(pte));
 994		total--;
 995	}
 996	return NULL;
 997}
 998
 999/* clear last level pte, a tlb flush should be followed */
1000static void dma_pte_clear_range(struct dmar_domain *domain,
1001				unsigned long start_pfn,
1002				unsigned long last_pfn)
1003{
1004	unsigned int large_page;
1005	struct dma_pte *first_pte, *pte;
1006
1007	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009	BUG_ON(start_pfn > last_pfn);
1010
1011	/* we don't need lock here; nobody else touches the iova range */
1012	do {
1013		large_page = 1;
1014		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015		if (!pte) {
1016			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1017			continue;
1018		}
1019		do {
1020			dma_clear_pte(pte);
1021			start_pfn += lvl_to_nr_pages(large_page);
1022			pte++;
1023		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024
1025		domain_flush_cache(domain, first_pte,
1026				   (void *)pte - (void *)first_pte);
1027
1028	} while (start_pfn && start_pfn <= last_pfn);
1029}
1030
1031static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032			       int retain_level, struct dma_pte *pte,
1033			       unsigned long pfn, unsigned long start_pfn,
1034			       unsigned long last_pfn)
1035{
1036	pfn = max(start_pfn, pfn);
1037	pte = &pte[pfn_level_offset(pfn, level)];
1038
1039	do {
1040		unsigned long level_pfn;
1041		struct dma_pte *level_pte;
1042
1043		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044			goto next;
1045
1046		level_pfn = pfn & level_mask(level);
1047		level_pte = phys_to_virt(dma_pte_addr(pte));
1048
1049		if (level > 2) {
1050			dma_pte_free_level(domain, level - 1, retain_level,
1051					   level_pte, level_pfn, start_pfn,
1052					   last_pfn);
1053		}
1054
1055		/*
1056		 * Free the page table if we're below the level we want to
1057		 * retain and the range covers the entire table.
1058		 */
1059		if (level < retain_level && !(start_pfn > level_pfn ||
1060		      last_pfn < level_pfn + level_size(level) - 1)) {
1061			dma_clear_pte(pte);
1062			domain_flush_cache(domain, pte, sizeof(*pte));
1063			free_pgtable_page(level_pte);
1064		}
1065next:
1066		pfn += level_size(level);
1067	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068}
1069
1070/*
1071 * clear last level (leaf) ptes and free page table pages below the
1072 * level we wish to keep intact.
1073 */
1074static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075				   unsigned long start_pfn,
1076				   unsigned long last_pfn,
1077				   int retain_level)
1078{
1079	dma_pte_clear_range(domain, start_pfn, last_pfn);
1080
1081	/* We don't need lock here; nobody else touches the iova range */
1082	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1083			   domain->pgd, 0, start_pfn, last_pfn);
1084
1085	/* free pgd */
1086	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087		free_pgtable_page(domain->pgd);
1088		domain->pgd = NULL;
1089	}
1090}
1091
1092/* When a page at a given level is being unlinked from its parent, we don't
1093   need to *modify* it at all. All we need to do is make a list of all the
1094   pages which can be freed just as soon as we've flushed the IOTLB and we
1095   know the hardware page-walk will no longer touch them.
1096   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1097   be freed. */
1098static void dma_pte_list_pagetables(struct dmar_domain *domain,
1099				    int level, struct dma_pte *pte,
1100				    struct list_head *freelist)
1101{
1102	struct page *pg;
1103
1104	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1105	list_add_tail(&pg->lru, freelist);
 
1106
1107	if (level == 1)
1108		return;
1109
1110	pte = page_address(pg);
1111	do {
1112		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1113			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
 
1114		pte++;
1115	} while (!first_pte_in_page(pte));
1116}
1117
1118static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1119				struct dma_pte *pte, unsigned long pfn,
1120				unsigned long start_pfn, unsigned long last_pfn,
1121				struct list_head *freelist)
 
1122{
1123	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1124
1125	pfn = max(start_pfn, pfn);
1126	pte = &pte[pfn_level_offset(pfn, level)];
1127
1128	do {
1129		unsigned long level_pfn = pfn & level_mask(level);
1130
1131		if (!dma_pte_present(pte))
1132			goto next;
1133
1134		/* If range covers entire pagetable, free it */
1135		if (start_pfn <= level_pfn &&
1136		    last_pfn >= level_pfn + level_size(level) - 1) {
1137			/* These subordinate page tables are going away entirely. Don't
1138			   bother to clear them; we're just going to *free* them. */
1139			if (level > 1 && !dma_pte_superpage(pte))
1140				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1141
1142			dma_clear_pte(pte);
1143			if (!first_pte)
1144				first_pte = pte;
1145			last_pte = pte;
1146		} else if (level > 1) {
1147			/* Recurse down into a level that isn't *entirely* obsolete */
1148			dma_pte_clear_level(domain, level - 1,
1149					    phys_to_virt(dma_pte_addr(pte)),
1150					    level_pfn, start_pfn, last_pfn,
1151					    freelist);
1152		}
1153next:
1154		pfn = level_pfn + level_size(level);
1155	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1156
1157	if (first_pte)
1158		domain_flush_cache(domain, first_pte,
1159				   (void *)++last_pte - (void *)first_pte);
1160}
1161
1162/* We can't just free the pages because the IOMMU may still be walking
1163   the page tables, and may have cached the intermediate levels. The
1164   pages can only be freed after the IOTLB flush has been done. */
1165static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1166			 unsigned long last_pfn, struct list_head *freelist)
 
1167{
1168	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1169	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1170	BUG_ON(start_pfn > last_pfn);
1171
1172	/* we don't need lock here; nobody else touches the iova range */
1173	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1174			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1175
1176	/* free pgd */
1177	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178		struct page *pgd_page = virt_to_page(domain->pgd);
1179		list_add_tail(&pgd_page->lru, freelist);
1180		domain->pgd = NULL;
1181	}
1182}
1183
1184/* iommu handling */
1185static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186{
1187	struct root_entry *root;
 
1188
1189	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190	if (!root) {
1191		pr_err("Allocating root entry for %s failed\n",
1192			iommu->name);
1193		return -ENOMEM;
1194	}
1195
1196	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1197	iommu->root_entry = root;
 
1198
1199	return 0;
1200}
1201
1202static void iommu_set_root_entry(struct intel_iommu *iommu)
1203{
1204	u64 addr;
1205	u32 sts;
1206	unsigned long flag;
1207
1208	addr = virt_to_phys(iommu->root_entry);
1209	if (sm_supported(iommu))
1210		addr |= DMA_RTADDR_SMT;
1211
1212	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1214
1215	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1216
1217	/* Make sure hardware complete it */
1218	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1219		      readl, (sts & DMA_GSTS_RTPS), sts);
1220
1221	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222
1223	/*
1224	 * Hardware invalidates all DMA remapping hardware translation
1225	 * caches as part of SRTP flow.
1226	 */
1227	if (cap_esrtps(iommu->cap))
1228		return;
1229
1230	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1231	if (sm_supported(iommu))
1232		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1233	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1234}
1235
1236void iommu_flush_write_buffer(struct intel_iommu *iommu)
1237{
1238	u32 val;
1239	unsigned long flag;
1240
1241	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1242		return;
1243
1244	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1245	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1246
1247	/* Make sure hardware complete it */
1248	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1249		      readl, (!(val & DMA_GSTS_WBFS)), val);
1250
1251	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1252}
1253
1254/* return value determines if we need a write buffer flush */
1255static void __iommu_flush_context(struct intel_iommu *iommu,
1256				  u16 did, u16 source_id, u8 function_mask,
1257				  u64 type)
1258{
1259	u64 val = 0;
1260	unsigned long flag;
1261
1262	switch (type) {
1263	case DMA_CCMD_GLOBAL_INVL:
1264		val = DMA_CCMD_GLOBAL_INVL;
1265		break;
1266	case DMA_CCMD_DOMAIN_INVL:
1267		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1268		break;
1269	case DMA_CCMD_DEVICE_INVL:
1270		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1271			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1272		break;
1273	default:
1274		BUG();
1275	}
1276	val |= DMA_CCMD_ICC;
1277
1278	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1279	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1280
1281	/* Make sure hardware complete it */
1282	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1283		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1284
1285	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1286}
1287
1288/* return value determines if we need a write buffer flush */
1289static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1290				u64 addr, unsigned int size_order, u64 type)
1291{
1292	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1293	u64 val = 0, val_iva = 0;
1294	unsigned long flag;
1295
1296	switch (type) {
1297	case DMA_TLB_GLOBAL_FLUSH:
1298		/* global flush doesn't need to set IVA_REG */
1299		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1300		break;
1301	case DMA_TLB_DSI_FLUSH:
1302		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303		break;
1304	case DMA_TLB_PSI_FLUSH:
1305		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306		/* IH bit is passed in as part of address */
1307		val_iva = size_order | addr;
1308		break;
1309	default:
1310		BUG();
1311	}
1312	/* Note: set drain read/write */
1313#if 0
1314	/*
1315	 * This is probably meant to be super secure. Looks like we can
1316	 * ignore it without any impact.
1317	 */
1318	if (cap_read_drain(iommu->cap))
1319		val |= DMA_TLB_READ_DRAIN;
1320#endif
1321	if (cap_write_drain(iommu->cap))
1322		val |= DMA_TLB_WRITE_DRAIN;
1323
1324	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325	/* Note: Only uses first TLB reg currently */
1326	if (val_iva)
1327		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330	/* Make sure hardware complete it */
1331	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336	/* check IOTLB invalidation granularity */
1337	if (DMA_TLB_IAIG(val) == 0)
1338		pr_err("Flush IOTLB failed\n");
1339	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340		pr_debug("TLB flush request %Lx, actual %Lx\n",
1341			(unsigned long long)DMA_TLB_IIRG(type),
1342			(unsigned long long)DMA_TLB_IAIG(val));
1343}
1344
1345static struct device_domain_info *
1346domain_lookup_dev_info(struct dmar_domain *domain,
1347		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1348{
1349	struct device_domain_info *info;
1350	unsigned long flags;
1351
1352	spin_lock_irqsave(&domain->lock, flags);
1353	list_for_each_entry(info, &domain->devices, link) {
1354		if (info->iommu == iommu && info->bus == bus &&
1355		    info->devfn == devfn) {
1356			spin_unlock_irqrestore(&domain->lock, flags);
1357			return info;
 
1358		}
1359	}
1360	spin_unlock_irqrestore(&domain->lock, flags);
1361
1362	return NULL;
1363}
1364
1365static void domain_update_iotlb(struct dmar_domain *domain)
1366{
1367	struct device_domain_info *info;
1368	bool has_iotlb_device = false;
1369	unsigned long flags;
1370
1371	spin_lock_irqsave(&domain->lock, flags);
 
1372	list_for_each_entry(info, &domain->devices, link) {
1373		if (info->ats_enabled) {
1374			has_iotlb_device = true;
1375			break;
1376		}
1377	}
1378	domain->has_iotlb_device = has_iotlb_device;
1379	spin_unlock_irqrestore(&domain->lock, flags);
1380}
1381
1382/*
1383 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1384 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1385 * check because it applies only to the built-in QAT devices and it doesn't
1386 * grant additional privileges.
1387 */
1388#define BUGGY_QAT_DEVID_MASK 0x4940
1389static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1390{
1391	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1392		return false;
1393
1394	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1395		return false;
1396
1397	return true;
1398}
1399
1400static void iommu_enable_pci_caps(struct device_domain_info *info)
1401{
1402	struct pci_dev *pdev;
1403
1404	if (!dev_is_pci(info->dev))
1405		return;
1406
1407	pdev = to_pci_dev(info->dev);
1408	/* For IOMMUs that support device IOTLB throttling (DIT), we assign
1409	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1410	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1411	 * reserved, which should be set to 0.
1412	 */
1413	if (!ecap_dit(info->iommu->ecap))
1414		info->pfsid = 0;
1415	else {
1416		struct pci_dev *pf_pdev;
1417
1418		/* pdev will be returned if device is not a vf */
1419		pf_pdev = pci_physfn(pdev);
1420		info->pfsid = pci_dev_id(pf_pdev);
1421	}
1422
 
1423	/* The PCIe spec, in its wisdom, declares that the behaviour of
1424	   the device if you enable PASID support after ATS support is
1425	   undefined. So always enable PASID support on devices which
1426	   have it, even if we can't yet know if we're ever going to
1427	   use it. */
1428	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1429		info->pasid_enabled = 1;
1430
1431	if (info->pri_supported &&
1432	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1433	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1434		info->pri_enabled = 1;
1435
1436	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1437	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1438		info->ats_enabled = 1;
1439		domain_update_iotlb(info->domain);
1440		info->ats_qdep = pci_ats_queue_depth(pdev);
1441	}
1442}
1443
1444static void iommu_disable_pci_caps(struct device_domain_info *info)
1445{
1446	struct pci_dev *pdev;
1447
1448	if (!dev_is_pci(info->dev))
1449		return;
1450
1451	pdev = to_pci_dev(info->dev);
1452
1453	if (info->ats_enabled) {
1454		pci_disable_ats(pdev);
1455		info->ats_enabled = 0;
1456		domain_update_iotlb(info->domain);
1457	}
1458
1459	if (info->pri_enabled) {
1460		pci_disable_pri(pdev);
1461		info->pri_enabled = 0;
1462	}
1463
1464	if (info->pasid_enabled) {
1465		pci_disable_pasid(pdev);
1466		info->pasid_enabled = 0;
1467	}
 
1468}
1469
1470static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1471				    u64 addr, unsigned int mask)
1472{
1473	u16 sid, qdep;
1474
1475	if (!info || !info->ats_enabled)
1476		return;
1477
1478	sid = info->bus << 8 | info->devfn;
1479	qdep = info->ats_qdep;
1480	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481			   qdep, addr, mask);
1482	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1483}
1484
1485static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486				  u64 addr, unsigned mask)
 
1487{
1488	struct device_domain_info *info;
1489	unsigned long flags;
1490
1491	if (!domain->has_iotlb_device)
1492		return;
 
1493
1494	spin_lock_irqsave(&domain->lock, flags);
1495	list_for_each_entry(info, &domain->devices, link)
1496		__iommu_flush_dev_iotlb(info, addr, mask);
1497	spin_unlock_irqrestore(&domain->lock, flags);
1498}
1499
1500static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501				  struct dmar_domain *domain,
1502				  unsigned long pfn, unsigned int pages,
1503				  int ih, int map)
1504{
1505	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506	unsigned int mask = ilog2(aligned_pages);
1507	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508	u16 did = domain_id_iommu(domain, iommu);
1509
1510	BUG_ON(pages == 0);
1511
1512	if (ih)
1513		ih = 1 << 6;
1514
1515	if (domain->use_first_level) {
1516		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517	} else {
1518		unsigned long bitmask = aligned_pages - 1;
1519
1520		/*
1521		 * PSI masks the low order bits of the base address. If the
1522		 * address isn't aligned to the mask, then compute a mask value
1523		 * needed to ensure the target range is flushed.
1524		 */
1525		if (unlikely(bitmask & pfn)) {
1526			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527
1528			/*
1529			 * Since end_pfn <= pfn + bitmask, the only way bits
1530			 * higher than bitmask can differ in pfn and end_pfn is
1531			 * by carrying. This means after masking out bitmask,
1532			 * high bits starting with the first set bit in
1533			 * shared_bits are all equal in both pfn and end_pfn.
1534			 */
1535			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537		}
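		/*
		 * Worked example: pfn = 0x1003 and pages = 2 give bitmask = 1,
		 * which overlaps pfn, so end_pfn = 0x1004 and
		 * shared_bits = ~(0x1003 ^ 0x1004) & ~1UL, whose lowest set
		 * bit is bit 3. The resulting mask of 3 makes the PSI below
		 * cover pfn 0x1000-0x1007, which safely includes the
		 * requested 0x1003-0x1004 range.
		 */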
1538
1539		/*
1540		 * Fallback to domain selective flush if no PSI support or
1541		 * the size is too big.
1542		 */
1543		if (!cap_pgsel_inv(iommu->cap) ||
1544		    mask > cap_max_amask_val(iommu->cap))
1545			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546							DMA_TLB_DSI_FLUSH);
1547		else
1548			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549							DMA_TLB_PSI_FLUSH);
1550	}
1551
1552	/*
1553	 * In caching mode, changes of pages from non-present to present require
1554	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1555	 */
1556	if (!cap_caching_mode(iommu->cap) || !map)
1557		iommu_flush_dev_iotlb(domain, addr, mask);
1558}
1559
1560/* Notification for newly created mappings */
1561static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562					struct dmar_domain *domain,
1563					unsigned long pfn, unsigned int pages)
1564{
1565	/*
1566	 * It's a non-present to present mapping. Only flush if caching mode
1567	 * and second level.
1568	 */
1569	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1570		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571	else
1572		iommu_flush_write_buffer(iommu);
1573}
1574
1575static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576{
1577	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578	struct iommu_domain_info *info;
1579	unsigned long idx;
1580
1581	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582		struct intel_iommu *iommu = info->iommu;
1583		u16 did = domain_id_iommu(dmar_domain, iommu);
1584
1585		if (dmar_domain->use_first_level)
1586			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587		else
1588			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589						 DMA_TLB_DSI_FLUSH);
1590
1591		if (!cap_caching_mode(iommu->cap))
1592			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
 
1593	}
1594}
1595
1596static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597{
1598	u32 pmen;
1599	unsigned long flags;
1600
1601	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602		return;
1603
1604	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606	pmen &= ~DMA_PMEN_EPM;
1607	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608
1609	/* wait for the protected region status bit to clear */
1610	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611		readl, !(pmen & DMA_PMEN_PRS), pmen);
1612
1613	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614}
1615
1616static void iommu_enable_translation(struct intel_iommu *iommu)
1617{
1618	u32 sts;
1619	unsigned long flags;
1620
1621	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622	iommu->gcmd |= DMA_GCMD_TE;
1623	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624
1625	/* Make sure hardware complete it */
1626	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627		      readl, (sts & DMA_GSTS_TES), sts);
1628
1629	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630}
1631
1632static void iommu_disable_translation(struct intel_iommu *iommu)
1633{
1634	u32 sts;
1635	unsigned long flag;
1636
1637	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639		return;
1640
1641	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642	iommu->gcmd &= ~DMA_GCMD_TE;
1643	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644
1645	/* Make sure hardware complete it */
1646	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647		      readl, (!(sts & DMA_GSTS_TES)), sts);
1648
1649	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650}
1651
1652static int iommu_init_domains(struct intel_iommu *iommu)
1653{
1654	u32 ndomains;
 
1655
1656	ndomains = cap_ndoms(iommu->cap);
1657	pr_debug("%s: Number of Domains supported <%d>\n",
1658		 iommu->name, ndomains);
 
1659
1660	spin_lock_init(&iommu->lock);
1661
1662	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663	if (!iommu->domain_ids)
1664		return -ENOMEM;
 
1665
1666	/*
1667	 * If Caching mode is set, then invalid translations are tagged
1668	 * with domain-id 0, hence we need to pre-allocate it. We also
1669	 * use domain-id 0 as a marker for non-allocated domain-id, so
1670	 * make sure it is not used for a real domain.
1671	 */
1672	set_bit(0, iommu->domain_ids);
1673
1674	/*
1675	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1676	 * entry for first-level or pass-through translation modes should
1677	 * be programmed with a domain id different from those used for
1678	 * second-level or nested translation. We reserve a domain id for
1679	 * this purpose.
1680	 */
1681	if (sm_supported(iommu))
1682		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683
1684	return 0;
1685}
1686
1687static void disable_dmar_iommu(struct intel_iommu *iommu)
1688{
1689	if (!iommu->domain_ids)
1690		return;
1691
1692	/*
1693	 * All iommu domains must have been detached from the devices,
1694	 * hence there should be no domain IDs in use.
1695	 */
1696	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697		    > NUM_RESERVED_DID))
1698		return;
1699
1700	if (iommu->gcmd & DMA_GCMD_TE)
1701		iommu_disable_translation(iommu);
1702}
1703
1704static void free_dmar_iommu(struct intel_iommu *iommu)
1705{
1706	if (iommu->domain_ids) {
1707		bitmap_free(iommu->domain_ids);
1708		iommu->domain_ids = NULL;
1709	}
1710
1711	if (iommu->copied_tables) {
1712		bitmap_free(iommu->copied_tables);
1713		iommu->copied_tables = NULL;
1714	}
1715
1716	/* free context mapping */
1717	free_context_table(iommu);
1718
1719#ifdef CONFIG_INTEL_IOMMU_SVM
1720	if (pasid_supported(iommu)) {
1721		if (ecap_prs(iommu->ecap))
1722			intel_svm_finish_prq(iommu);
1723	}
1724	if (vccap_pasid(iommu->vccap))
1725		ioasid_unregister_allocator(&iommu->pasid_allocator);
1726
1727#endif
1728}
1729
1730/*
1731 * Check and return whether first level is used by default for
1732 * DMA translation.
1733 */
1734static bool first_level_by_default(unsigned int type)
1735{
1736	/* Only SL is available in legacy mode */
1737	if (!scalable_mode_support())
1738		return false;
1739
1740	/* Only one level (either FL or SL) is available, just use it */
1741	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742		return intel_cap_flts_sanity();
1743
1744	/* Both levels are available, decide it based on domain type */
1745	return type != IOMMU_DOMAIN_UNMANAGED;
1746}
1747
1748static struct dmar_domain *alloc_domain(unsigned int type)
1749{
1750	struct dmar_domain *domain;
1751
1752	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753	if (!domain)
1754		return NULL;
1755
 
1756	domain->nid = NUMA_NO_NODE;
1757	if (first_level_by_default(type))
1758		domain->use_first_level = true;
 
1759	domain->has_iotlb_device = false;
1760	INIT_LIST_HEAD(&domain->devices);
1761	spin_lock_init(&domain->lock);
1762	xa_init(&domain->iommu_array);
1763
1764	return domain;
1765}
1766
 
1767static int domain_attach_iommu(struct dmar_domain *domain,
1768			       struct intel_iommu *iommu)
1769{
1770	struct iommu_domain_info *info, *curr;
1771	unsigned long ndomains;
1772	int num, ret = -ENOSPC;
1773
1774	info = kzalloc(sizeof(*info), GFP_KERNEL);
1775	if (!info)
1776		return -ENOMEM;
1777
1778	spin_lock(&iommu->lock);
1779	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780	if (curr) {
1781		curr->refcnt++;
1782		spin_unlock(&iommu->lock);
1783		kfree(info);
1784		return 0;
1785	}
1786
1787	ndomains = cap_ndoms(iommu->cap);
1788	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789	if (num >= ndomains) {
1790		pr_err("%s: No free domain ids\n", iommu->name);
1791		goto err_unlock;
1792	}
1793
1794	set_bit(num, iommu->domain_ids);
1795	info->refcnt	= 1;
1796	info->did	= num;
1797	info->iommu	= iommu;
1798	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799			  NULL, info, GFP_ATOMIC);
1800	if (curr) {
1801		ret = xa_err(curr) ? : -EBUSY;
1802		goto err_clear;
1803	}
1804	domain_update_iommu_cap(domain);
1805
1806	spin_unlock(&iommu->lock);
1807	return 0;
1808
1809err_clear:
1810	clear_bit(info->did, iommu->domain_ids);
1811err_unlock:
1812	spin_unlock(&iommu->lock);
1813	kfree(info);
1814	return ret;
1815}
1816
1817static void domain_detach_iommu(struct dmar_domain *domain,
1818				struct intel_iommu *iommu)
1819{
1820	struct iommu_domain_info *info;
1821
1822	spin_lock(&iommu->lock);
1823	info = xa_load(&domain->iommu_array, iommu->seq_id);
1824	if (--info->refcnt == 0) {
1825		clear_bit(info->did, iommu->domain_ids);
1826		xa_erase(&domain->iommu_array, iommu->seq_id);
1827		domain->nid = NUMA_NO_NODE;
1828		domain_update_iommu_cap(domain);
1829		kfree(info);
1830	}
1831	spin_unlock(&iommu->lock);
1832}
1833
1834static inline int guestwidth_to_adjustwidth(int gaw)
1835{
1836	int agaw;
1837	int r = (gaw - 12) % 9;
1838
1839	if (r == 0)
1840		agaw = gaw;
1841	else
1842		agaw = gaw + 9 - r;
1843	if (agaw > 64)
1844		agaw = 64;
1845	return agaw;
1846}
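/*
 * Worked examples of the rounding above: a guest width of 48 already sits
 * on a 9-bit step above 12 ((48 - 12) % 9 == 0) and is returned unchanged,
 * while a guest width of 36 gives r == 6 and is rounded up to 39, the next
 * width the page-table format can actually express.
 */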
1847
1848static void domain_exit(struct dmar_domain *domain)
1849{
1850	if (domain->pgd) {
1851		LIST_HEAD(freelist);
1852
1853		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854		put_pages_list(&freelist);
1855	}
1856
1857	if (WARN_ON(!list_empty(&domain->devices)))
1858		return;
1859
1860	kfree(domain);
1861}
1862
1863/*
1864 * Get the PASID directory size for scalable mode context entry.
1865 * Value of X in the PDTS field of a scalable mode context entry
1866 * indicates PASID directory with 2^(X + 7) entries.
1867 */
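/*
 * For example, assuming max_pasid is a power of two: if max_pde works out
 * to 1 << 14, find_first_bit() yields 14 and this returns 7, which encodes
 * a PASID directory with 2^(7 + 7) = 16384 entries.
 */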
1868static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869{
1870	unsigned long pds, max_pde;
1871
1872	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874	if (pds < 7)
1875		return 0;
1876
1877	return pds - 7;
1878}
1879
1880/*
1881 * Set the RID_PASID field of a scalable mode context entry. The
1882 * IOMMU hardware will use the PASID value set in this field for
1883 * DMA translations of DMA requests without PASID.
1884 */
1885static inline void
1886context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887{
1888	context->hi |= pasid & ((1 << 20) - 1);
1889}
1890
1891/*
1892 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893 * entry.
1894 */
1895static inline void context_set_sm_dte(struct context_entry *context)
1896{
1897	context->lo |= (1 << 2);
1898}
1899
1900/*
1901 * Set the PRE(Page Request Enable) field of a scalable mode context
1902 * entry.
1903 */
1904static inline void context_set_sm_pre(struct context_entry *context)
1905{
1906	context->lo |= (1 << 4);
1907}
1908
1909/* Convert value to context PASID directory size field coding. */
1910#define context_pdts(pds)	(((pds) & 0x7) << 9)
1911
1912static int domain_context_mapping_one(struct dmar_domain *domain,
1913				      struct intel_iommu *iommu,
1914				      struct pasid_table *table,
1915				      u8 bus, u8 devfn)
1916{
1917	struct device_domain_info *info =
1918			domain_lookup_dev_info(domain, iommu, bus, devfn);
1919	u16 did = domain_id_iommu(domain, iommu);
1920	int translation = CONTEXT_TT_MULTI_LEVEL;
 
1921	struct context_entry *context;
 
1922	int ret;
1923
1924	WARN_ON(did == 0);
1925
1926	if (hw_pass_through && domain_type_is_si(domain))
1927		translation = CONTEXT_TT_PASS_THROUGH;
1928
1929	pr_debug("Set context mapping for %02x:%02x.%d\n",
1930		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931
1932	BUG_ON(!domain->pgd);
1933
 
1934	spin_lock(&iommu->lock);
 
1935	ret = -ENOMEM;
1936	context = iommu_context_addr(iommu, bus, devfn, 1);
1937	if (!context)
1938		goto out_unlock;
1939
1940	ret = 0;
1941	if (context_present(context) && !context_copied(iommu, bus, devfn))
1942		goto out_unlock;
1943
1944	/*
1945	 * For kdump cases, old valid entries may be cached due to the
1946	 * in-flight DMA and copied pgtable, but there is no unmapping
1947	 * behaviour for them, thus we need an explicit cache flush for
1948	 * the newly-mapped device. For kdump, at this point, the device
1949	 * is supposed to finish reset at its driver probe stage, so no
1950	 * in-flight DMA will exist, and we don't need to worry anymore
1951	 * hereafter.
1952	 */
1953	if (context_copied(iommu, bus, devfn)) {
1954		u16 did_old = context_domain_id(context);
1955
1956		if (did_old < cap_ndoms(iommu->cap)) {
1957			iommu->flush.flush_context(iommu, did_old,
1958						   (((u16)bus) << 8) | devfn,
1959						   DMA_CCMD_MASK_NOBIT,
1960						   DMA_CCMD_DEVICE_INVL);
1961			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962						 DMA_TLB_DSI_FLUSH);
1963		}
1964
1965		clear_context_copied(iommu, bus, devfn);
1966	}
1967
1968	context_clear_entry(context);
1969
1970	if (sm_supported(iommu)) {
1971		unsigned long pds;
1972
1973		WARN_ON(!table);
1974
1975		/* Setup the PASID DIR pointer: */
1976		pds = context_get_sm_pds(table);
1977		context->lo = (u64)virt_to_phys(table->table) |
1978				context_pdts(pds);
1979
1980		/* Setup the RID_PASID field: */
1981		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982
1983		/*
1984		 * Setup the Device-TLB enable bit and Page request
1985		 * Enable bit:
1986		 */
 
1987		if (info && info->ats_supported)
1988			context_set_sm_dte(context);
1989		if (info && info->pri_supported)
1990			context_set_sm_pre(context);
1991		if (info && info->pasid_supported)
1992			context_set_pasid(context);
1993	} else {
1994		struct dma_pte *pgd = domain->pgd;
1995		int agaw;
1996
1997		context_set_domain_id(context, did);
1998
1999		if (translation != CONTEXT_TT_PASS_THROUGH) {
2000			/*
2001			 * Skip top levels of page tables for an IOMMU which has
2002			 * a smaller agaw than the default. Unnecessary for PT mode.
2003			 */
2004			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005				ret = -ENOMEM;
2006				pgd = phys_to_virt(dma_pte_addr(pgd));
2007				if (!dma_pte_present(pgd))
2008					goto out_unlock;
2009			}
2010
 
2011			if (info && info->ats_supported)
2012				translation = CONTEXT_TT_DEV_IOTLB;
2013			else
2014				translation = CONTEXT_TT_MULTI_LEVEL;
2015
2016			context_set_address_root(context, virt_to_phys(pgd));
2017			context_set_address_width(context, agaw);
2018		} else {
2019			/*
2020			 * In pass through mode, AW must be programmed to
2021			 * indicate the largest AGAW value supported by
2022			 * hardware. And ASR is ignored by hardware.
2023			 */
2024			context_set_address_width(context, iommu->msagaw);
2025		}
2026
2027		context_set_translation_type(context, translation);
2028	}
2029
2030	context_set_fault_enable(context);
2031	context_set_present(context);
2032	if (!ecap_coherent(iommu->ecap))
2033		clflush_cache_range(context, sizeof(*context));
2034
2035	/*
2036	 * It's a non-present to present mapping. If hardware doesn't cache
2037	 * non-present entries we only need to flush the write-buffer. If it
2038	 * _does_ cache non-present entries, then it does so in the special
2039	 * domain #0, which we have to flush:
2040	 */
2041	if (cap_caching_mode(iommu->cap)) {
2042		iommu->flush.flush_context(iommu, 0,
2043					   (((u16)bus) << 8) | devfn,
2044					   DMA_CCMD_MASK_NOBIT,
2045					   DMA_CCMD_DEVICE_INVL);
2046		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047	} else {
2048		iommu_flush_write_buffer(iommu);
2049	}
 
2050
2051	ret = 0;
2052
2053out_unlock:
2054	spin_unlock(&iommu->lock);
 
2055
2056	return ret;
2057}
2058
2059struct domain_context_mapping_data {
2060	struct dmar_domain *domain;
2061	struct intel_iommu *iommu;
2062	struct pasid_table *table;
2063};
2064
2065static int domain_context_mapping_cb(struct pci_dev *pdev,
2066				     u16 alias, void *opaque)
2067{
2068	struct domain_context_mapping_data *data = opaque;
2069
2070	return domain_context_mapping_one(data->domain, data->iommu,
2071					  data->table, PCI_BUS_NUM(alias),
2072					  alias & 0xff);
2073}
2074
2075static int
2076domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2077{
2078	struct domain_context_mapping_data data;
2079	struct pasid_table *table;
2080	struct intel_iommu *iommu;
2081	u8 bus, devfn;
2082
2083	iommu = device_to_iommu(dev, &bus, &devfn);
2084	if (!iommu)
2085		return -ENODEV;
2086
2087	table = intel_pasid_get_table(dev);
2088
2089	if (!dev_is_pci(dev))
2090		return domain_context_mapping_one(domain, iommu, table,
2091						  bus, devfn);
2092
2093	data.domain = domain;
2094	data.iommu = iommu;
2095	data.table = table;
2096
2097	return pci_for_each_dma_alias(to_pci_dev(dev),
2098				      &domain_context_mapping_cb, &data);
2099}
2100
2101	/* Return the number of VT-d pages needed, after aligning the range to the MM page size */
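/*
 * For illustration, assuming 4KiB pages on both sides: host_addr = 0x1234
 * and size = 0x2000 leave an offset of 0x234, PAGE_ALIGN(0x2234) = 0x3000,
 * so three VT-d pages are needed.
 */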
2102static inline unsigned long aligned_nrpages(unsigned long host_addr,
2103					    size_t size)
2104{
2105	host_addr &= ~PAGE_MASK;
2106	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2107}
2108
2109/* Return largest possible superpage level for a given mapping */
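/*
 * For example, if both iov_pfn and phy_pfn are 512-page (2MiB) aligned, at
 * least 512 pages are being mapped, and the hardware reports superpage
 * support (domain->iommu_superpage >= 1), this returns level 2 or higher.
 */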
2110static inline int hardware_largepage_caps(struct dmar_domain *domain,
2111					  unsigned long iov_pfn,
2112					  unsigned long phy_pfn,
2113					  unsigned long pages)
2114{
2115	int support, level = 1;
2116	unsigned long pfnmerge;
2117
2118	support = domain->iommu_superpage;
2119
2120	/* To use a large page, the virtual *and* physical addresses
2121	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2122	   of them will mean we have to use smaller pages. So just
2123	   merge them and check both at once. */
2124	pfnmerge = iov_pfn | phy_pfn;
2125
2126	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2127		pages >>= VTD_STRIDE_SHIFT;
2128		if (!pages)
2129			break;
2130		pfnmerge >>= VTD_STRIDE_SHIFT;
2131		level++;
2132		support--;
2133	}
2134	return level;
2135}
2136
2137/*
2138 * Ensure that old small page tables are removed to make room for superpage(s).
2139 * We're going to add new large pages, so make sure we don't remove their parent
2140 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2141 */
2142static void switch_to_super_page(struct dmar_domain *domain,
2143				 unsigned long start_pfn,
2144				 unsigned long end_pfn, int level)
2145{
2146	unsigned long lvl_pages = lvl_to_nr_pages(level);
2147	struct iommu_domain_info *info;
2148	struct dma_pte *pte = NULL;
2149	unsigned long i;
2150
2151	while (start_pfn <= end_pfn) {
2152		if (!pte)
2153			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2154
2155		if (dma_pte_present(pte)) {
2156			dma_pte_free_pagetable(domain, start_pfn,
2157					       start_pfn + lvl_pages - 1,
2158					       level + 1);
2159
2160			xa_for_each(&domain->iommu_array, i, info)
2161				iommu_flush_iotlb_psi(info->iommu, domain,
2162						      start_pfn, lvl_pages,
2163						      0, 0);
2164		}
2165
2166		pte++;
2167		start_pfn += lvl_pages;
2168		if (first_pte_in_page(pte))
2169			pte = NULL;
2170	}
2171}
2172
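/*
 * Map @nr_pages contiguous pages starting at @phys_pfn into the domain at
 * @iov_pfn with the given protection bits, using superpages whenever the
 * alignment and remaining length allow it.
 */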
2173static int
2174__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2175		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2176{
2177	struct dma_pte *first_pte = NULL, *pte = NULL;
2178	unsigned int largepage_lvl = 0;
2179	unsigned long lvl_pages = 0;
2180	phys_addr_t pteval;
2181	u64 attr;
2182
2183	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2184
2185	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2186		return -EINVAL;
2187
2188	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2189	attr |= DMA_FL_PTE_PRESENT;
2190	if (domain->use_first_level) {
2191		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2192		if (prot & DMA_PTE_WRITE)
2193			attr |= DMA_FL_PTE_DIRTY;
2194	}
2195
2196	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2197
2198	while (nr_pages > 0) {
2199		uint64_t tmp;
2200
2201		if (!pte) {
2202			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2203					phys_pfn, nr_pages);
2204
2205			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2206			if (!pte)
2207				return -ENOMEM;
2208			first_pte = pte;
2209
2210			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2211
2212			/* It is a large page */
2213			if (largepage_lvl > 1) {
2214				unsigned long end_pfn;
2215				unsigned long pages_to_remove;
2216
2217				pteval |= DMA_PTE_LARGE_PAGE;
2218				pages_to_remove = min_t(unsigned long, nr_pages,
2219							nr_pte_to_next_page(pte) * lvl_pages);
2220				end_pfn = iov_pfn + pages_to_remove - 1;
2221				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2222			} else {
2223				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2224			}
2225
2226		}
2227		/* We don't need a lock here; nobody else
2228		 * touches this IOVA range.
2229		 */
2230		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2231		if (tmp) {
2232			static int dumps = 5;
2233			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2234				iov_pfn, tmp, (unsigned long long)pteval);
2235			if (dumps) {
2236				dumps--;
2237				debug_dma_dump_mappings(NULL);
2238			}
2239			WARN_ON(1);
2240		}
2241
2242		nr_pages -= lvl_pages;
2243		iov_pfn += lvl_pages;
2244		phys_pfn += lvl_pages;
2245		pteval += lvl_pages * VTD_PAGE_SIZE;
 
2246
2247		/* If the next PTE would be the first in a new page, then we
2248		 * need to flush the cache on the entries we've just written.
2249		 * And then we'll need to recalculate 'pte', so clear it and
2250		 * let it get set again in the if (!pte) block above.
2251		 *
2252		 * If we're done (!nr_pages) we need to flush the cache too.
2253		 *
2254		 * Also if we've been setting superpages, we may need to
2255		 * recalculate 'pte' and switch back to smaller pages for the
2256		 * end of the mapping, if the trailing size is not enough to
2257		 * use another superpage (i.e. nr_pages < lvl_pages).
2258		 */
2259		pte++;
2260		if (!nr_pages || first_pte_in_page(pte) ||
2261		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2262			domain_flush_cache(domain, first_pte,
2263					   (void *)pte - (void *)first_pte);
2264			pte = NULL;
2265		}
2266	}
2267
2268	return 0;
2269}
2270
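/*
 * Tear down the context entry for @bus/@devfn and invalidate the caches
 * that may still reference it: the context cache, the PASID cache (in
 * scalable mode), the IOTLB and the device IOTLB.
 */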
2271static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2272{
2273	struct intel_iommu *iommu = info->iommu;
2274	struct context_entry *context;
2275	u16 did_old;
2276
2277	if (!iommu)
2278		return;
2279
2280	spin_lock(&iommu->lock);
2281	context = iommu_context_addr(iommu, bus, devfn, 0);
2282	if (!context) {
2283		spin_unlock(&iommu->lock);
2284		return;
2285	}
2286
2287	if (sm_supported(iommu)) {
2288		if (hw_pass_through && domain_type_is_si(info->domain))
2289			did_old = FLPT_DEFAULT_DID;
2290		else
2291			did_old = domain_id_iommu(info->domain, iommu);
2292	} else {
2293		did_old = context_domain_id(context);
2294	}
2295
2296	context_clear_entry(context);
2297	__iommu_flush_cache(iommu, context, sizeof(*context));
2298	spin_unlock(&iommu->lock);
2299	iommu->flush.flush_context(iommu,
2300				   did_old,
2301				   (((u16)bus) << 8) | devfn,
2302				   DMA_CCMD_MASK_NOBIT,
2303				   DMA_CCMD_DEVICE_INVL);
2304
2305	if (sm_supported(iommu))
2306		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2307
2308	iommu->flush.flush_iotlb(iommu,
2309				 did_old,
2310				 0,
2311				 0,
2312				 DMA_TLB_DSI_FLUSH);
2313
2314	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2315}
2316
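/*
 * Set up a scalable-mode first-level PASID entry for @dev/@pasid pointing
 * at the domain's page table, skipping top page-table levels when the
 * IOMMU supports fewer address bits than the domain.
 */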
2317static int domain_setup_first_level(struct intel_iommu *iommu,
2318				    struct dmar_domain *domain,
2319				    struct device *dev,
2320				    u32 pasid)
2321{
 
2322	struct dma_pte *pgd = domain->pgd;
2323	int agaw, level;
2324	int flags = 0;
2325
2326	/*
2327	 * Skip top levels of page tables for an IOMMU which has
2328	 * a smaller agaw than the default. Unnecessary for PT mode.
2329	 */
2330	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2331		pgd = phys_to_virt(dma_pte_addr(pgd));
2332		if (!dma_pte_present(pgd))
2333			return -ENOMEM;
2334	}
2335
2336	level = agaw_to_level(agaw);
2337	if (level != 4 && level != 5)
2338		return -EINVAL;
2339
2340	if (pasid != PASID_RID2PASID)
2341		flags |= PASID_FLAG_SUPERVISOR_MODE;
2342	if (level == 5)
2343		flags |= PASID_FLAG_FL5LP;
2344
2345	if (domain->force_snooping)
2346		flags |= PASID_FLAG_PAGE_SNOOP;
2347
2348	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2349					     domain_id_iommu(domain, iommu),
2350					     flags);
2351}
2352
2353static bool dev_is_real_dma_subdevice(struct device *dev)
2354{
2355	return dev && dev_is_pci(dev) &&
2356	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2357}
2358
2359static int iommu_domain_identity_map(struct dmar_domain *domain,
2360				     unsigned long first_vpfn,
2361				     unsigned long last_vpfn)
2362{
2363	/*
2364	 * The RMRR range might overlap with the physical memory range,
2365	 * so clear it first.
2366	 */
2367	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2368
2369	return __domain_mapping(domain, first_vpfn,
2370				first_vpfn, last_vpfn - first_vpfn + 1,
2371				DMA_PTE_READ|DMA_PTE_WRITE);
2372}
2373
2374static int md_domain_init(struct dmar_domain *domain, int guest_width);
2375
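/*
 * Build the static identity (si) domain: identity map every usable memory
 * range reported by memblock plus all RMRR regions. When hardware
 * pass-through is used (hw != 0), only the domain itself is set up and no
 * mappings are created.
 */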
2376static int __init si_domain_init(int hw)
2377{
2378	struct dmar_rmrr_unit *rmrr;
2379	struct device *dev;
2380	int i, nid, ret;
2381
2382	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2383	if (!si_domain)
2384		return -EFAULT;
2385
2386	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2387		domain_exit(si_domain);
2388		si_domain = NULL;
2389		return -EFAULT;
2390	}
2391
2392	if (hw)
2393		return 0;
2394
2395	for_each_online_node(nid) {
2396		unsigned long start_pfn, end_pfn;
2397		int i;
2398
2399		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2400			ret = iommu_domain_identity_map(si_domain,
2401					mm_to_dma_pfn(start_pfn),
2402					mm_to_dma_pfn(end_pfn));
2403			if (ret)
2404				return ret;
2405		}
2406	}
2407
2408	/*
2409	 * Identity map the RMRRs so that devices with RMRRs can also use
2410	 * the si_domain.
2411	 */
2412	for_each_rmrr_units(rmrr) {
2413		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2414					  i, dev) {
2415			unsigned long long start = rmrr->base_address;
2416			unsigned long long end = rmrr->end_address;
2417
2418			if (WARN_ON(end < start ||
2419				    end >> agaw_to_width(si_domain->agaw)))
2420				continue;
2421
2422			ret = iommu_domain_identity_map(si_domain,
2423					mm_to_dma_pfn(start >> PAGE_SHIFT),
2424					mm_to_dma_pfn(end >> PAGE_SHIFT));
2425			if (ret)
2426				return ret;
2427		}
2428	}
2429
2430	return 0;
2431}
2432
2433static int dmar_domain_attach_device(struct dmar_domain *domain,
2434				     struct device *dev)
2435{
2436	struct device_domain_info *info = dev_iommu_priv_get(dev);
2437	struct intel_iommu *iommu;
2438	unsigned long flags;
2439	u8 bus, devfn;
2440	int ret;
2441
2442	iommu = device_to_iommu(dev, &bus, &devfn);
2443	if (!iommu)
2444		return -ENODEV;
2445
2446	ret = domain_attach_iommu(domain, iommu);
2447	if (ret)
2448		return ret;
2449	info->domain = domain;
2450	spin_lock_irqsave(&domain->lock, flags);
2451	list_add(&info->link, &domain->devices);
2452	spin_unlock_irqrestore(&domain->lock, flags);
2453
2454	/* PASID table is mandatory for a PCI device in scalable mode. */
2455	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2456		/* Setup the PASID entry for requests without PASID: */
2457		if (hw_pass_through && domain_type_is_si(domain))
2458			ret = intel_pasid_setup_pass_through(iommu, domain,
2459					dev, PASID_RID2PASID);
2460		else if (domain->use_first_level)
2461			ret = domain_setup_first_level(iommu, domain, dev,
2462					PASID_RID2PASID);
2463		else
2464			ret = intel_pasid_setup_second_level(iommu, domain,
2465					dev, PASID_RID2PASID);
2466		if (ret) {
2467			dev_err(dev, "Setup RID2PASID failed\n");
2468			device_block_translation(dev);
2469			return ret;
2470		}
2471	}
2472
2473	ret = domain_context_mapping(domain, dev);
2474	if (ret) {
2475		dev_err(dev, "Domain context map failed\n");
2476		device_block_translation(dev);
2477		return ret;
2478	}
2479
2480	iommu_enable_pci_caps(info);
2481
2482	return 0;
2483}
2484
2485static bool device_has_rmrr(struct device *dev)
2486{
2487	struct dmar_rmrr_unit *rmrr;
2488	struct device *tmp;
2489	int i;
2490
2491	rcu_read_lock();
2492	for_each_rmrr_units(rmrr) {
2493		/*
2494		 * Return TRUE if this RMRR contains the device that
2495		 * is passed in.
2496		 */
2497		for_each_active_dev_scope(rmrr->devices,
2498					  rmrr->devices_cnt, i, tmp)
2499			if (tmp == dev ||
2500			    is_downstream_to_pci_bridge(dev, tmp)) {
2501				rcu_read_unlock();
2502				return true;
2503			}
2504	}
2505	rcu_read_unlock();
2506	return false;
2507}
2508
2509/**
2510 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2511 * is relaxable (ie. is allowed to be not enforced under some conditions)
2512 * @dev: device handle
2513 *
2514 * We assume that PCI USB devices with RMRRs have them largely
2515 * for historical reasons and that the RMRR space is not actively used post
2516 * boot.  This exclusion may change if vendors begin to abuse it.
2517 *
2518 * The same exception is made for graphics devices, with the requirement that
2519 * any use of the RMRR regions will be torn down before assigning the device
2520 * to a guest.
2521 *
2522 * Return: true if the RMRR is relaxable, false otherwise
2523 */
2524static bool device_rmrr_is_relaxable(struct device *dev)
2525{
2526	struct pci_dev *pdev;
2527
2528	if (!dev_is_pci(dev))
2529		return false;
2530
2531	pdev = to_pci_dev(dev);
2532	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2533		return true;
2534	else
2535		return false;
2536}
2537
2538/*
2539 * There are a couple of cases where we need to restrict the functionality of
2540 * devices associated with RMRRs.  The first is when evaluating a device for
2541 * identity mapping because problems exist when devices are moved in and out
2542 * of domains and their respective RMRR information is lost.  This means that
2543 * a device with associated RMRRs will never be in a "passthrough" domain.
2544 * The second is use of the device through the IOMMU API.  This interface
2545 * expects to have full control of the IOVA space for the device.  We cannot
2546 * satisfy both the requirement that RMRR access is maintained and have an
2547 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2548 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2549 * We therefore prevent devices associated with an RMRR from participating in
2550 * the IOMMU API, which eliminates them from device assignment.
2551 *
2552 * In both cases, devices which have relaxable RMRRs are not concerned by this
2553 * restriction. See device_rmrr_is_relaxable comment.
2554 */
2555static bool device_is_rmrr_locked(struct device *dev)
2556{
2557	if (!device_has_rmrr(dev))
2558		return false;
2559
2560	if (device_rmrr_is_relaxable(dev))
2561		return false;
2562
2563	return true;
2564}
2565
2566/*
2567 * Return the required default domain type for a specific device.
2568 *
2569 * @dev: the device to query
2571 *
2572 * Returns:
2573 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2574 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2575 *  - 0: both identity and dynamic domains work for this device
2576 */
2577static int device_def_domain_type(struct device *dev)
2578{
2579	if (dev_is_pci(dev)) {
2580		struct pci_dev *pdev = to_pci_dev(dev);
2581
2582		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2583			return IOMMU_DOMAIN_IDENTITY;
2584
2585		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2586			return IOMMU_DOMAIN_IDENTITY;
2587	}
2588
2589	return 0;
2590}
2591
2592static void intel_iommu_init_qi(struct intel_iommu *iommu)
2593{
2594	/*
2595	 * Start from a sane IOMMU hardware state.
2596	 * If queued invalidation was already initialized by us
2597	 * (for example, while enabling interrupt remapping) then
2598	 * things are already rolling from a sane state.
2599	 */
2600	if (!iommu->qi) {
2601		/*
2602		 * Clear any previous faults.
2603		 */
2604		dmar_fault(-1, iommu);
2605		/*
2606		 * Disable queued invalidation if supported and already enabled
2607		 * before OS handover.
2608		 */
2609		dmar_disable_qi(iommu);
2610	}
2611
2612	if (dmar_enable_qi(iommu)) {
2613		/*
2614		 * Queued Invalidate not enabled, use Register Based Invalidate
2615		 */
2616		iommu->flush.flush_context = __iommu_flush_context;
2617		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2618		pr_info("%s: Using Register based invalidation\n",
2619			iommu->name);
2620	} else {
2621		iommu->flush.flush_context = qi_flush_context;
2622		iommu->flush.flush_iotlb = qi_flush_iotlb;
2623		pr_info("%s: Using Queued invalidation\n", iommu->name);
2624	}
2625}
2626
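/*
 * Copy one bus worth of context entries from the old (pre-kexec) context
 * table into a newly allocated one. Domain IDs found in present entries
 * are reserved, and the entries are marked as copied so that later context
 * programming knows an explicit cache flush is needed.
 */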
2627static int copy_context_table(struct intel_iommu *iommu,
2628			      struct root_entry *old_re,
2629			      struct context_entry **tbl,
2630			      int bus, bool ext)
2631{
2632	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2633	struct context_entry *new_ce = NULL, ce;
2634	struct context_entry *old_ce = NULL;
2635	struct root_entry re;
2636	phys_addr_t old_ce_phys;
2637
2638	tbl_idx = ext ? bus * 2 : bus;
2639	memcpy(&re, old_re, sizeof(re));
2640
2641	for (devfn = 0; devfn < 256; devfn++) {
2642		/* First calculate the correct index */
2643		idx = (ext ? devfn * 2 : devfn) % 256;
2644
2645		if (idx == 0) {
2646			/* First save what we may have and clean up */
2647			if (new_ce) {
2648				tbl[tbl_idx] = new_ce;
2649				__iommu_flush_cache(iommu, new_ce,
2650						    VTD_PAGE_SIZE);
2651				pos = 1;
2652			}
2653
2654			if (old_ce)
2655				memunmap(old_ce);
2656
2657			ret = 0;
2658			if (devfn < 0x80)
2659				old_ce_phys = root_entry_lctp(&re);
2660			else
2661				old_ce_phys = root_entry_uctp(&re);
2662
2663			if (!old_ce_phys) {
2664				if (ext && devfn == 0) {
2665					/* No LCTP, try UCTP */
2666					devfn = 0x7f;
2667					continue;
2668				} else {
2669					goto out;
2670				}
2671			}
2672
2673			ret = -ENOMEM;
2674			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2675					MEMREMAP_WB);
2676			if (!old_ce)
2677				goto out;
2678
2679			new_ce = alloc_pgtable_page(iommu->node);
2680			if (!new_ce)
2681				goto out_unmap;
2682
2683			ret = 0;
2684		}
2685
2686		/* Now copy the context entry */
2687		memcpy(&ce, old_ce + idx, sizeof(ce));
2688
2689		if (!context_present(&ce))
2690			continue;
2691
2692		did = context_domain_id(&ce);
2693		if (did >= 0 && did < cap_ndoms(iommu->cap))
2694			set_bit(did, iommu->domain_ids);
2695
2696		set_context_copied(iommu, bus, devfn);
2697		new_ce[idx] = ce;
2698	}
2699
2700	tbl[tbl_idx + pos] = new_ce;
2701
2702	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2703
2704out_unmap:
2705	memunmap(old_ce);
2706
2707out:
2708	return ret;
2709}
2710
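/*
 * In a kdump kernel, copy the context tables left behind by the previous
 * kernel into freshly allocated tables and hook them into the new root
 * entries, so that mappings set up before the crash stay valid for
 * in-flight DMA until the relevant drivers take over.
 */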
2711static int copy_translation_tables(struct intel_iommu *iommu)
2712{
2713	struct context_entry **ctxt_tbls;
2714	struct root_entry *old_rt;
2715	phys_addr_t old_rt_phys;
2716	int ctxt_table_entries;
 
2717	u64 rtaddr_reg;
2718	int bus, ret;
2719	bool new_ext, ext;
2720
2721	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2722	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2723	new_ext    = !!sm_supported(iommu);
2724
2725	/*
2726	 * The RTT bit can only be changed when translation is disabled,
2727	 * but disabling translation would open a window for data
2728	 * corruption. So bail out and don't copy anything if we would
2729	 * have to change the bit.
2730	 */
2731	if (new_ext != ext)
2732		return -EINVAL;
2733
2734	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2735	if (!iommu->copied_tables)
2736		return -ENOMEM;
2737
2738	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2739	if (!old_rt_phys)
2740		return -EINVAL;
2741
2742	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2743	if (!old_rt)
2744		return -ENOMEM;
2745
2746	/* This is too big for the stack - allocate it from slab */
2747	ctxt_table_entries = ext ? 512 : 256;
2748	ret = -ENOMEM;
2749	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2750	if (!ctxt_tbls)
2751		goto out_unmap;
2752
2753	for (bus = 0; bus < 256; bus++) {
2754		ret = copy_context_table(iommu, &old_rt[bus],
2755					 ctxt_tbls, bus, ext);
2756		if (ret) {
2757			pr_err("%s: Failed to copy context table for bus %d\n",
2758				iommu->name, bus);
2759			continue;
2760		}
2761	}
2762
2763	spin_lock(&iommu->lock);
2764
2765	/* Context tables are copied, now write them to the root_entry table */
2766	for (bus = 0; bus < 256; bus++) {
2767		int idx = ext ? bus * 2 : bus;
2768		u64 val;
2769
2770		if (ctxt_tbls[idx]) {
2771			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2772			iommu->root_entry[bus].lo = val;
2773		}
2774
2775		if (!ext || !ctxt_tbls[idx + 1])
2776			continue;
2777
2778		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2779		iommu->root_entry[bus].hi = val;
2780	}
2781
2782	spin_unlock(&iommu->lock);
2783
2784	kfree(ctxt_tbls);
2785
2786	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2787
2788	ret = 0;
2789
2790out_unmap:
2791	memunmap(old_rt);
2792
2793	return ret;
2794}
2795
2796#ifdef CONFIG_INTEL_IOMMU_SVM
2797static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2798{
2799	struct intel_iommu *iommu = data;
2800	ioasid_t ioasid;
2801
2802	if (!iommu)
2803		return INVALID_IOASID;
2804	/*
2805	 * The VT-d virtual command interface always uses the full 20-bit
2806	 * PASID range. The host can partition the guest PASID range based
2807	 * on policy, but that is out of the guest's control.
2808	 */
2809	if (min < PASID_MIN || max > intel_pasid_max_id)
2810		return INVALID_IOASID;
2811
2812	if (vcmd_alloc_pasid(iommu, &ioasid))
2813		return INVALID_IOASID;
2814
2815	return ioasid;
2816}
2817
2818static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2819{
2820	struct intel_iommu *iommu = data;
2821
2822	if (!iommu)
2823		return;
2824	/*
2825	 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
2826	 * We can only free the PASID when all the devices are unbound.
2827	 */
2828	if (ioasid_find(NULL, ioasid, NULL)) {
2829		pr_alert("Cannot free active IOASID %d\n", ioasid);
2830		return;
2831	}
2832	vcmd_free_pasid(iommu, ioasid);
2833}
2834
2835static void register_pasid_allocator(struct intel_iommu *iommu)
2836{
2837	/*
2838	 * If we are running in the host, there is no need for a custom
2839	 * allocator because PASIDs are allocated host system-wide.
2840	 */
2841	if (!cap_caching_mode(iommu->cap))
2842		return;
2843
2844	if (!sm_supported(iommu)) {
2845		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2846		return;
2847	}
2848
2849	/*
2850	 * Register a custom PASID allocator if we are running in a guest;
2851	 * guest PASIDs must be obtained via the virtual command interface.
2852	 * There can be multiple vIOMMUs in each guest but only one allocator
2853	 * is active. All vIOMMU allocators eventually call the same
2854	 * host allocator.
2855	 */
2856	if (!vccap_pasid(iommu->vccap))
2857		return;
2858
2859	pr_info("Register custom PASID allocator\n");
2860	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2861	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2862	iommu->pasid_allocator.pdata = (void *)iommu;
2863	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2864		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2865		/*
2866		 * Disable scalable mode on this IOMMU if there
2867		 * is no custom allocator. Mixing SM-capable and
2868		 * non-SM vIOMMUs is not supported.
2869		 */
2870		intel_iommu_sm = 0;
2871	}
2872}
2873#endif
2874
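/*
 * init_dmars() - initialize every DMAR unit found in the DMAR table: set
 * up queued invalidation, domain ID tracking and root entries (copying the
 * old translation tables when a pre-enabled IOMMU is found in a kdump
 * kernel), create the static identity domain, and finally enable the page
 * request queue and fault interrupt on each IOMMU.
 */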
2875static int __init init_dmars(void)
2876{
2877	struct dmar_drhd_unit *drhd;
2878	struct intel_iommu *iommu;
2879	int ret;
2880
2881	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2882	if (ret)
2883		goto free_iommu;
2884
2885	for_each_iommu(iommu, drhd) {
2886		if (drhd->ignored) {
2887			iommu_disable_translation(iommu);
2888			continue;
2889		}
2890
2891		/*
2892		 * Find the max PASID size of all IOMMUs in the system.
2893		 * We need to ensure the system PASID table is no bigger
2894		 * than the smallest supported size.
2895		 */
2896		if (pasid_supported(iommu)) {
2897			u32 temp = 2 << ecap_pss(iommu->ecap);
2898
2899			intel_pasid_max_id = min_t(u32, temp,
2900						   intel_pasid_max_id);
2901		}
2902
2903		intel_iommu_init_qi(iommu);
2904
2905		ret = iommu_init_domains(iommu);
2906		if (ret)
2907			goto free_iommu;
2908
2909		init_translation_status(iommu);
2910
2911		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2912			iommu_disable_translation(iommu);
2913			clear_translation_pre_enabled(iommu);
2914			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2915				iommu->name);
2916		}
2917
2918		/*
2919		 * TBD:
2920		 * we could share the same root & context tables
2921		 * among all IOMMUs. Needs to be split out later.
2922		 */
2923		ret = iommu_alloc_root_entry(iommu);
2924		if (ret)
2925			goto free_iommu;
2926
2927		if (translation_pre_enabled(iommu)) {
2928			pr_info("Translation already enabled - trying to copy translation structures\n");
2929
2930			ret = copy_translation_tables(iommu);
2931			if (ret) {
2932				/*
2933				 * We found the IOMMU with translation
2934				 * enabled - but failed to copy over the
2935				 * old root-entry table. Try to proceed
2936				 * by disabling translation now and
2937				 * allocating a clean root-entry table.
2938				 * This might cause DMAR faults, but
2939				 * probably the dump will still succeed.
2940				 */
2941				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2942				       iommu->name);
2943				iommu_disable_translation(iommu);
2944				clear_translation_pre_enabled(iommu);
2945			} else {
2946				pr_info("Copied translation tables from previous kernel for %s\n",
2947					iommu->name);
2948			}
2949		}
2950
2951		if (!ecap_pass_through(iommu->ecap))
2952			hw_pass_through = 0;
2953		intel_svm_check(iommu);
2954	}
2955
2956	/*
2957	 * Now that QI is enabled on all IOMMUs, set the root entry and flush
2958	 * caches. This is required on some Intel X58 chipsets, otherwise the
2959	 * flush_context function will loop forever and the boot hangs.
2960	 */
2961	for_each_active_iommu(iommu, drhd) {
2962		iommu_flush_write_buffer(iommu);
2963#ifdef CONFIG_INTEL_IOMMU_SVM
2964		register_pasid_allocator(iommu);
2965#endif
2966		iommu_set_root_entry(iommu);
2967	}
2968
2969#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2970	dmar_map_gfx = 0;
2971#endif
2972
2973	if (!dmar_map_gfx)
2974		iommu_identity_mapping |= IDENTMAP_GFX;
2975
2976	check_tylersburg_isoch();
2977
2978	ret = si_domain_init(hw_pass_through);
2979	if (ret)
2980		goto free_iommu;
2981
2982	/*
2983	 * for each drhd
2984	 *   enable fault log
2985	 *   global invalidate context cache
2986	 *   global invalidate iotlb
2987	 *   enable translation
2988	 */
2989	for_each_iommu(iommu, drhd) {
2990		if (drhd->ignored) {
2991			/*
2992			 * we always have to disable PMRs or DMA may fail on
2993			 * this device
2994			 */
2995			if (force_on)
2996				iommu_disable_protect_mem_regions(iommu);
2997			continue;
2998		}
2999
3000		iommu_flush_write_buffer(iommu);
3001
3002#ifdef CONFIG_INTEL_IOMMU_SVM
3003		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3004			/*
3005			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3006			 * could cause possible lock race condition.
3007			 */
3008			up_write(&dmar_global_lock);
3009			ret = intel_svm_enable_prq(iommu);
3010			down_write(&dmar_global_lock);
3011			if (ret)
3012				goto free_iommu;
3013		}
3014#endif
3015		ret = dmar_set_interrupt(iommu);
3016		if (ret)
3017			goto free_iommu;
3018	}
3019
3020	return 0;
3021
3022free_iommu:
3023	for_each_active_iommu(iommu, drhd) {
3024		disable_dmar_iommu(iommu);
3025		free_dmar_iommu(iommu);
3026	}
3027	if (si_domain) {
3028		domain_exit(si_domain);
3029		si_domain = NULL;
3030	}
3031
3032	return ret;
3033}
3034
3035static void __init init_no_remapping_devices(void)
3036{
3037	struct dmar_drhd_unit *drhd;
3038	struct device *dev;
3039	int i;
3040
3041	for_each_drhd_unit(drhd) {
3042		if (!drhd->include_all) {
3043			for_each_active_dev_scope(drhd->devices,
3044						  drhd->devices_cnt, i, dev)
3045				break;
3046			/* ignore DMAR unit if no devices exist */
3047			if (i == drhd->devices_cnt)
3048				drhd->ignored = 1;
3049		}
3050	}
3051
3052	for_each_active_drhd_unit(drhd) {
3053		if (drhd->include_all)
3054			continue;
3055
3056		for_each_active_dev_scope(drhd->devices,
3057					  drhd->devices_cnt, i, dev)
3058			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3059				break;
3060		if (i < drhd->devices_cnt)
3061			continue;
3062
3063		/* This IOMMU has *only* gfx devices. Either bypass it or
3064		   set the gfx_dedicated flag, as appropriate */
3065		drhd->gfx_dedicated = 1;
3066		if (!dmar_map_gfx)
3067			drhd->ignored = 1;
3068	}
3069}
3070
3071#ifdef CONFIG_SUSPEND
3072static int init_iommu_hw(void)
3073{
3074	struct dmar_drhd_unit *drhd;
3075	struct intel_iommu *iommu = NULL;
3076
3077	for_each_active_iommu(iommu, drhd)
3078		if (iommu->qi)
3079			dmar_reenable_qi(iommu);
3080
3081	for_each_iommu(iommu, drhd) {
3082		if (drhd->ignored) {
3083			/*
3084			 * we always have to disable PMRs or DMA may fail on
3085			 * this device
3086			 */
3087			if (force_on)
3088				iommu_disable_protect_mem_regions(iommu);
3089			continue;
3090		}
3091
3092		iommu_flush_write_buffer(iommu);
 
3093		iommu_set_root_entry(iommu);
3094		iommu_enable_translation(iommu);
3095		iommu_disable_protect_mem_regions(iommu);
3096	}
3097
3098	return 0;
3099}
3100
3101static void iommu_flush_all(void)
3102{
3103	struct dmar_drhd_unit *drhd;
3104	struct intel_iommu *iommu;
3105
3106	for_each_active_iommu(iommu, drhd) {
3107		iommu->flush.flush_context(iommu, 0, 0, 0,
3108					   DMA_CCMD_GLOBAL_INVL);
3109		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3110					 DMA_TLB_GLOBAL_FLUSH);
3111	}
3112}
3113
3114static int iommu_suspend(void)
3115{
3116	struct dmar_drhd_unit *drhd;
3117	struct intel_iommu *iommu = NULL;
3118	unsigned long flag;
3119
3120	for_each_active_iommu(iommu, drhd) {
3121		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3122					     GFP_KERNEL);
3123		if (!iommu->iommu_state)
3124			goto nomem;
3125	}
3126
3127	iommu_flush_all();
3128
3129	for_each_active_iommu(iommu, drhd) {
3130		iommu_disable_translation(iommu);
3131
3132		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3133
3134		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3135			readl(iommu->reg + DMAR_FECTL_REG);
3136		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3137			readl(iommu->reg + DMAR_FEDATA_REG);
3138		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3139			readl(iommu->reg + DMAR_FEADDR_REG);
3140		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3141			readl(iommu->reg + DMAR_FEUADDR_REG);
3142
3143		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3144	}
3145	return 0;
3146
3147nomem:
3148	for_each_active_iommu(iommu, drhd)
3149		kfree(iommu->iommu_state);
3150
3151	return -ENOMEM;
3152}
3153
3154static void iommu_resume(void)
3155{
3156	struct dmar_drhd_unit *drhd;
3157	struct intel_iommu *iommu = NULL;
3158	unsigned long flag;
3159
3160	if (init_iommu_hw()) {
3161		if (force_on)
3162			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3163		else
3164			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3165		return;
3166	}
3167
3168	for_each_active_iommu(iommu, drhd) {
3169
3170		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3171
3172		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3173			iommu->reg + DMAR_FECTL_REG);
3174		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3175			iommu->reg + DMAR_FEDATA_REG);
3176		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3177			iommu->reg + DMAR_FEADDR_REG);
3178		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3179			iommu->reg + DMAR_FEUADDR_REG);
3180
3181		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3182	}
3183
3184	for_each_active_iommu(iommu, drhd)
3185		kfree(iommu->iommu_state);
3186}
3187
3188static struct syscore_ops iommu_syscore_ops = {
3189	.resume		= iommu_resume,
3190	.suspend	= iommu_suspend,
3191};
3192
3193static void __init init_iommu_pm_ops(void)
3194{
3195	register_syscore_ops(&iommu_syscore_ops);
3196}
3197
3198#else
3199static inline void init_iommu_pm_ops(void) {}
3200	#endif	/* CONFIG_SUSPEND */
3201
3202static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3203{
3204	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3205	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3206	    rmrr->end_address <= rmrr->base_address ||
3207	    arch_rmrr_sanity_check(rmrr))
3208		return -EINVAL;
3209
3210	return 0;
3211}
3212
3213int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3214{
3215	struct acpi_dmar_reserved_memory *rmrr;
3216	struct dmar_rmrr_unit *rmrru;
3217
3218	rmrr = (struct acpi_dmar_reserved_memory *)header;
3219	if (rmrr_sanity_check(rmrr)) {
3220		pr_warn(FW_BUG
3221			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3222			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3223			   rmrr->base_address, rmrr->end_address,
3224			   dmi_get_system_info(DMI_BIOS_VENDOR),
3225			   dmi_get_system_info(DMI_BIOS_VERSION),
3226			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3227		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3228	}
3229
3230	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3231	if (!rmrru)
3232		goto out;
3233
3234	rmrru->hdr = header;
3235
3236	rmrru->base_address = rmrr->base_address;
3237	rmrru->end_address = rmrr->end_address;
3238
3239	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3240				((void *)rmrr) + rmrr->header.length,
3241				&rmrru->devices_cnt);
3242	if (rmrru->devices_cnt && rmrru->devices == NULL)
3243		goto free_rmrru;
3244
3245	list_add(&rmrru->list, &dmar_rmrr_units);
3246
3247	return 0;
3248free_rmrru:
3249	kfree(rmrru);
3250out:
3251	return -ENOMEM;
3252}
3253
3254static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3255{
3256	struct dmar_atsr_unit *atsru;
3257	struct acpi_dmar_atsr *tmp;
3258
3259	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3260				dmar_rcu_check()) {
3261		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3262		if (atsr->segment != tmp->segment)
3263			continue;
3264		if (atsr->header.length != tmp->header.length)
3265			continue;
3266		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3267			return atsru;
3268	}
3269
3270	return NULL;
3271}
3272
3273int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3274{
3275	struct acpi_dmar_atsr *atsr;
3276	struct dmar_atsr_unit *atsru;
3277
3278	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3279		return 0;
3280
3281	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3282	atsru = dmar_find_atsr(atsr);
3283	if (atsru)
3284		return 0;
3285
3286	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3287	if (!atsru)
3288		return -ENOMEM;
3289
3290	/*
3291	 * If memory is allocated from slab by ACPI _DSM method, we need to
3292	 * copy the memory content because the memory buffer will be freed
3293	 * on return.
3294	 */
3295	atsru->hdr = (void *)(atsru + 1);
3296	memcpy(atsru->hdr, hdr, hdr->length);
3297	atsru->include_all = atsr->flags & 0x1;
3298	if (!atsru->include_all) {
3299		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3300				(void *)atsr + atsr->header.length,
3301				&atsru->devices_cnt);
3302		if (atsru->devices_cnt && atsru->devices == NULL) {
3303			kfree(atsru);
3304			return -ENOMEM;
3305		}
3306	}
3307
3308	list_add_rcu(&atsru->list, &dmar_atsr_units);
3309
3310	return 0;
3311}
3312
3313static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3314{
3315	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3316	kfree(atsru);
3317}
3318
3319int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3320{
3321	struct acpi_dmar_atsr *atsr;
3322	struct dmar_atsr_unit *atsru;
3323
3324	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3325	atsru = dmar_find_atsr(atsr);
3326	if (atsru) {
3327		list_del_rcu(&atsru->list);
3328		synchronize_rcu();
3329		intel_iommu_free_atsr(atsru);
3330	}
3331
3332	return 0;
3333}
3334
3335int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3336{
3337	int i;
3338	struct device *dev;
3339	struct acpi_dmar_atsr *atsr;
3340	struct dmar_atsr_unit *atsru;
3341
3342	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3343	atsru = dmar_find_atsr(atsr);
3344	if (!atsru)
3345		return 0;
3346
3347	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3348		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3349					  i, dev)
3350			return -EBUSY;
3351	}
3352
3353	return 0;
3354}
3355
3356static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3357{
3358	struct dmar_satc_unit *satcu;
3359	struct acpi_dmar_satc *tmp;
3360
3361	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3362				dmar_rcu_check()) {
3363		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3364		if (satc->segment != tmp->segment)
3365			continue;
3366		if (satc->header.length != tmp->header.length)
3367			continue;
3368		if (memcmp(satc, tmp, satc->header.length) == 0)
3369			return satcu;
3370	}
3371
3372	return NULL;
3373}
3374
3375int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3376{
3377	struct acpi_dmar_satc *satc;
3378	struct dmar_satc_unit *satcu;
3379
3380	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3381		return 0;
3382
3383	satc = container_of(hdr, struct acpi_dmar_satc, header);
3384	satcu = dmar_find_satc(satc);
3385	if (satcu)
3386		return 0;
3387
3388	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3389	if (!satcu)
3390		return -ENOMEM;
3391
3392	satcu->hdr = (void *)(satcu + 1);
3393	memcpy(satcu->hdr, hdr, hdr->length);
3394	satcu->atc_required = satc->flags & 0x1;
3395	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3396					      (void *)satc + satc->header.length,
3397					      &satcu->devices_cnt);
3398	if (satcu->devices_cnt && !satcu->devices) {
3399		kfree(satcu);
3400		return -ENOMEM;
3401	}
3402	list_add_rcu(&satcu->list, &dmar_satc_units);
3403
3404	return 0;
3405}
3406
3407static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3408{
3409	int sp, ret;
3410	struct intel_iommu *iommu = dmaru->iommu;
3411
3412	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3413	if (ret)
3414		goto out;
3415
3416	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3417		pr_warn("%s: Doesn't support hardware pass through.\n",
3418			iommu->name);
3419		return -ENXIO;
3420	}
3421
3422	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3423	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3424		pr_warn("%s: Doesn't support large page.\n",
3425			iommu->name);
3426		return -ENXIO;
3427	}
3428
3429	/*
3430	 * Disable translation if already enabled prior to OS handover.
3431	 */
3432	if (iommu->gcmd & DMA_GCMD_TE)
3433		iommu_disable_translation(iommu);
3434
 
3435	ret = iommu_init_domains(iommu);
3436	if (ret == 0)
3437		ret = iommu_alloc_root_entry(iommu);
3438	if (ret)
3439		goto out;
3440
3441	intel_svm_check(iommu);
3442
3443	if (dmaru->ignored) {
3444		/*
3445		 * we always have to disable PMRs or DMA may fail on this device
3446		 */
3447		if (force_on)
3448			iommu_disable_protect_mem_regions(iommu);
3449		return 0;
3450	}
3451
3452	intel_iommu_init_qi(iommu);
3453	iommu_flush_write_buffer(iommu);
3454
3455#ifdef CONFIG_INTEL_IOMMU_SVM
3456	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457		ret = intel_svm_enable_prq(iommu);
3458		if (ret)
3459			goto disable_iommu;
3460	}
3461#endif
3462	ret = dmar_set_interrupt(iommu);
3463	if (ret)
3464		goto disable_iommu;
3465
3466	iommu_set_root_entry(iommu);
3467	iommu_enable_translation(iommu);
3468
3469	iommu_disable_protect_mem_regions(iommu);
3470	return 0;
3471
3472disable_iommu:
3473	disable_dmar_iommu(iommu);
3474out:
3475	free_dmar_iommu(iommu);
3476	return ret;
3477}
3478
3479int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3480{
3481	int ret = 0;
3482	struct intel_iommu *iommu = dmaru->iommu;
3483
3484	if (!intel_iommu_enabled)
3485		return 0;
3486	if (iommu == NULL)
3487		return -EINVAL;
3488
3489	if (insert) {
3490		ret = intel_iommu_add(dmaru);
3491	} else {
3492		disable_dmar_iommu(iommu);
3493		free_dmar_iommu(iommu);
3494	}
3495
3496	return ret;
3497}
3498
3499static void intel_iommu_free_dmars(void)
3500{
3501	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3502	struct dmar_atsr_unit *atsru, *atsr_n;
3503	struct dmar_satc_unit *satcu, *satc_n;
3504
3505	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3506		list_del(&rmrru->list);
3507		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3508		kfree(rmrru);
3509	}
3510
3511	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3512		list_del(&atsru->list);
3513		intel_iommu_free_atsr(atsru);
3514	}
3515	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3516		list_del(&satcu->list);
3517		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3518		kfree(satcu);
3519	}
3520}
3521
3522static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3523{
3524	struct dmar_satc_unit *satcu;
3525	struct acpi_dmar_satc *satc;
3526	struct device *tmp;
3527	int i;
3528
3529	dev = pci_physfn(dev);
3530	rcu_read_lock();
3531
3532	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3533		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3534		if (satc->segment != pci_domain_nr(dev->bus))
3535			continue;
3536		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3537			if (to_pci_dev(tmp) == dev)
3538				goto out;
3539	}
3540	satcu = NULL;
3541out:
3542	rcu_read_unlock();
3543	return satcu;
3544}
3545
3546static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3547{
3548	int i, ret = 1;
3549	struct pci_bus *bus;
3550	struct pci_dev *bridge = NULL;
3551	struct device *tmp;
3552	struct acpi_dmar_atsr *atsr;
3553	struct dmar_atsr_unit *atsru;
3554	struct dmar_satc_unit *satcu;
3555
3556	dev = pci_physfn(dev);
3557	satcu = dmar_find_matched_satc_unit(dev);
3558	if (satcu)
3559		/*
3560		 * This device supports ATS as it is in the SATC table.
3561		 * When the IOMMU is in legacy mode, ATS is enabled
3562		 * automatically by HW for devices that require it,
3563		 * hence the OS should not enable ATS on this device,
3564		 * to avoid duplicated TLB invalidations.
3565		 */
3566		return !(satcu->atc_required && !sm_supported(iommu));
3567
3568	for (bus = dev->bus; bus; bus = bus->parent) {
3569		bridge = bus->self;
3570		/* If it's an integrated device, allow ATS */
3571		if (!bridge)
3572			return 1;
3573		/* Connected via non-PCIe: no ATS */
3574		if (!pci_is_pcie(bridge) ||
3575		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3576			return 0;
3577		/* If we found the root port, look it up in the ATSR */
3578		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3579			break;
3580	}
3581
3582	rcu_read_lock();
3583	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3584		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3585		if (atsr->segment != pci_domain_nr(dev->bus))
3586			continue;
3587
3588		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3589			if (tmp == &bridge->dev)
3590				goto out;
3591
3592		if (atsru->include_all)
3593			goto out;
3594	}
3595	ret = 0;
3596out:
3597	rcu_read_unlock();
3598
3599	return ret;
3600}
3601
3602int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3603{
3604	int ret;
3605	struct dmar_rmrr_unit *rmrru;
3606	struct dmar_atsr_unit *atsru;
3607	struct dmar_satc_unit *satcu;
3608	struct acpi_dmar_atsr *atsr;
3609	struct acpi_dmar_reserved_memory *rmrr;
3610	struct acpi_dmar_satc *satc;
3611
3612	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3613		return 0;
3614
3615	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3616		rmrr = container_of(rmrru->hdr,
3617				    struct acpi_dmar_reserved_memory, header);
3618		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3619			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3620				((void *)rmrr) + rmrr->header.length,
3621				rmrr->segment, rmrru->devices,
3622				rmrru->devices_cnt);
3623			if (ret < 0)
3624				return ret;
3625		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3626			dmar_remove_dev_scope(info, rmrr->segment,
3627				rmrru->devices, rmrru->devices_cnt);
3628		}
3629	}
3630
3631	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3632		if (atsru->include_all)
3633			continue;
3634
3635		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3636		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3637			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3638					(void *)atsr + atsr->header.length,
3639					atsr->segment, atsru->devices,
3640					atsru->devices_cnt);
3641			if (ret > 0)
3642				break;
3643			else if (ret < 0)
3644				return ret;
3645		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3646			if (dmar_remove_dev_scope(info, atsr->segment,
3647					atsru->devices, atsru->devices_cnt))
3648				break;
3649		}
3650	}
3651	list_for_each_entry(satcu, &dmar_satc_units, list) {
3652		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3653		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3654			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3655					(void *)satc + satc->header.length,
3656					satc->segment, satcu->devices,
3657					satcu->devices_cnt);
3658			if (ret > 0)
3659				break;
3660			else if (ret < 0)
3661				return ret;
3662		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3663			if (dmar_remove_dev_scope(info, satc->segment,
3664					satcu->devices, satcu->devices_cnt))
3665				break;
3666		}
3667	}
3668
3669	return 0;
3670}
3671
3672static int intel_iommu_memory_notifier(struct notifier_block *nb,
3673				       unsigned long val, void *v)
3674{
3675	struct memory_notify *mhp = v;
3676	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3677	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3678			mhp->nr_pages - 1);
3679
3680	switch (val) {
3681	case MEM_GOING_ONLINE:
3682		if (iommu_domain_identity_map(si_domain,
3683					      start_vpfn, last_vpfn)) {
3684			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3685				start_vpfn, last_vpfn);
3686			return NOTIFY_BAD;
3687		}
3688		break;
3689
3690	case MEM_OFFLINE:
3691	case MEM_CANCEL_ONLINE:
3692		{
3693			struct dmar_drhd_unit *drhd;
3694			struct intel_iommu *iommu;
3695			LIST_HEAD(freelist);
3696
3697			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
 
3698
3699			rcu_read_lock();
3700			for_each_active_iommu(iommu, drhd)
3701				iommu_flush_iotlb_psi(iommu, si_domain,
3702					start_vpfn, mhp->nr_pages,
3703					list_empty(&freelist), 0);
3704			rcu_read_unlock();
3705			put_pages_list(&freelist);
3706		}
3707		break;
3708	}
3709
3710	return NOTIFY_OK;
3711}
3712
3713static struct notifier_block intel_iommu_memory_nb = {
3714	.notifier_call = intel_iommu_memory_notifier,
3715	.priority = 0
3716};
3717
 
3718static void intel_disable_iommus(void)
3719{
3720	struct intel_iommu *iommu = NULL;
3721	struct dmar_drhd_unit *drhd;
3722
3723	for_each_iommu(iommu, drhd)
3724		iommu_disable_translation(iommu);
3725}
3726
3727void intel_iommu_shutdown(void)
3728{
3729	struct dmar_drhd_unit *drhd;
3730	struct intel_iommu *iommu = NULL;
3731
3732	if (no_iommu || dmar_disabled)
3733		return;
3734
3735	down_write(&dmar_global_lock);
3736
3737	/* Disable PMRs explicitly here. */
3738	for_each_iommu(iommu, drhd)
3739		iommu_disable_protect_mem_regions(iommu);
3740
3741	/* Make sure the IOMMUs are switched off */
3742	intel_disable_iommus();
3743
3744	up_write(&dmar_global_lock);
3745}
3746
3747static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3748{
3749	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3750
3751	return container_of(iommu_dev, struct intel_iommu, iommu);
3752}
3753
3754static ssize_t version_show(struct device *dev,
3755			    struct device_attribute *attr, char *buf)
3756{
3757	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3758	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3759	return sprintf(buf, "%d:%d\n",
3760		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3761}
3762static DEVICE_ATTR_RO(version);
3763
3764static ssize_t address_show(struct device *dev,
3765			    struct device_attribute *attr, char *buf)
3766{
3767	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3768	return sprintf(buf, "%llx\n", iommu->reg_phys);
3769}
3770static DEVICE_ATTR_RO(address);
3771
3772static ssize_t cap_show(struct device *dev,
3773			struct device_attribute *attr, char *buf)
3774{
3775	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3776	return sprintf(buf, "%llx\n", iommu->cap);
3777}
3778static DEVICE_ATTR_RO(cap);
3779
3780static ssize_t ecap_show(struct device *dev,
3781			 struct device_attribute *attr, char *buf)
3782{
3783	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3784	return sprintf(buf, "%llx\n", iommu->ecap);
3785}
3786static DEVICE_ATTR_RO(ecap);
3787
3788static ssize_t domains_supported_show(struct device *dev,
3789				      struct device_attribute *attr, char *buf)
3790{
3791	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3793}
3794static DEVICE_ATTR_RO(domains_supported);
3795
3796static ssize_t domains_used_show(struct device *dev,
3797				 struct device_attribute *attr, char *buf)
3798{
3799	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3801						  cap_ndoms(iommu->cap)));
3802}
3803static DEVICE_ATTR_RO(domains_used);
3804
3805static struct attribute *intel_iommu_attrs[] = {
3806	&dev_attr_version.attr,
3807	&dev_attr_address.attr,
3808	&dev_attr_cap.attr,
3809	&dev_attr_ecap.attr,
3810	&dev_attr_domains_supported.attr,
3811	&dev_attr_domains_used.attr,
3812	NULL,
3813};
3814
3815static struct attribute_group intel_iommu_group = {
3816	.name = "intel-iommu",
3817	.attrs = intel_iommu_attrs,
3818};
3819
3820const struct attribute_group *intel_iommu_groups[] = {
3821	&intel_iommu_group,
3822	NULL,
3823};
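/*
 * These attributes are exposed through iommu_device_sysfs_add() below and
 * typically show up as read-only files such as
 * /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used} (path shown for illustration only).
 */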
3824
3825static inline bool has_external_pci(void)
3826{
3827	struct pci_dev *pdev = NULL;
3828
3829	for_each_pci_dev(pdev)
3830		if (pdev->external_facing) {
3831			pci_dev_put(pdev);
3832			return true;
3833		}
3834
3835	return false;
3836}
3837
3838static int __init platform_optin_force_iommu(void)
3839{
3840	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3841		return 0;
3842
3843	if (no_iommu || dmar_disabled)
3844		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3845
3846	/*
3847	 * If the Intel IOMMU is disabled by default, we will apply the
3848	 * identity map to all devices except those marked as untrusted.
3849	 */
3850	if (dmar_disabled)
3851		iommu_set_default_passthrough(false);
3852
3853	dmar_disabled = 0;
3854	no_iommu = 0;
3855
3856	return 1;
3857}
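/*
 * "Platform opt in" above refers to the opt-in flag that firmware sets in
 * the ACPI DMAR table (reported by dmar_platform_optin()): it asks the OS
 * to keep DMA protection enabled when externally facing PCI ports are
 * present, unless the user explicitly opted out (no_platform_optin) or no
 * externally facing PCI device is found.
 */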
3858
3859static int __init probe_acpi_namespace_devices(void)
3860{
3861	struct dmar_drhd_unit *drhd;
3862	/* To avoid a -Wunused-but-set-variable warning. */
3863	struct intel_iommu *iommu __maybe_unused;
3864	struct device *dev;
3865	int i, ret = 0;
3866
3867	for_each_active_iommu(iommu, drhd) {
3868		for_each_active_dev_scope(drhd->devices,
3869					  drhd->devices_cnt, i, dev) {
3870			struct acpi_device_physical_node *pn;
3871			struct iommu_group *group;
3872			struct acpi_device *adev;
3873
3874			if (dev->bus != &acpi_bus_type)
3875				continue;
3876
3877			adev = to_acpi_device(dev);
3878			mutex_lock(&adev->physical_node_lock);
3879			list_for_each_entry(pn,
3880					    &adev->physical_node_list, node) {
3881				group = iommu_group_get(pn->dev);
3882				if (group) {
3883					iommu_group_put(group);
3884					continue;
3885				}
3886
 
3887				ret = iommu_probe_device(pn->dev);
3888				if (ret)
3889					break;
3890			}
3891			mutex_unlock(&adev->physical_node_lock);
3892
3893			if (ret)
3894				return ret;
3895		}
3896	}
3897
3898	return 0;
3899}
3900
3901static __init int tboot_force_iommu(void)
3902{
3903	if (!tboot_enabled())
3904		return 0;
3905
3906	if (no_iommu || dmar_disabled)
3907		pr_warn("Forcing Intel-IOMMU to enabled\n");
3908
3909	dmar_disabled = 0;
3910	no_iommu = 0;
3911
3912	return 1;
3913}
3914
3915int __init intel_iommu_init(void)
3916{
3917	int ret = -ENODEV;
3918	struct dmar_drhd_unit *drhd;
3919	struct intel_iommu *iommu;
3920
3921	/*
3922	 * Intel IOMMU is required for a TXT/tboot launch or platform
3923	 * opt in, so enforce that.
3924	 */
3925	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3926		    platform_optin_force_iommu();
3927
3928	down_write(&dmar_global_lock);
3929	if (dmar_table_init()) {
3930		if (force_on)
3931			panic("tboot: Failed to initialize DMAR table\n");
3932		goto out_free_dmar;
3933	}
3934
3935	if (dmar_dev_scope_init() < 0) {
3936		if (force_on)
3937			panic("tboot: Failed to initialize DMAR device scope\n");
3938		goto out_free_dmar;
3939	}
3940
3941	up_write(&dmar_global_lock);
3942
3943	/*
3944	 * The bus notifier takes the dmar_global_lock, so lockdep will
3945	 * complain later when we register it under the lock.
3946	 */
3947	dmar_register_bus_notifier();
3948
3949	down_write(&dmar_global_lock);
3950
3951	if (!no_iommu)
3952		intel_iommu_debugfs_init();
3953
3954	if (no_iommu || dmar_disabled) {
3955		/*
3956		 * We exit the function here to ensure the IOMMU's remapping and
3957		 * mempool aren't set up, which means that the IOMMU's PMRs
3958		 * won't be disabled via the call to init_dmars(). So disable
3959		 * them explicitly here. The PMRs were set up by tboot prior to
3960		 * calling SENTER, but the kernel is expected to reset/tear
3961		 * them down.
3962		 */
3963		if (intel_iommu_tboot_noforce) {
3964			for_each_iommu(iommu, drhd)
3965				iommu_disable_protect_mem_regions(iommu);
3966		}
3967
3968		/*
3969		 * Make sure the IOMMUs are switched off, even when we
3970		 * boot into a kexec kernel and the previous kernel left
3971		 * them enabled
3972		 */
3973		intel_disable_iommus();
3974		goto out_free_dmar;
3975	}
3976
3977	if (list_empty(&dmar_rmrr_units))
3978		pr_info("No RMRR found\n");
3979
3980	if (list_empty(&dmar_atsr_units))
3981		pr_info("No ATSR found\n");
3982
3983	if (list_empty(&dmar_satc_units))
3984		pr_info("No SATC found\n");
3985
3986	init_no_remapping_devices();
3987
3988	ret = init_dmars();
3989	if (ret) {
3990		if (force_on)
3991			panic("tboot: Failed to initialize DMARs\n");
3992		pr_err("Initialization failed\n");
3993		goto out_free_dmar;
3994	}
3995	up_write(&dmar_global_lock);
3996
3997	init_iommu_pm_ops();
3998
3999	down_read(&dmar_global_lock);
4000	for_each_active_iommu(iommu, drhd) {
4001		/*
4002		 * The flush queue implementation does not perform
4003		 * page-selective invalidations that are required for efficient
4004		 * TLB flushes in virtual environments.  The benefit of batching
4005		 * is likely to be much lower than the overhead of synchronizing
4006		 * the virtual and physical IOMMU page-tables.
4007		 */
4008		if (cap_caching_mode(iommu->cap)) {
4009			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4010			iommu_set_dma_strict();
4011		}
4012		iommu_device_sysfs_add(&iommu->iommu, NULL,
4013				       intel_iommu_groups,
4014				       "%s", iommu->name);
4015		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
 
4016	}
4017	up_read(&dmar_global_lock);
4018
 
4019	if (si_domain && !hw_pass_through)
4020		register_memory_notifier(&intel_iommu_memory_nb);
4021
4022	down_read(&dmar_global_lock);
4023	if (probe_acpi_namespace_devices())
4024		pr_warn("ACPI name space devices didn't probe correctly\n");
4025
4026	/* Finally, we enable the DMA remapping hardware. */
4027	for_each_iommu(iommu, drhd) {
4028		if (!drhd->ignored && !translation_pre_enabled(iommu))
4029			iommu_enable_translation(iommu);
4030
4031		iommu_disable_protect_mem_regions(iommu);
4032	}
4033	up_read(&dmar_global_lock);
4034
4035	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4036
4037	intel_iommu_enabled = 1;
4038
4039	return 0;
4040
4041out_free_dmar:
4042	intel_iommu_free_dmars();
4043	up_write(&dmar_global_lock);
 
4044	return ret;
4045}
4046
4047static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4048{
4049	struct device_domain_info *info = opaque;
4050
4051	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4052	return 0;
4053}
4054
4055/*
4056 * NB - intel-iommu lacks any sort of reference counting for the users of
4057 * dependent devices.  If multiple endpoints have intersecting dependent
4058 * devices, unbinding the driver from any one of them will possibly leave
4059 * the others unable to operate.
4060 */
4061static void domain_context_clear(struct device_domain_info *info)
4062{
4063	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4064		return;
4065
4066	pci_for_each_dma_alias(to_pci_dev(info->dev),
4067			       &domain_context_clear_one_cb, info);
4068}
4069
4070static void dmar_remove_one_dev_info(struct device *dev)
4071{
4072	struct device_domain_info *info = dev_iommu_priv_get(dev);
4073	struct dmar_domain *domain = info->domain;
4074	struct intel_iommu *iommu = info->iommu;
4075	unsigned long flags;
4076
4077	if (!dev_is_real_dma_subdevice(info->dev)) {
4078		if (dev_is_pci(info->dev) && sm_supported(iommu))
4079			intel_pasid_tear_down_entry(iommu, info->dev,
4080					PASID_RID2PASID, false);
4081
4082		iommu_disable_pci_caps(info);
4083		domain_context_clear(info);
4084	}
4085
4086	spin_lock_irqsave(&domain->lock, flags);
4087	list_del(&info->link);
4088	spin_unlock_irqrestore(&domain->lock, flags);
4089
 
4090	domain_detach_iommu(domain, iommu);
4091	info->domain = NULL;
4092}
4093
4094/*
4095 * Clear the page table pointer in context or pasid table entries so that
4096 * all DMA requests without PASID from the device are blocked. If the page
4097 * table has been set, clean up the data structures.
4098 */
4099static void device_block_translation(struct device *dev)
4100{
4101	struct device_domain_info *info = dev_iommu_priv_get(dev);
4102	struct intel_iommu *iommu = info->iommu;
4103	unsigned long flags;
4104
4105	iommu_disable_pci_caps(info);
4106	if (!dev_is_real_dma_subdevice(dev)) {
4107		if (sm_supported(iommu))
4108			intel_pasid_tear_down_entry(iommu, dev,
4109						    PASID_RID2PASID, false);
4110		else
4111			domain_context_clear(info);
4112	}
4113
4114	if (!info->domain)
4115		return;
4116
4117	spin_lock_irqsave(&info->domain->lock, flags);
4118	list_del(&info->link);
4119	spin_unlock_irqrestore(&info->domain->lock, flags);
4120
4121	domain_detach_iommu(info->domain, iommu);
4122	info->domain = NULL;
4123}
4124
4125static int md_domain_init(struct dmar_domain *domain, int guest_width)
4126{
4127	int adjust_width;
4128
4129	/* calculate AGAW */
4130	domain->gaw = guest_width;
4131	adjust_width = guestwidth_to_adjustwidth(guest_width);
4132	domain->agaw = width_to_agaw(adjust_width);
4133
4134	domain->iommu_coherency = false;
 
4135	domain->iommu_superpage = 0;
4136	domain->max_addr = 0;
4137
4138	/* always allocate the top pgd */
4139	domain->pgd = alloc_pgtable_page(domain->nid);
4140	if (!domain->pgd)
4141		return -ENOMEM;
4142	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4143	return 0;
4144}
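/*
 * Worked example (assuming the usual 9-bit per-level stride): a guest
 * width of 48 bits adjusts to 48, which width_to_agaw() maps to AGAW 2,
 * i.e. a 4-level page table; the default 57-bit width yields AGAW 3 and a
 * 5-level table.
 */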
4145
4146static int blocking_domain_attach_dev(struct iommu_domain *domain,
4147				      struct device *dev)
4148{
4149	device_block_translation(dev);
4150	return 0;
4151}
4152
4153static struct iommu_domain blocking_domain = {
4154	.ops = &(const struct iommu_domain_ops) {
4155		.attach_dev	= blocking_domain_attach_dev,
4156		.free		= intel_iommu_domain_free
4157	}
4158};
4159
4160static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4161{
4162	struct dmar_domain *dmar_domain;
4163	struct iommu_domain *domain;
4164
4165	switch (type) {
4166	case IOMMU_DOMAIN_BLOCKED:
4167		return &blocking_domain;
4168	case IOMMU_DOMAIN_DMA:
4169	case IOMMU_DOMAIN_DMA_FQ:
4170	case IOMMU_DOMAIN_UNMANAGED:
4171		dmar_domain = alloc_domain(type);
4172		if (!dmar_domain) {
4173			pr_err("Can't allocate dmar_domain\n");
4174			return NULL;
4175		}
4176		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4177			pr_err("Domain initialization failed\n");
4178			domain_exit(dmar_domain);
4179			return NULL;
4180		}
4181
4182		domain = &dmar_domain->domain;
4183		domain->geometry.aperture_start = 0;
4184		domain->geometry.aperture_end   =
4185				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4186		domain->geometry.force_aperture = true;
4187
4188		return domain;
4189	case IOMMU_DOMAIN_IDENTITY:
4190		return &si_domain->domain;
4191	case IOMMU_DOMAIN_SVA:
4192		return intel_svm_domain_alloc();
4193	default:
4194		return NULL;
4195	}
4196
4197	return NULL;
4198}
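/*
 * Summary of the allocation above: IOMMU_DOMAIN_BLOCKED and
 * IOMMU_DOMAIN_IDENTITY reuse the static blocking_domain and si_domain
 * respectively, IOMMU_DOMAIN_SVA is handled by the SVM code, and only the
 * DMA, DMA_FQ and UNMANAGED types get a freshly initialized dmar_domain.
 */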
4199
4200static void intel_iommu_domain_free(struct iommu_domain *domain)
4201{
4202	if (domain != &si_domain->domain && domain != &blocking_domain)
4203		domain_exit(to_dmar_domain(domain));
4204}
4205
4206static int prepare_domain_attach_device(struct iommu_domain *domain,
4207					struct device *dev)
4208{
4209	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4210	struct intel_iommu *iommu;
4211	int addr_width;
4212
4213	iommu = device_to_iommu(dev, NULL, NULL);
4214	if (!iommu)
4215		return -ENODEV;
4216
4217	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4218		return -EINVAL;
4219
4220	/* check if this iommu agaw is sufficient for max mapped address */
4221	addr_width = agaw_to_width(iommu->agaw);
4222	if (addr_width > cap_mgaw(iommu->cap))
4223		addr_width = cap_mgaw(iommu->cap);
4224
4225	if (dmar_domain->max_addr > (1LL << addr_width))
4226		return -EINVAL;
4227	dmar_domain->gaw = addr_width;
4228
4229	/*
4230	 * Knock out extra levels of page tables if necessary
4231	 */
4232	while (iommu->agaw < dmar_domain->agaw) {
4233		struct dma_pte *pte;
4234
4235		pte = dmar_domain->pgd;
4236		if (dma_pte_present(pte)) {
4237			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
 
4238			free_pgtable_page(pte);
4239		}
4240		dmar_domain->agaw--;
4241	}
4242
4243	return 0;
4244}
4245
4246static int intel_iommu_attach_device(struct iommu_domain *domain,
4247				     struct device *dev)
4248{
4249	struct device_domain_info *info = dev_iommu_priv_get(dev);
4250	int ret;
4251
4252	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4253	    device_is_rmrr_locked(dev)) {
4254		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4255		return -EPERM;
4256	}
4257
4258	if (info->domain)
4259		device_block_translation(dev);
4260
4261	ret = prepare_domain_attach_device(domain, dev);
4262	if (ret)
4263		return ret;
4264
4265	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4266}
 
4267
4268static int intel_iommu_map(struct iommu_domain *domain,
4269			   unsigned long iova, phys_addr_t hpa,
4270			   size_t size, int iommu_prot, gfp_t gfp)
4271{
4272	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4273	u64 max_addr;
4274	int prot = 0;
 
4275
4276	if (iommu_prot & IOMMU_READ)
4277		prot |= DMA_PTE_READ;
4278	if (iommu_prot & IOMMU_WRITE)
4279		prot |= DMA_PTE_WRITE;
4280	if (dmar_domain->set_pte_snp)
4281		prot |= DMA_PTE_SNP;
4282
4283	max_addr = iova + size;
4284	if (dmar_domain->max_addr < max_addr) {
4285		u64 end;
4286
4287		/* check if minimum agaw is sufficient for mapped address */
4288		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4289		if (end < max_addr) {
4290			pr_err("%s: iommu width (%d) is not "
4291			       "sufficient for the mapped address (%llx)\n",
4292			       __func__, dmar_domain->gaw, max_addr);
4293			return -EFAULT;
4294		}
4295		dmar_domain->max_addr = max_addr;
4296	}
4297	/* Round up size to next multiple of PAGE_SIZE, if it and
4298	   the low bits of hpa would take us onto the next page */
4299	size = aligned_nrpages(hpa, size);
4300	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4301				hpa >> VTD_PAGE_SHIFT, size, prot);
4302}
4303
4304static int intel_iommu_map_pages(struct iommu_domain *domain,
4305				 unsigned long iova, phys_addr_t paddr,
4306				 size_t pgsize, size_t pgcount,
4307				 int prot, gfp_t gfp, size_t *mapped)
4308{
4309	unsigned long pgshift = __ffs(pgsize);
4310	size_t size = pgcount << pgshift;
4311	int ret;
4312
4313	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4314		return -EINVAL;
4315
4316	if (!IS_ALIGNED(iova | paddr, pgsize))
4317		return -EINVAL;
4318
4319	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4320	if (!ret && mapped)
4321		*mapped = size;
4322
4323	return ret;
4324}
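/*
 * Illustrative use through the generic IOMMU API (a minimal sketch, not
 * taken from this driver; core signatures can differ between kernel
 * versions):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, dev);
 *	iommu_map(dom, iova, paddr, SZ_4K, IOMMU_READ | IOMMU_WRITE);
 *
 * The core splits such a request into (pgsize, pgcount) chunks and calls
 * the map_pages() callback above; the size check there accepts 4K, 2M and
 * 1G chunks.
 */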
4325
4326static size_t intel_iommu_unmap(struct iommu_domain *domain,
4327				unsigned long iova, size_t size,
4328				struct iommu_iotlb_gather *gather)
4329{
4330	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 
4331	unsigned long start_pfn, last_pfn;
4332	int level = 0;
 
4333
4334	/* Cope with the horrid API, which requires us to unmap more than
4335	   the size argument if it happens to be a large-page mapping. */
4336	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4337
4338	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4339		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4340
4341	start_pfn = iova >> VTD_PAGE_SHIFT;
4342	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4343
4344	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4345
4346	if (dmar_domain->max_addr == iova + size)
4347		dmar_domain->max_addr = iova;
4348
4349	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4350
4351	return size;
4352}
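/*
 * Note that the unmap path above only tears down page-table entries and
 * collects the freed table pages in gather->freelist; the IOTLB
 * invalidation and the release of those pages are deferred to
 * intel_iommu_tlb_sync() below.
 */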
4353
4354static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4355				      unsigned long iova,
4356				      size_t pgsize, size_t pgcount,
4357				      struct iommu_iotlb_gather *gather)
4358{
4359	unsigned long pgshift = __ffs(pgsize);
4360	size_t size = pgcount << pgshift;
4361
4362	return intel_iommu_unmap(domain, iova, size, gather);
4363}
4364
4365static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4366				 struct iommu_iotlb_gather *gather)
4367{
4368	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4369	unsigned long iova_pfn = IOVA_PFN(gather->start);
4370	size_t size = gather->end - gather->start;
4371	struct iommu_domain_info *info;
4372	unsigned long start_pfn;
4373	unsigned long nrpages;
4374	unsigned long i;
4375
4376	nrpages = aligned_nrpages(gather->start, size);
4377	start_pfn = mm_to_dma_pfn(iova_pfn);
4378
4379	xa_for_each(&dmar_domain->iommu_array, i, info)
4380		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4381				      start_pfn, nrpages,
4382				      list_empty(&gather->freelist), 0);
4383
4384	put_pages_list(&gather->freelist);
4385}
4386
4387static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4388					    dma_addr_t iova)
4389{
4390	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4391	struct dma_pte *pte;
4392	int level = 0;
4393	u64 phys = 0;
4394
4395	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4396	if (pte && dma_pte_present(pte))
4397		phys = dma_pte_addr(pte) +
4398			(iova & (BIT_MASK(level_to_offset_bits(level) +
4399						VTD_PAGE_SHIFT) - 1));
4400
4401	return phys;
4402}
4403
4404static bool domain_support_force_snooping(struct dmar_domain *domain)
4405{
4406	struct device_domain_info *info;
4407	bool support = true;
 
4408
4409	assert_spin_locked(&domain->lock);
4410	list_for_each_entry(info, &domain->devices, link) {
4411		if (!ecap_sc_support(info->iommu->ecap)) {
4412			support = false;
4413			break;
4414		}
4415	}
 
4416
4417	return support;
4418}
4419
4420static void domain_set_force_snooping(struct dmar_domain *domain)
4421{
4422	struct device_domain_info *info;
4423
4424	assert_spin_locked(&domain->lock);
4425	/*
4426	 * The second-level page table supports per-PTE snoop control. The
4427	 * iommu_map() interface will handle this by setting the SNP bit.
4428	 */
4429	if (!domain->use_first_level) {
4430		domain->set_pte_snp = true;
4431		return;
4432	}
 
4433
4434	list_for_each_entry(info, &domain->devices, link)
4435		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4436						     PASID_RID2PASID);
4437}
4438
4439static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4440{
4441	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4442	unsigned long flags;
4443
4444	if (dmar_domain->force_snooping)
4445		return true;
4446
4447	spin_lock_irqsave(&dmar_domain->lock, flags);
4448	if (!domain_support_force_snooping(dmar_domain)) {
4449		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4450		return false;
4451	}
 
4452
4453	domain_set_force_snooping(dmar_domain);
4454	dmar_domain->force_snooping = true;
4455	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4456
4457	return true;
4458}
4459
4460static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4461{
4462	struct device_domain_info *info = dev_iommu_priv_get(dev);
4463
4464	switch (cap) {
4465	case IOMMU_CAP_CACHE_COHERENCY:
4466		return true;
4467	case IOMMU_CAP_INTR_REMAP:
4468		return irq_remapping_enabled == 1;
4469	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4470		return dmar_platform_optin();
4471	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4472		return ecap_sc_support(info->iommu->ecap);
4473	default:
4474		return false;
4475	}
4476}
4477
4478static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4479{
4480	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4481	struct device_domain_info *info;
4482	struct intel_iommu *iommu;
4483	u8 bus, devfn;
4484	int ret;
4485
4486	iommu = device_to_iommu(dev, &bus, &devfn);
4487	if (!iommu || !iommu->iommu.ops)
4488		return ERR_PTR(-ENODEV);
4489
4490	info = kzalloc(sizeof(*info), GFP_KERNEL);
4491	if (!info)
4492		return ERR_PTR(-ENOMEM);
4493
4494	if (dev_is_real_dma_subdevice(dev)) {
4495		info->bus = pdev->bus->number;
4496		info->devfn = pdev->devfn;
4497		info->segment = pci_domain_nr(pdev->bus);
4498	} else {
4499		info->bus = bus;
4500		info->devfn = devfn;
4501		info->segment = iommu->segment;
4502	}
4503
4504	info->dev = dev;
4505	info->iommu = iommu;
4506	if (dev_is_pci(dev)) {
4507		if (ecap_dev_iotlb_support(iommu->ecap) &&
4508		    pci_ats_supported(pdev) &&
4509		    dmar_ats_supported(pdev, iommu)) {
4510			info->ats_supported = 1;
4511			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4512		}
4513		if (sm_supported(iommu)) {
4514			if (pasid_supported(iommu)) {
4515				int features = pci_pasid_features(pdev);
4516
4517				if (features >= 0)
4518					info->pasid_supported = features | 1;
4519			}
4520
4521			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4522			    pci_pri_supported(pdev))
4523				info->pri_supported = 1;
4524		}
4525	}
4526
4527	dev_iommu_priv_set(dev, info);
4528
4529	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4530		ret = intel_pasid_alloc_table(dev);
4531		if (ret) {
4532			dev_err(dev, "PASID table allocation failed\n");
4533			dev_iommu_priv_set(dev, NULL);
4534			kfree(info);
4535			return ERR_PTR(ret);
4536		}
4537	}
4538
4539	return &iommu->iommu;
4540}
4541
4542static void intel_iommu_release_device(struct device *dev)
4543{
4544	struct device_domain_info *info = dev_iommu_priv_get(dev);
4545
4546	dmar_remove_one_dev_info(dev);
4547	intel_pasid_free_table(dev);
4548	dev_iommu_priv_set(dev, NULL);
4549	kfree(info);
4550	set_dma_ops(dev, NULL);
4551}
4552
4553static void intel_iommu_probe_finalize(struct device *dev)
4554{
4555	set_dma_ops(dev, NULL);
4556	iommu_setup_dma_ops(dev, 0, U64_MAX);
4557}
4558
4559static void intel_iommu_get_resv_regions(struct device *device,
4560					 struct list_head *head)
4561{
4562	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4563	struct iommu_resv_region *reg;
4564	struct dmar_rmrr_unit *rmrr;
4565	struct device *i_dev;
4566	int i;
4567
4568	rcu_read_lock();
4569	for_each_rmrr_units(rmrr) {
4570		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4571					  i, i_dev) {
4572			struct iommu_resv_region *resv;
4573			enum iommu_resv_type type;
4574			size_t length;
4575
4576			if (i_dev != device &&
4577			    !is_downstream_to_pci_bridge(device, i_dev))
4578				continue;
4579
4580			length = rmrr->end_address - rmrr->base_address + 1;
4581
4582			type = device_rmrr_is_relaxable(device) ?
4583				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4584
4585			resv = iommu_alloc_resv_region(rmrr->base_address,
4586						       length, prot, type,
4587						       GFP_ATOMIC);
4588			if (!resv)
4589				break;
4590
4591			list_add_tail(&resv->list, head);
4592		}
4593	}
4594	rcu_read_unlock();
4595
4596#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4597	if (dev_is_pci(device)) {
4598		struct pci_dev *pdev = to_pci_dev(device);
4599
4600		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4601			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4602					IOMMU_RESV_DIRECT_RELAXABLE,
4603					GFP_KERNEL);
4604			if (reg)
4605				list_add_tail(&reg->list, head);
4606		}
4607	}
4608#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4609
4610	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4611				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4612				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4613	if (!reg)
4614		return;
4615	list_add_tail(&reg->list, head);
4616}
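/*
 * The reserved regions built above (RMRRs as direct mappings, the optional
 * 0-16MiB ISA/floppy window, and the IOAPIC MSI range) are visible to user
 * space, e.g. via /sys/kernel/iommu_groups/<group>/reserved_regions
 * (illustrative path; one "<start> <end> <type>" line per region).
 */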
4617
4618static struct iommu_group *intel_iommu_device_group(struct device *dev)
4619{
4620	if (dev_is_pci(dev))
4621		return pci_device_group(dev);
4622	return generic_device_group(dev);
4623}
4624
4625static int intel_iommu_enable_sva(struct device *dev)
4626{
4627	struct device_domain_info *info = dev_iommu_priv_get(dev);
4628	struct intel_iommu *iommu;
 
4629	int ret;
4630
4631	if (!info || dmar_disabled)
 
4632		return -EINVAL;
4633
4634	iommu = info->iommu;
4635	if (!iommu)
4636		return -EINVAL;
4637
4638	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
 
4639		return -ENODEV;
4640
4641	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4642		return -EINVAL;
4643
4644	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4645	if (!ret)
4646		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4647
4648	return ret;
4649}
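/*
 * SVA can only be enabled above when the IOMMU is SVM-capable and the
 * device already has PASID, PRI and ATS enabled; the device is then added
 * to the IOMMU's IOPF queue and iommu_queue_iopf() is registered as its
 * fault handler for recoverable I/O page faults.
 */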
4650
4651static int intel_iommu_disable_sva(struct device *dev)
4652{
4653	struct device_domain_info *info = dev_iommu_priv_get(dev);
4654	struct intel_iommu *iommu = info->iommu;
4655	int ret;
4656
4657	ret = iommu_unregister_device_fault_handler(dev);
4658	if (!ret)
4659		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4660
4661	return ret;
4662}
4663
4664static int intel_iommu_enable_iopf(struct device *dev)
4665{
4666	struct device_domain_info *info = dev_iommu_priv_get(dev);
 
4667
4668	if (info && info->pri_supported)
4669		return 0;
 
4670
4671	return -ENODEV;
4672}
4673
4674static int
4675intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4676{
4677	switch (feat) {
4678	case IOMMU_DEV_FEAT_IOPF:
4679		return intel_iommu_enable_iopf(dev);
4680
4681	case IOMMU_DEV_FEAT_SVA:
4682		return intel_iommu_enable_sva(dev);
4683
4684	default:
4685		return -ENODEV;
4686	}
4687}
4688
4689static int
4690intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4691{
4692	switch (feat) {
4693	case IOMMU_DEV_FEAT_IOPF:
4694		return 0;
4695
4696	case IOMMU_DEV_FEAT_SVA:
4697		return intel_iommu_disable_sva(dev);
4698
4699	default:
4700		return -ENODEV;
4701	}
4702}
4703
4704static bool intel_iommu_is_attach_deferred(struct device *dev)
4705{
4706	struct device_domain_info *info = dev_iommu_priv_get(dev);
4707
4708	return translation_pre_enabled(info->iommu) && !info->domain;
4709}
4710
4711/*
4712 * Check that the device does not live on an external facing PCI port that is
4713 * marked as untrusted. Such devices should not be able to apply quirks and
4714 * thus not be able to bypass the IOMMU restrictions.
4715 */
4716static bool risky_device(struct pci_dev *pdev)
4717{
4718	if (pdev->untrusted) {
4719		pci_info(pdev,
4720			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4721			 pdev->vendor, pdev->device);
4722		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4723		return true;
4724	}
4725	return false;
4726}
4727
4728static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4729				       unsigned long iova, size_t size)
4730{
4731	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4732	unsigned long pages = aligned_nrpages(iova, size);
4733	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4734	struct iommu_domain_info *info;
4735	unsigned long i;
4736
4737	xa_for_each(&dmar_domain->iommu_array, i, info)
4738		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4739}
4740
4741static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4742{
4743	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4744	struct iommu_domain *domain;
4745
4746	/* Domain type specific cleanup: */
4747	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4748	if (domain) {
4749		switch (domain->type) {
4750		case IOMMU_DOMAIN_SVA:
4751			intel_svm_remove_dev_pasid(dev, pasid);
4752			break;
4753		default:
4754			/* should never reach here */
4755			WARN_ON(1);
4756			break;
4757		}
4758	}
4759
4760	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4761}
4762
4763const struct iommu_ops intel_iommu_ops = {
4764	.capable		= intel_iommu_capable,
4765	.domain_alloc		= intel_iommu_domain_alloc,
4766	.probe_device		= intel_iommu_probe_device,
4767	.probe_finalize		= intel_iommu_probe_finalize,
4768	.release_device		= intel_iommu_release_device,
4769	.get_resv_regions	= intel_iommu_get_resv_regions,
4770	.device_group		= intel_iommu_device_group,
4771	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4772	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4773	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4774	.def_domain_type	= device_def_domain_type,
4775	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4776	.pgsize_bitmap		= SZ_4K,
4777#ifdef CONFIG_INTEL_IOMMU_SVM
4778	.page_response		= intel_svm_page_response,
4779#endif
4780	.default_domain_ops = &(const struct iommu_domain_ops) {
4781		.attach_dev		= intel_iommu_attach_device,
4782		.map_pages		= intel_iommu_map_pages,
4783		.unmap_pages		= intel_iommu_unmap_pages,
4784		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4785		.flush_iotlb_all        = intel_flush_iotlb_all,
4786		.iotlb_sync		= intel_iommu_tlb_sync,
4787		.iova_to_phys		= intel_iommu_iova_to_phys,
4788		.free			= intel_iommu_domain_free,
4789		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4790	}
4791};
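/*
 * This ops structure is registered for each DMAR unit from
 * intel_iommu_init() via iommu_device_register(), which is how the core
 * IOMMU layer routes probe, attach and map/unmap requests to this driver.
 */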
4792
4793static void quirk_iommu_igfx(struct pci_dev *dev)
4794{
4795	if (risky_device(dev))
4796		return;
4797
4798	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4799	dmar_map_gfx = 0;
4800}
4801
4802/* G4x/GM45 integrated gfx dmar support is totally busted. */
4803DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4804DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4805DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4806DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4807DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4808DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4809DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4810
4811/* Broadwell igfx malfunctions with dmar */
4812DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4813DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4814DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4815DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4816DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4817DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4818DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4819DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4820DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4821DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4822DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4823DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4824DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4825DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4826DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4827DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4828DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4829DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4830DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4831DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4832DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4833DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4834DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4835DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4836
4837static void quirk_iommu_rwbf(struct pci_dev *dev)
4838{
4839	if (risky_device(dev))
4840		return;
4841
4842	/*
4843	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4844	 * but needs it. Same seems to hold for the desktop versions.
4845	 */
4846	pci_info(dev, "Forcing write-buffer flush capability\n");
4847	rwbf_quirk = 1;
4848}
4849
4850DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4851DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4852DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4853DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4854DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4855DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4856DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4857
4858#define GGC 0x52
4859#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4860#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4861#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4862#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4863#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4864#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4865#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4866#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4867
4868static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4869{
4870	unsigned short ggc;
4871
4872	if (risky_device(dev))
4873		return;
4874
4875	if (pci_read_config_word(dev, GGC, &ggc))
4876		return;
4877
4878	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4879		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4880		dmar_map_gfx = 0;
4881	} else if (dmar_map_gfx) {
4882		/* we have to ensure the gfx device is idle before we flush */
4883		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4884		iommu_set_dma_strict();
4885	}
4886}
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4889DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4890DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4891
4892static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4893{
4894	unsigned short ver;
4895
4896	if (!IS_GFX_DEVICE(dev))
4897		return;
4898
4899	ver = (dev->device >> 8) & 0xff;
4900	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4901	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4902	    ver != 0x9a && ver != 0xa7)
4903		return;
4904
4905	if (risky_device(dev))
4906		return;
4907
4908	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4909	iommu_skip_te_disable = 1;
4910}
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4912
4913/* On Tylersburg chipsets, some BIOSes have been known to enable the
4914   ISOCH DMAR unit for the Azalia sound device, but not give it any
4915   TLB entries, which causes it to deadlock. Check for that.  We do
4916   this in a function called from init_dmars(), instead of in a PCI
4917   quirk, because we don't want to print the obnoxious "BIOS broken"
4918   message if VT-d is actually disabled.
4919*/
4920static void __init check_tylersburg_isoch(void)
4921{
4922	struct pci_dev *pdev;
4923	uint32_t vtisochctrl;
4924
4925	/* If there's no Azalia in the system anyway, forget it. */
4926	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4927	if (!pdev)
4928		return;
4929
4930	if (risky_device(pdev)) {
4931		pci_dev_put(pdev);
4932		return;
4933	}
4934
4935	pci_dev_put(pdev);
4936
4937	/* System Management Registers. Might be hidden, in which case
4938	   we can't do the sanity check. But that's OK, because the
4939	   known-broken BIOSes _don't_ actually hide it, so far. */
4940	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4941	if (!pdev)
4942		return;
4943
4944	if (risky_device(pdev)) {
4945		pci_dev_put(pdev);
4946		return;
4947	}
4948
4949	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4950		pci_dev_put(pdev);
4951		return;
4952	}
4953
4954	pci_dev_put(pdev);
4955
4956	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4957	if (vtisochctrl & 1)
4958		return;
4959
4960	/* Drop all bits other than the number of TLB entries */
4961	vtisochctrl &= 0x1c;
4962
4963	/* If we have the recommended number of TLB entries (16), fine. */
4964	if (vtisochctrl == 0x10)
4965		return;
4966
4967	/* Zero TLB entries? You get to ride the short bus to school. */
4968	if (!vtisochctrl) {
4969		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4970		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4971		     dmi_get_system_info(DMI_BIOS_VENDOR),
4972		     dmi_get_system_info(DMI_BIOS_VERSION),
4973		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4974		iommu_identity_mapping |= IDENTMAP_AZALIA;
4975		return;
4976	}
4977
4978	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4979	       vtisochctrl);
4980}
4981
4982/*
4983 * Here we deal with a device TLB defect where the device may inadvertently
4984 * issue an ATS invalidation completion before posted writes initiated with
4985 * translated addresses that utilized translations matching the invalidation
4986 * address range, violating the invalidation completion ordering.
4987 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
4988 * is vulnerable to this defect. In other words, any dTLB invalidation that
4989 * is not initiated under the control of the trusted/privileged host device
4990 * driver must use this quirk.
4991 * Device TLBs are invalidated under the following six conditions:
4992 * 1. Device driver does a DMA API unmap of an IOVA
4993 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4994 * 3. PASID is torn down after the PASID cache is flushed, e.g. process
4995 *    exit_mmap() due to a crash
4996 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where the
4997 *    VM has to free pages that were unmapped
4998 * 5. Userspace driver unmaps a DMA buffer
4999 * 6. Cache invalidation in vSVA usage (upcoming)
5000 *
5001 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5002 * before unmap/unbind. For #3, the iommu driver gets the mmu_notifier to
5003 * invalidate the TLB the same way as a normal user unmap, which will use this
5004 * quirk. The dTLB invalidation after a PASID cache flush does not need it.
5005 *
5006 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5007 */
5008void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5009			       unsigned long address, unsigned long mask,
5010			       u32 pasid, u16 qdep)
5011{
5012	u16 sid;
5013
5014	if (likely(!info->dtlb_extra_inval))
5015		return;
5016
5017	sid = PCI_DEVID(info->bus, info->devfn);
5018	if (pasid == PASID_RID2PASID) {
5019		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5020				   qdep, address, mask);
5021	} else {
5022		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5023					 pasid, qdep, address, mask);
5024	}
5025}
v5.9
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/init.h>
  17#include <linux/bitmap.h>
  18#include <linux/debugfs.h>
  19#include <linux/export.h>
  20#include <linux/slab.h>
  21#include <linux/irq.h>
  22#include <linux/interrupt.h>
  23#include <linux/spinlock.h>
  24#include <linux/pci.h>
  25#include <linux/dmar.h>
  26#include <linux/dma-mapping.h>
  27#include <linux/mempool.h>
  28#include <linux/memory.h>
  29#include <linux/cpu.h>
  30#include <linux/timer.h>
  31#include <linux/io.h>
  32#include <linux/iova.h>
  33#include <linux/iommu.h>
  34#include <linux/intel-iommu.h>
  35#include <linux/syscore_ops.h>
  36#include <linux/tboot.h>
  37#include <linux/dmi.h>
  38#include <linux/pci-ats.h>
  39#include <linux/memblock.h>
  40#include <linux/dma-contiguous.h>
  41#include <linux/dma-direct.h>
  42#include <linux/crash_dump.h>
  43#include <linux/numa.h>
  44#include <linux/swiotlb.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48#include <trace/events/intel_iommu.h>
  49
 
 
  50#include "../irq_remapping.h"
 
  51#include "pasid.h"
 
  52
  53#define ROOT_SIZE		VTD_PAGE_SIZE
  54#define CONTEXT_SIZE		VTD_PAGE_SIZE
  55
  56#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  57#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  58#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  59#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  60
  61#define IOAPIC_RANGE_START	(0xfee00000)
  62#define IOAPIC_RANGE_END	(0xfeefffff)
  63#define IOVA_START_ADDR		(0x1000)
  64
  65#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  66
  67#define MAX_AGAW_WIDTH 64
  68#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  69
  70#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  71#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  72
  73/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  74   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  75#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
  76				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  77#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  78
  79/* IO virtual address start page frame number */
  80#define IOVA_START_PFN		(1)
  81
  82#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
  83
  84/* page table handling */
  85#define LEVEL_STRIDE		(9)
  86#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
  87
  88/*
  89 * This bitmap is used to advertise the page sizes our hardware support
  90 * to the IOMMU core, which will then use this information to split
  91 * physically contiguous memory regions it is mapping into page sizes
  92 * that we support.
  93 *
  94 * Traditionally the IOMMU core just handed us the mappings directly,
  95 * after making sure the size is an order of a 4KiB page and that the
  96 * mapping has natural alignment.
  97 *
  98 * To retain this behavior, we currently advertise that we support
  99 * all page sizes that are an order of 4KiB.
 100 *
 101 * If at some point we'd like to utilize the IOMMU core's new behavior,
 102 * we could change this to advertise the real page sizes we support.
 103 */
 104#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
 105
 106static inline int agaw_to_level(int agaw)
 107{
 108	return agaw + 2;
 109}
 110
 111static inline int agaw_to_width(int agaw)
 112{
 113	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 114}
 115
 116static inline int width_to_agaw(int width)
 117{
 118	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 119}
 120
 121static inline unsigned int level_to_offset_bits(int level)
 122{
 123	return (level - 1) * LEVEL_STRIDE;
 124}
 125
 126static inline int pfn_level_offset(u64 pfn, int level)
 127{
 128	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 129}
 130
 131static inline u64 level_mask(int level)
 132{
 133	return -1ULL << level_to_offset_bits(level);
 134}
 135
 136static inline u64 level_size(int level)
 137{
 138	return 1ULL << level_to_offset_bits(level);
 139}
 140
 141static inline u64 align_to_level(u64 pfn, int level)
 142{
 143	return (pfn + level_size(level) - 1) & level_mask(level);
 144}
 145
 146static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 147{
 148	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 149}
 150
 151/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 152   are never going to work. */
 153static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 154{
 155	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 156}
 157
 158static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 159{
 160	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 161}
 162static inline unsigned long page_to_dma_pfn(struct page *pg)
 163{
 164	return mm_to_dma_pfn(page_to_pfn(pg));
 165}
 166static inline unsigned long virt_to_dma_pfn(void *p)
 167{
 168	return page_to_dma_pfn(virt_to_page(p));
 169}
 170
 171/* global iommu list, set NULL for ignored DMAR units */
 172static struct intel_iommu **g_iommus;
 173
 174static void __init check_tylersburg_isoch(void);
 175static int rwbf_quirk;
 176
 177/*
 178 * set to 1 to panic kernel if can't successfully enable VT-d
 179 * (used when kernel is launched w/ TXT)
 180 */
 181static int force_on = 0;
 182int intel_iommu_tboot_noforce;
 183static int no_platform_optin;
 184
 185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 186
 187/*
 188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 189 * if marked present.
 190 */
 191static phys_addr_t root_entry_lctp(struct root_entry *re)
 192{
 193	if (!(re->lo & 1))
 194		return 0;
 195
 196	return re->lo & VTD_PAGE_MASK;
 197}
 198
 199/*
 200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 201 * if marked present.
 202 */
 203static phys_addr_t root_entry_uctp(struct root_entry *re)
 204{
 205	if (!(re->hi & 1))
 206		return 0;
 207
 208	return re->hi & VTD_PAGE_MASK;
 209}
 210
 211static inline void context_clear_pasid_enable(struct context_entry *context)
 212{
 213	context->lo &= ~(1ULL << 11);
 214}
 215
 216static inline bool context_pasid_enabled(struct context_entry *context)
 217{
 218	return !!(context->lo & (1ULL << 11));
 219}
 220
 221static inline void context_set_copied(struct context_entry *context)
 222{
 223	context->hi |= (1ull << 3);
 224}
 225
 226static inline bool context_copied(struct context_entry *context)
 227{
 228	return !!(context->hi & (1ULL << 3));
 229}
 230
 231static inline bool __context_present(struct context_entry *context)
 232{
 233	return (context->lo & 1);
 234}
 235
 236bool context_present(struct context_entry *context)
 237{
 238	return context_pasid_enabled(context) ?
 239	     __context_present(context) :
 240	     __context_present(context) && !context_copied(context);
 241}
 242
 243static inline void context_set_present(struct context_entry *context)
 244{
 245	context->lo |= 1;
 246}
 247
 248static inline void context_set_fault_enable(struct context_entry *context)
 249{
 250	context->lo &= (((u64)-1) << 2) | 1;
 251}
 252
 253static inline void context_set_translation_type(struct context_entry *context,
 254						unsigned long value)
 255{
 256	context->lo &= (((u64)-1) << 4) | 3;
 257	context->lo |= (value & 3) << 2;
 258}
 259
 260static inline void context_set_address_root(struct context_entry *context,
 261					    unsigned long value)
 262{
 263	context->lo &= ~VTD_PAGE_MASK;
 264	context->lo |= value & VTD_PAGE_MASK;
 265}
 266
 267static inline void context_set_address_width(struct context_entry *context,
 268					     unsigned long value)
 269{
 270	context->hi |= value & 7;
 271}
 272
 273static inline void context_set_domain_id(struct context_entry *context,
 274					 unsigned long value)
 275{
 276	context->hi |= (value & ((1 << 16) - 1)) << 8;
 277}
 278
 
 
 
 
 
 279static inline int context_domain_id(struct context_entry *c)
 280{
 281	return((c->hi >> 8) & 0xffff);
 282}
 283
 284static inline void context_clear_entry(struct context_entry *context)
 285{
 286	context->lo = 0;
 287	context->hi = 0;
 288}
 289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 290/*
 291 * This domain is a statically identity mapping domain.
 292 *	1. This domain creats a static 1:1 mapping to all usable memory.
 293 * 	2. It maps to each iommu if successful.
 294 *	3. Each iommu mapps to this domain if successful.
 295 */
 296static struct dmar_domain *si_domain;
 297static int hw_pass_through = 1;
 298
 299#define for_each_domain_iommu(idx, domain)			\
 300	for (idx = 0; idx < g_num_of_iommus; idx++)		\
 301		if (domain->iommu_refcnt[idx])
 302
 303struct dmar_rmrr_unit {
 304	struct list_head list;		/* list of rmrr units	*/
 305	struct acpi_dmar_header *hdr;	/* ACPI header		*/
 306	u64	base_address;		/* reserved base address*/
 307	u64	end_address;		/* reserved end address */
 308	struct dmar_dev_scope *devices;	/* target devices */
 309	int	devices_cnt;		/* target device count */
 310};
 311
 312struct dmar_atsr_unit {
 313	struct list_head list;		/* list of ATSR units */
 314	struct acpi_dmar_header *hdr;	/* ACPI header */
 315	struct dmar_dev_scope *devices;	/* target devices */
 316	int devices_cnt;		/* target device count */
 317	u8 include_all:1;		/* include all ports */
 318};
 319
 
 
 
 
 
 
 
 
 
 320static LIST_HEAD(dmar_atsr_units);
 321static LIST_HEAD(dmar_rmrr_units);
 
 322
 323#define for_each_rmrr_units(rmrr) \
 324	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 325
 326/* bitmap for indexing intel_iommus */
 327static int g_num_of_iommus;
 328
 329static void domain_exit(struct dmar_domain *domain);
 330static void domain_remove_dev_info(struct dmar_domain *domain);
 331static void dmar_remove_one_dev_info(struct device *dev);
 332static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 333static int intel_iommu_attach_device(struct iommu_domain *domain,
 334				     struct device *dev);
 335static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 336					    dma_addr_t iova);
 337
 338#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 339int dmar_disabled = 0;
 340#else
 341int dmar_disabled = 1;
 342#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
 343
 344#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
 345int intel_iommu_sm = 1;
 346#else
 347int intel_iommu_sm;
 348#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
 349
 350int intel_iommu_enabled = 0;
 351EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 352
 353static int dmar_map_gfx = 1;
 354static int dmar_forcedac;
 355static int intel_iommu_strict;
 356static int intel_iommu_superpage = 1;
 357static int iommu_identity_mapping;
 358static int intel_no_bounce;
 359static int iommu_skip_te_disable;
 360
 361#define IDENTMAP_GFX		2
 362#define IDENTMAP_AZALIA		4
 363
 364int intel_iommu_gfx_mapped;
 365EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 366
 367#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
 368struct device_domain_info *get_domain_info(struct device *dev)
 369{
 370	struct device_domain_info *info;
 371
 372	if (!dev)
 373		return NULL;
 374
 375	info = dev_iommu_priv_get(dev);
 376	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
 377		return NULL;
 378
 379	return info;
 380}
 381
 382DEFINE_SPINLOCK(device_domain_lock);
 383static LIST_HEAD(device_domain_list);
 384
 385#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
 386				to_pci_dev(d)->untrusted)
 387
 388/*
 389 * Iterate over elements in device_domain_list and call the specified
 390 * callback @fn against each element.
 391 */
 392int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 393				     void *data), void *data)
 394{
 395	int ret = 0;
 396	unsigned long flags;
 397	struct device_domain_info *info;
 398
 399	spin_lock_irqsave(&device_domain_lock, flags);
 400	list_for_each_entry(info, &device_domain_list, global) {
 401		ret = fn(info, data);
 402		if (ret) {
 403			spin_unlock_irqrestore(&device_domain_lock, flags);
 404			return ret;
 405		}
 406	}
 407	spin_unlock_irqrestore(&device_domain_lock, flags);
 408
 409	return 0;
 410}
 411
 412const struct iommu_ops intel_iommu_ops;
 413
 414static bool translation_pre_enabled(struct intel_iommu *iommu)
 415{
 416	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 417}
 418
 419static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 420{
 421	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 422}
 423
 424static void init_translation_status(struct intel_iommu *iommu)
 425{
 426	u32 gsts;
 427
 428	gsts = readl(iommu->reg + DMAR_GSTS_REG);
 429	if (gsts & DMA_GSTS_TES)
 430		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 431}
 432
 433static int __init intel_iommu_setup(char *str)
 434{
 435	if (!str)
 436		return -EINVAL;
 
 437	while (*str) {
 438		if (!strncmp(str, "on", 2)) {
 439			dmar_disabled = 0;
 440			pr_info("IOMMU enabled\n");
 441		} else if (!strncmp(str, "off", 3)) {
 442			dmar_disabled = 1;
 443			no_platform_optin = 1;
 444			pr_info("IOMMU disabled\n");
 445		} else if (!strncmp(str, "igfx_off", 8)) {
 446			dmar_map_gfx = 0;
 447			pr_info("Disable GFX device mapping\n");
 448		} else if (!strncmp(str, "forcedac", 8)) {
 449			pr_info("Forcing DAC for PCI devices\n");
 450			dmar_forcedac = 1;
 451		} else if (!strncmp(str, "strict", 6)) {
 452			pr_info("Disable batched IOTLB flush\n");
 453			intel_iommu_strict = 1;
 454		} else if (!strncmp(str, "sp_off", 6)) {
 455			pr_info("Disable supported super page\n");
 456			intel_iommu_superpage = 0;
 457		} else if (!strncmp(str, "sm_on", 5)) {
 458			pr_info("Intel-IOMMU: scalable mode supported\n");
 459			intel_iommu_sm = 1;
 
 
 
 460		} else if (!strncmp(str, "tboot_noforce", 13)) {
 461			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 462			intel_iommu_tboot_noforce = 1;
 463		} else if (!strncmp(str, "nobounce", 8)) {
 464			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
 465			intel_no_bounce = 1;
 466		}
 467
 468		str += strcspn(str, ",");
 469		while (*str == ',')
 470			str++;
 471	}
 472	return 0;
 
 473}
 474__setup("intel_iommu=", intel_iommu_setup);
 475
 476static struct kmem_cache *iommu_domain_cache;
 477static struct kmem_cache *iommu_devinfo_cache;
 478
 479static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 480{
 481	struct dmar_domain **domains;
 482	int idx = did >> 8;
 483
 484	domains = iommu->domains[idx];
 485	if (!domains)
 486		return NULL;
 487
 488	return domains[did & 0xff];
 489}
 490
 491static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 492			     struct dmar_domain *domain)
 493{
 494	struct dmar_domain **domains;
 495	int idx = did >> 8;
 496
 497	if (!iommu->domains[idx]) {
 498		size_t size = 256 * sizeof(struct dmar_domain *);
 499		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 500	}
 501
 502	domains = iommu->domains[idx];
 503	if (WARN_ON(!domains))
 504		return;
 505	else
 506		domains[did & 0xff] = domain;
 507}
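    /*
     * get_iommu_domain()/set_iommu_domain() treat the 16-bit domain id as
     * a two-level index: the high byte selects one of the lazily
     * allocated 256-pointer pages in iommu->domains[], the low byte
     * selects the slot within that page. For example, did 0x0123 lands
     * in iommu->domains[0x01][0x23].
     */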
 508
 509void *alloc_pgtable_page(int node)
 510{
 511	struct page *page;
 512	void *vaddr = NULL;
 513
 514	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 515	if (page)
 516		vaddr = page_address(page);
 517	return vaddr;
 518}
 519
 520void free_pgtable_page(void *vaddr)
 521{
 522	free_page((unsigned long)vaddr);
 523}
 524
 525static inline void *alloc_domain_mem(void)
 526{
 527	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 528}
 529
 530static void free_domain_mem(void *vaddr)
 531{
 532	kmem_cache_free(iommu_domain_cache, vaddr);
 533}
 534
 535static inline void *alloc_devinfo_mem(void)
 536{
 537	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 538}
 539
 540static inline void free_devinfo_mem(void *vaddr)
 541{
 542	kmem_cache_free(iommu_devinfo_cache, vaddr);
 543}
 544
 545static inline int domain_type_is_si(struct dmar_domain *domain)
 546{
 547	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 548}
 549
 550static inline bool domain_use_first_level(struct dmar_domain *domain)
 551{
 552	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
 553}
 554
 555static inline int domain_pfn_supported(struct dmar_domain *domain,
 556				       unsigned long pfn)
 557{
 558	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 559
 560	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 561}
 562
 563static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 564{
 565	unsigned long sagaw;
 566	int agaw = -1;
 567
 568	sagaw = cap_sagaw(iommu->cap);
 569	for (agaw = width_to_agaw(max_gaw);
 570	     agaw >= 0; agaw--) {
 571		if (test_bit(agaw, &sagaw))
 572			break;
 573	}
 574
 575	return agaw;
 576}
 577
 578/*
 579 * Calculate max SAGAW for each iommu.
 580 */
 581int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 582{
 583	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 584}
 585
 586/*
 587 * calculate agaw for each iommu.
 588 * "SAGAW" may be different across iommus; use a default agaw, and
 589 * fall back to a smaller supported agaw for iommus that don't support the default.
 590 */
 591int iommu_calculate_agaw(struct intel_iommu *iommu)
 592{
 593	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 594}
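    /*
     * For example, with the 57-bit default address width the loop in
     * __iommu_calculate_agaw() starts at agaw 3 and walks down the SAGAW
     * bits; on hardware that only advertises 4-level (48-bit) tables it
     * falls back to agaw 2, and -1 is returned if nothing matches.
     */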
 595
 596/* This function only returns a single iommu in a domain */
 597struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 598{
 599	int iommu_id;
 600
 601	/* si_domain and vm domain should not get here. */
 602	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
 603		return NULL;
 604
 605	for_each_domain_iommu(iommu_id, domain)
 606		break;
 607
 608	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 609		return NULL;
 610
 611	return g_iommus[iommu_id];
 612}
 613
 614static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 615{
 616	return sm_supported(iommu) ?
 617			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 618}
 619
 620static void domain_update_iommu_coherency(struct dmar_domain *domain)
 621{
 622	struct dmar_drhd_unit *drhd;
 623	struct intel_iommu *iommu;
 624	bool found = false;
 625	int i;
 626
 627	domain->iommu_coherency = 1;
 628
 629	for_each_domain_iommu(i, domain) {
 630		found = true;
 631		if (!iommu_paging_structure_coherency(g_iommus[i])) {
 632			domain->iommu_coherency = 0;
 633			break;
 634		}
 635	}
 636	if (found)
 637		return;
 638
 639	/* No hardware attached; use lowest common denominator */
 640	rcu_read_lock();
 641	for_each_active_iommu(iommu, drhd) {
 642		if (!iommu_paging_structure_coherency(iommu)) {
 643			domain->iommu_coherency = 0;
 644			break;
 645		}
 646	}
 647	rcu_read_unlock();
 648}
 649
 650static int domain_update_iommu_snooping(struct intel_iommu *skip)
 651{
 652	struct dmar_drhd_unit *drhd;
 653	struct intel_iommu *iommu;
 654	int ret = 1;
 655
 656	rcu_read_lock();
 657	for_each_active_iommu(iommu, drhd) {
 658		if (iommu != skip) {
 659			if (!ecap_sc_support(iommu->ecap)) {
 660				ret = 0;
 661				break;
 662			}
 663		}
 664	}
 665	rcu_read_unlock();
 666
 667	return ret;
 668}
 669
 670static int domain_update_iommu_superpage(struct dmar_domain *domain,
 671					 struct intel_iommu *skip)
 672{
 673	struct dmar_drhd_unit *drhd;
 674	struct intel_iommu *iommu;
 675	int mask = 0x3;
 676
 677	if (!intel_iommu_superpage) {
 678		return 0;
 679	}
 680
 681	/* set iommu_superpage to the smallest common denominator */
 682	rcu_read_lock();
 683	for_each_active_iommu(iommu, drhd) {
 684		if (iommu != skip) {
 685			if (domain && domain_use_first_level(domain)) {
 686				if (!cap_fl1gp_support(iommu->cap))
 687					mask = 0x1;
 688			} else {
 689				mask &= cap_super_page_val(iommu->cap);
 690			}
 691
 692			if (!mask)
 693				break;
 694		}
 695	}
 696	rcu_read_unlock();
 697
 698	return fls(mask);
 699}
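    /*
     * The returned value is the number of usable superpage levels:
     * fls(mask) is 0 when no superpage can be used, 1 when only 2MiB
     * pages are common to all IOMMUs, and 2 when 1GiB pages are
     * available as well (following the VT-d capability encoding).
     */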
 700
 701/* Some capabilities may be different across iommus */
 702static void domain_update_iommu_cap(struct dmar_domain *domain)
 703{
 704	domain_update_iommu_coherency(domain);
 705	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 706	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 707}
 708
 709struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 710					 u8 devfn, int alloc)
 711{
 712	struct root_entry *root = &iommu->root_entry[bus];
 713	struct context_entry *context;
 714	u64 *entry;
 715
 716	entry = &root->lo;
 717	if (sm_supported(iommu)) {
 718		if (devfn >= 0x80) {
 719			devfn -= 0x80;
 720			entry = &root->hi;
 721		}
 722		devfn *= 2;
 723	}
 724	if (*entry & 1)
 725		context = phys_to_virt(*entry & VTD_PAGE_MASK);
 726	else {
 727		unsigned long phy_addr;
 728		if (!alloc)
 729			return NULL;
 730
 731		context = alloc_pgtable_page(iommu->node);
 732		if (!context)
 733			return NULL;
 734
 735		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 736		phy_addr = virt_to_phys((void *)context);
 737		*entry = phy_addr | 1;
 738		__iommu_flush_cache(iommu, entry, sizeof(*entry));
 739	}
 740	return &context[devfn];
 741}
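    /*
     * In scalable mode a context entry is twice the size of a legacy
     * one, so each context table only holds 128 entries: devfn 0x00-0x7f
     * is reached through root->lo and devfn 0x80-0xff through root->hi,
     * with the index doubled to account for the larger entries. Bit 0 of
     * the entry is its present bit, hence the "*entry & 1" test and the
     * "phy_addr | 1" write above.
     */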
 742
 743static bool attach_deferred(struct device *dev)
 744{
 745	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
 746}
 747
 748/**
 749 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 750 *				 sub-hierarchy of a candidate PCI-PCI bridge
 751 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 752 * @bridge: the candidate PCI-PCI bridge
 753 *
 754 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 755 */
 756static bool
 757is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 758{
 759	struct pci_dev *pdev, *pbridge;
 760
 761	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 762		return false;
 763
 764	pdev = to_pci_dev(dev);
 765	pbridge = to_pci_dev(bridge);
 766
 767	if (pbridge->subordinate &&
 768	    pbridge->subordinate->number <= pdev->bus->number &&
 769	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
 770		return true;
 771
 772	return false;
 773}
 774
 775static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 776{
 777	struct dmar_drhd_unit *drhd;
 778	u32 vtbar;
 779	int rc;
 780
 781	/* We know that this device on this chipset has its own IOMMU.
 782	 * If we find it under a different IOMMU, then the BIOS is lying
 783	 * to us. Hope that the IOMMU for this device is actually
 784	 * disabled, and it needs no translation...
 785	 */
 786	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 787	if (rc) {
 788		/* "can't" happen */
 789		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 790		return false;
 791	}
 792	vtbar &= 0xffff0000;
 793
 794	/* we know that this iommu should be at offset 0xa000 from vtbar */
 795	drhd = dmar_find_matched_drhd_unit(pdev);
 796	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 797		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 798		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 799		return true;
 800	}
 801
 802	return false;
 803}
 804
 805static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 806{
 807	if (!iommu || iommu->drhd->ignored)
 808		return true;
 809
 810	if (dev_is_pci(dev)) {
 811		struct pci_dev *pdev = to_pci_dev(dev);
 812
 813		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 814		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 815		    quirk_ioat_snb_local_iommu(pdev))
 816			return true;
 817	}
 818
 819	return false;
 820}
 821
 822struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 823{
 824	struct dmar_drhd_unit *drhd = NULL;
 825	struct pci_dev *pdev = NULL;
 826	struct intel_iommu *iommu;
 827	struct device *tmp;
 828	u16 segment = 0;
 829	int i;
 830
 831	if (!dev)
 832		return NULL;
 833
 834	if (dev_is_pci(dev)) {
 835		struct pci_dev *pf_pdev;
 836
 837		pdev = pci_real_dma_dev(to_pci_dev(dev));
 838
 839		/* VFs aren't listed in scope tables; we need to look up
 840		 * the PF instead to find the IOMMU. */
 841		pf_pdev = pci_physfn(pdev);
 842		dev = &pf_pdev->dev;
 843		segment = pci_domain_nr(pdev->bus);
 844	} else if (has_acpi_companion(dev))
 845		dev = &ACPI_COMPANION(dev)->dev;
 846
 847	rcu_read_lock();
 848	for_each_iommu(iommu, drhd) {
 849		if (pdev && segment != drhd->segment)
 850			continue;
 851
 852		for_each_active_dev_scope(drhd->devices,
 853					  drhd->devices_cnt, i, tmp) {
 854			if (tmp == dev) {
 855				/* For a VF use its original BDF# not that of the PF
 856				 * which we used for the IOMMU lookup. Strictly speaking
 857				 * we could do this for all PCI devices; we only need to
 858				 * get the BDF# from the scope table for ACPI matches. */
 859				if (pdev && pdev->is_virtfn)
 860					goto got_pdev;
 861
 862				if (bus && devfn) {
 863					*bus = drhd->devices[i].bus;
 864					*devfn = drhd->devices[i].devfn;
 865				}
 866				goto out;
 867			}
 868
 869			if (is_downstream_to_pci_bridge(dev, tmp))
 870				goto got_pdev;
 871		}
 872
 873		if (pdev && drhd->include_all) {
 874		got_pdev:
 875			if (bus && devfn) {
 876				*bus = pdev->bus->number;
 877				*devfn = pdev->devfn;
 878			}
 879			goto out;
 880		}
 881	}
 882	iommu = NULL;
 883 out:
 884	if (iommu_is_dummy(iommu, dev))
 885		iommu = NULL;
 886
 887	rcu_read_unlock();
 888
 889	return iommu;
 890}
 891
 892static void domain_flush_cache(struct dmar_domain *domain,
 893			       void *addr, int size)
 894{
 895	if (!domain->iommu_coherency)
 896		clflush_cache_range(addr, size);
 897}
 898
 899static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 900{
 901	struct context_entry *context;
 902	int ret = 0;
 903	unsigned long flags;
 904
 905	spin_lock_irqsave(&iommu->lock, flags);
 906	context = iommu_context_addr(iommu, bus, devfn, 0);
 907	if (context)
 908		ret = context_present(context);
 909	spin_unlock_irqrestore(&iommu->lock, flags);
 910	return ret;
 911}
 912
 913static void free_context_table(struct intel_iommu *iommu)
 914{
 915	int i;
 916	unsigned long flags;
 917	struct context_entry *context;
 918
 919	spin_lock_irqsave(&iommu->lock, flags);
 920	if (!iommu->root_entry) {
 921		goto out;
 922	}
 923	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 924		context = iommu_context_addr(iommu, i, 0, 0);
 925		if (context)
 926			free_pgtable_page(context);
 927
 928		if (!sm_supported(iommu))
 929			continue;
 930
 931		context = iommu_context_addr(iommu, i, 0x80, 0);
 932		if (context)
 933			free_pgtable_page(context);
 934
 935	}
 936	free_pgtable_page(iommu->root_entry);
 937	iommu->root_entry = NULL;
 938out:
 939	spin_unlock_irqrestore(&iommu->lock, flags);
 940}
 941
 942static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 943				      unsigned long pfn, int *target_level)
 944{
 945	struct dma_pte *parent, *pte;
 946	int level = agaw_to_level(domain->agaw);
 947	int offset;
 948
 949	BUG_ON(!domain->pgd);
 950
 951	if (!domain_pfn_supported(domain, pfn))
 952		/* Address beyond IOMMU's addressing capabilities. */
 953		return NULL;
 954
 955	parent = domain->pgd;
 956
 957	while (1) {
 958		void *tmp_page;
 959
 960		offset = pfn_level_offset(pfn, level);
 961		pte = &parent[offset];
 962		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 963			break;
 964		if (level == *target_level)
 965			break;
 966
 967		if (!dma_pte_present(pte)) {
 968			uint64_t pteval;
 969
 970			tmp_page = alloc_pgtable_page(domain->nid);
 971
 972			if (!tmp_page)
 973				return NULL;
 974
 975			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 976			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 977			if (domain_use_first_level(domain))
 978				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
 979			if (cmpxchg64(&pte->val, 0ULL, pteval))
 980				/* Someone else set it while we were thinking; use theirs. */
 981				free_pgtable_page(tmp_page);
 982			else
 983				domain_flush_cache(domain, pte, sizeof(*pte));
 984		}
 985		if (level == 1)
 986			break;
 987
 988		parent = phys_to_virt(dma_pte_addr(pte));
 989		level--;
 990	}
 991
 992	if (!*target_level)
 993		*target_level = level;
 994
 995	return pte;
 996}
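    /*
     * Callers pass *target_level == 0 to walk down to whatever leaf
     * already exists (the walk stops at a superpage or at the first
     * non-present entry), or a specific level to get the PTE at that
     * level, allocating intermediate tables on the way down; on return
     * *target_level holds the level that was actually reached.
     */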
 997
 998/* return the address's pte at a specific level */
 999static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1000					 unsigned long pfn,
1001					 int level, int *large_page)
1002{
1003	struct dma_pte *parent, *pte;
1004	int total = agaw_to_level(domain->agaw);
1005	int offset;
1006
1007	parent = domain->pgd;
1008	while (level <= total) {
1009		offset = pfn_level_offset(pfn, total);
1010		pte = &parent[offset];
1011		if (level == total)
1012			return pte;
1013
1014		if (!dma_pte_present(pte)) {
1015			*large_page = total;
1016			break;
1017		}
1018
1019		if (dma_pte_superpage(pte)) {
1020			*large_page = total;
1021			return pte;
1022		}
1023
1024		parent = phys_to_virt(dma_pte_addr(pte));
1025		total--;
1026	}
1027	return NULL;
1028}
1029
1030/* clear last level pte; a tlb flush should follow */
1031static void dma_pte_clear_range(struct dmar_domain *domain,
1032				unsigned long start_pfn,
1033				unsigned long last_pfn)
1034{
1035	unsigned int large_page;
1036	struct dma_pte *first_pte, *pte;
1037
1038	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1039	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1040	BUG_ON(start_pfn > last_pfn);
1041
1042	/* we don't need lock here; nobody else touches the iova range */
1043	do {
1044		large_page = 1;
1045		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1046		if (!pte) {
1047			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1048			continue;
1049		}
1050		do {
1051			dma_clear_pte(pte);
1052			start_pfn += lvl_to_nr_pages(large_page);
1053			pte++;
1054		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1055
1056		domain_flush_cache(domain, first_pte,
1057				   (void *)pte - (void *)first_pte);
1058
1059	} while (start_pfn && start_pfn <= last_pfn);
1060}
1061
1062static void dma_pte_free_level(struct dmar_domain *domain, int level,
1063			       int retain_level, struct dma_pte *pte,
1064			       unsigned long pfn, unsigned long start_pfn,
1065			       unsigned long last_pfn)
1066{
1067	pfn = max(start_pfn, pfn);
1068	pte = &pte[pfn_level_offset(pfn, level)];
1069
1070	do {
1071		unsigned long level_pfn;
1072		struct dma_pte *level_pte;
1073
1074		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1075			goto next;
1076
1077		level_pfn = pfn & level_mask(level);
1078		level_pte = phys_to_virt(dma_pte_addr(pte));
1079
1080		if (level > 2) {
1081			dma_pte_free_level(domain, level - 1, retain_level,
1082					   level_pte, level_pfn, start_pfn,
1083					   last_pfn);
1084		}
1085
1086		/*
1087		 * Free the page table if we're below the level we want to
1088		 * retain and the range covers the entire table.
1089		 */
1090		if (level < retain_level && !(start_pfn > level_pfn ||
1091		      last_pfn < level_pfn + level_size(level) - 1)) {
1092			dma_clear_pte(pte);
1093			domain_flush_cache(domain, pte, sizeof(*pte));
1094			free_pgtable_page(level_pte);
1095		}
1096next:
1097		pfn += level_size(level);
1098	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1099}
1100
1101/*
1102 * clear last level (leaf) ptes and free page table pages below the
1103 * level we wish to keep intact.
1104 */
1105static void dma_pte_free_pagetable(struct dmar_domain *domain,
1106				   unsigned long start_pfn,
1107				   unsigned long last_pfn,
1108				   int retain_level)
1109{
1110	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112	BUG_ON(start_pfn > last_pfn);
1113
1114	dma_pte_clear_range(domain, start_pfn, last_pfn);
1115
1116	/* We don't need lock here; nobody else touches the iova range */
1117	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1118			   domain->pgd, 0, start_pfn, last_pfn);
1119
1120	/* free pgd */
1121	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1122		free_pgtable_page(domain->pgd);
1123		domain->pgd = NULL;
1124	}
1125}
1126
1127/* When a page at a given level is being unlinked from its parent, we don't
1128   need to *modify* it at all. All we need to do is make a list of all the
1129   pages which can be freed just as soon as we've flushed the IOTLB and we
1130   know the hardware page-walk will no longer touch them.
1131   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1132   be freed. */
1133static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1134					    int level, struct dma_pte *pte,
1135					    struct page *freelist)
1136{
1137	struct page *pg;
1138
1139	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1140	pg->freelist = freelist;
1141	freelist = pg;
1142
1143	if (level == 1)
1144		return freelist;
1145
1146	pte = page_address(pg);
1147	do {
1148		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1149			freelist = dma_pte_list_pagetables(domain, level - 1,
1150							   pte, freelist);
1151		pte++;
1152	} while (!first_pte_in_page(pte));
1153
1154	return freelist;
1155}
1156
1157static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1158					struct dma_pte *pte, unsigned long pfn,
1159					unsigned long start_pfn,
1160					unsigned long last_pfn,
1161					struct page *freelist)
1162{
1163	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1164
1165	pfn = max(start_pfn, pfn);
1166	pte = &pte[pfn_level_offset(pfn, level)];
1167
1168	do {
1169		unsigned long level_pfn;
1170
1171		if (!dma_pte_present(pte))
1172			goto next;
1173
1174		level_pfn = pfn & level_mask(level);
1175
1176		/* If range covers entire pagetable, free it */
1177		if (start_pfn <= level_pfn &&
1178		    last_pfn >= level_pfn + level_size(level) - 1) {
1179			/* These subordinate page tables are going away entirely. Don't
1180			   bother to clear them; we're just going to *free* them. */
1181			if (level > 1 && !dma_pte_superpage(pte))
1182				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1183
1184			dma_clear_pte(pte);
1185			if (!first_pte)
1186				first_pte = pte;
1187			last_pte = pte;
1188		} else if (level > 1) {
1189			/* Recurse down into a level that isn't *entirely* obsolete */
1190			freelist = dma_pte_clear_level(domain, level - 1,
1191						       phys_to_virt(dma_pte_addr(pte)),
1192						       level_pfn, start_pfn, last_pfn,
1193						       freelist);
1194		}
1195next:
1196		pfn += level_size(level);
1197	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1198
1199	if (first_pte)
1200		domain_flush_cache(domain, first_pte,
1201				   (void *)++last_pte - (void *)first_pte);
1202
1203	return freelist;
1204}
1205
1206/* We can't just free the pages because the IOMMU may still be walking
1207   the page tables, and may have cached the intermediate levels. The
1208   pages can only be freed after the IOTLB flush has been done. */
1209static struct page *domain_unmap(struct dmar_domain *domain,
1210				 unsigned long start_pfn,
1211				 unsigned long last_pfn)
1212{
1213	struct page *freelist;
1214
1215	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1216	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1217	BUG_ON(start_pfn > last_pfn);
1218
1219	/* we don't need lock here; nobody else touches the iova range */
1220	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1221				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1222
1223	/* free pgd */
1224	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1225		struct page *pgd_page = virt_to_page(domain->pgd);
1226		pgd_page->freelist = freelist;
1227		freelist = pgd_page;
1228
1229		domain->pgd = NULL;
1230	}
1231
1232	return freelist;
1233}
1234
1235static void dma_free_pagelist(struct page *freelist)
1236{
1237	struct page *pg;
1238
1239	while ((pg = freelist)) {
1240		freelist = pg->freelist;
1241		free_pgtable_page(page_address(pg));
1242	}
1243}
1244
1245static void iova_entry_free(unsigned long data)
1246{
1247	struct page *freelist = (struct page *)data;
1248
1249	dma_free_pagelist(freelist);
1250}
1251
1252/* iommu handling */
1253static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1254{
1255	struct root_entry *root;
1256	unsigned long flags;
1257
1258	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1259	if (!root) {
1260		pr_err("Allocating root entry for %s failed\n",
1261			iommu->name);
1262		return -ENOMEM;
1263	}
1264
1265	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1266
1267	spin_lock_irqsave(&iommu->lock, flags);
1268	iommu->root_entry = root;
1269	spin_unlock_irqrestore(&iommu->lock, flags);
1270
1271	return 0;
1272}
1273
1274static void iommu_set_root_entry(struct intel_iommu *iommu)
1275{
1276	u64 addr;
1277	u32 sts;
1278	unsigned long flag;
1279
1280	addr = virt_to_phys(iommu->root_entry);
1281	if (sm_supported(iommu))
1282		addr |= DMA_RTADDR_SMT;
1283
1284	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1286
1287	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1288
1289	/* Make sure hardware complete it */
1290	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1291		      readl, (sts & DMA_GSTS_RTPS), sts);
1292
1293	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1294}
1295
1296void iommu_flush_write_buffer(struct intel_iommu *iommu)
1297{
1298	u32 val;
1299	unsigned long flag;
1300
1301	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1302		return;
1303
1304	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1305	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1306
1307	/* Make sure hardware complete it */
1308	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1309		      readl, (!(val & DMA_GSTS_WBFS)), val);
1310
1311	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1312}
1313
1314/* return value determines if we need a write buffer flush */
1315static void __iommu_flush_context(struct intel_iommu *iommu,
1316				  u16 did, u16 source_id, u8 function_mask,
1317				  u64 type)
1318{
1319	u64 val = 0;
1320	unsigned long flag;
1321
1322	switch (type) {
1323	case DMA_CCMD_GLOBAL_INVL:
1324		val = DMA_CCMD_GLOBAL_INVL;
1325		break;
1326	case DMA_CCMD_DOMAIN_INVL:
1327		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1328		break;
1329	case DMA_CCMD_DEVICE_INVL:
1330		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1331			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1332		break;
1333	default:
1334		BUG();
1335	}
1336	val |= DMA_CCMD_ICC;
1337
1338	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1340
1341	/* Make sure hardware complete it */
1342	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1343		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1344
1345	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1346}
1347
1348/* return value determines if we need a write buffer flush */
1349static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1350				u64 addr, unsigned int size_order, u64 type)
1351{
1352	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1353	u64 val = 0, val_iva = 0;
1354	unsigned long flag;
1355
1356	switch (type) {
1357	case DMA_TLB_GLOBAL_FLUSH:
1358		/* global flush doesn't need to set IVA_REG */
1359		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1360		break;
1361	case DMA_TLB_DSI_FLUSH:
1362		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1363		break;
1364	case DMA_TLB_PSI_FLUSH:
1365		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1366		/* IH bit is passed in as part of address */
1367		val_iva = size_order | addr;
1368		break;
1369	default:
1370		BUG();
1371	}
1372	/* Note: set drain read/write */
1373#if 0
1374	/*
1375	 * This is probably just to be extra safe. Looks like we can
1376	 * ignore it without any impact.
1377	 */
1378	if (cap_read_drain(iommu->cap))
1379		val |= DMA_TLB_READ_DRAIN;
1380#endif
1381	if (cap_write_drain(iommu->cap))
1382		val |= DMA_TLB_WRITE_DRAIN;
1383
1384	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385	/* Note: Only uses first TLB reg currently */
1386	if (val_iva)
1387		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1388	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1389
1390	/* Make sure hardware complete it */
1391	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1392		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1393
1394	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1395
1396	/* check IOTLB invalidation granularity */
1397	if (DMA_TLB_IAIG(val) == 0)
1398		pr_err("Flush IOTLB failed\n");
1399	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1400		pr_debug("TLB flush request %Lx, actual %Lx\n",
1401			(unsigned long long)DMA_TLB_IIRG(type),
1402			(unsigned long long)DMA_TLB_IAIG(val));
1403}
1404
1405static struct device_domain_info *
1406iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1407			 u8 bus, u8 devfn)
1408{
1409	struct device_domain_info *info;
1410
1411	assert_spin_locked(&device_domain_lock);
1412
1413	if (!iommu->qi)
1414		return NULL;
1415
1416	list_for_each_entry(info, &domain->devices, link)
1417		if (info->iommu == iommu && info->bus == bus &&
1418		    info->devfn == devfn) {
1419			if (info->ats_supported && info->dev)
1420				return info;
1421			break;
1422		}
1423
1424	return NULL;
1425}
1426
1427static void domain_update_iotlb(struct dmar_domain *domain)
1428{
1429	struct device_domain_info *info;
1430	bool has_iotlb_device = false;
1431
1432	assert_spin_locked(&device_domain_lock);
1433
1434	list_for_each_entry(info, &domain->devices, link) {
1435		struct pci_dev *pdev;
1436
1437		if (!info->dev || !dev_is_pci(info->dev))
1438			continue;
1439
1440		pdev = to_pci_dev(info->dev);
1441		if (pdev->ats_enabled) {
1442			has_iotlb_device = true;
1443			break;
1444		}
1445	}
1446
1447	domain->has_iotlb_device = has_iotlb_device;
1448}
1449
1450static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1451{
1452	struct pci_dev *pdev;
1453
1454	assert_spin_locked(&device_domain_lock);
1455
1456	if (!info || !dev_is_pci(info->dev))
1457		return;
1458
1459	pdev = to_pci_dev(info->dev);
1460	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1461	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1462	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1463	 * reserved, which should be set to 0.
1464	 */
1465	if (!ecap_dit(info->iommu->ecap))
1466		info->pfsid = 0;
1467	else {
1468		struct pci_dev *pf_pdev;
1469
1470		/* pdev will be returned if device is not a vf */
1471		pf_pdev = pci_physfn(pdev);
1472		info->pfsid = pci_dev_id(pf_pdev);
1473	}
1474
1475#ifdef CONFIG_INTEL_IOMMU_SVM
1476	/* The PCIe spec, in its wisdom, declares that the behaviour of
1477	   the device if you enable PASID support after ATS support is
1478	   undefined. So always enable PASID support on devices which
1479	   have it, even if we can't yet know if we're ever going to
1480	   use it. */
1481	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1482		info->pasid_enabled = 1;
1483
1484	if (info->pri_supported &&
1485	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1486	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1487		info->pri_enabled = 1;
1488#endif
1489	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1490	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1491		info->ats_enabled = 1;
1492		domain_update_iotlb(info->domain);
1493		info->ats_qdep = pci_ats_queue_depth(pdev);
1494	}
1495}
1496
1497static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1498{
1499	struct pci_dev *pdev;
1500
1501	assert_spin_locked(&device_domain_lock);
1502
1503	if (!dev_is_pci(info->dev))
1504		return;
1505
1506	pdev = to_pci_dev(info->dev);
1507
1508	if (info->ats_enabled) {
1509		pci_disable_ats(pdev);
1510		info->ats_enabled = 0;
1511		domain_update_iotlb(info->domain);
1512	}
1513#ifdef CONFIG_INTEL_IOMMU_SVM
1514	if (info->pri_enabled) {
1515		pci_disable_pri(pdev);
1516		info->pri_enabled = 0;
1517	}
1518	if (info->pasid_enabled) {
1519		pci_disable_pasid(pdev);
1520		info->pasid_enabled = 0;
1521	}
1522#endif
1523}
1524
1525static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1526				  u64 addr, unsigned mask)
1527{
1528	u16 sid, qdep;
1529	unsigned long flags;
1530	struct device_domain_info *info;
1531
1532	if (!domain->has_iotlb_device)
1533		return;
1534
1535	spin_lock_irqsave(&device_domain_lock, flags);
1536	list_for_each_entry(info, &domain->devices, link) {
1537		if (!info->ats_enabled)
1538			continue;
1539
1540		sid = info->bus << 8 | info->devfn;
1541		qdep = info->ats_qdep;
1542		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1543				qdep, addr, mask);
1544	}
1545	spin_unlock_irqrestore(&device_domain_lock, flags);
1546}
1547
1548static void domain_flush_piotlb(struct intel_iommu *iommu,
1549				struct dmar_domain *domain,
1550				u64 addr, unsigned long npages, bool ih)
1551{
1552	u16 did = domain->iommu_did[iommu->seq_id];
1553
1554	if (domain->default_pasid)
1555		qi_flush_piotlb(iommu, did, domain->default_pasid,
1556				addr, npages, ih);
1557
1558	if (!list_empty(&domain->devices))
1559		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1560}
1561
1562static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1563				  struct dmar_domain *domain,
1564				  unsigned long pfn, unsigned int pages,
1565				  int ih, int map)
1566{
1567	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1568	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1569	u16 did = domain->iommu_did[iommu->seq_id];
1570
1571	BUG_ON(pages == 0);
1572
1573	if (ih)
1574		ih = 1 << 6;
1575
1576	if (domain_use_first_level(domain)) {
1577		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1578	} else {
1579		/*
1580		 * Fallback to domain selective flush if no PSI support or
1581		 * the size is too big. PSI requires page size to be 2 ^ x,
1582		 * and the base address is naturally aligned to the size.
1583		 */
1584		if (!cap_pgsel_inv(iommu->cap) ||
1585		    mask > cap_max_amask_val(iommu->cap))
1586			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1587							DMA_TLB_DSI_FLUSH);
1588		else
1589			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1590							DMA_TLB_PSI_FLUSH);
1591	}
1592
1593	/*
1594	 * In caching mode, changes of pages from non-present to present require
1595	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1596	 */
1597	if (!cap_caching_mode(iommu->cap) || !map)
1598		iommu_flush_dev_iotlb(domain, addr, mask);
1599}
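    /*
     * The address-mask value rounds the request up to a power-of-two
     * number of pages: a 9-page flush, for instance, becomes mask = 4
     * and invalidates a naturally aligned 16-page (64KiB) region.
     * Requests whose mask exceeds cap_max_amask_val fall back to the
     * domain-selective flush above.
     */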
1600
1601/* Notification for newly created mappings */
1602static inline void __mapping_notify_one(struct intel_iommu *iommu,
1603					struct dmar_domain *domain,
1604					unsigned long pfn, unsigned int pages)
1605{
1606	/*
1607	 * It's a non-present to present mapping. Only flush if caching mode
1608	 * and second level.
1609	 */
1610	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1611		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1612	else
1613		iommu_flush_write_buffer(iommu);
1614}
1615
1616static void iommu_flush_iova(struct iova_domain *iovad)
1617{
1618	struct dmar_domain *domain;
1619	int idx;
1620
1621	domain = container_of(iovad, struct dmar_domain, iovad);
1622
1623	for_each_domain_iommu(idx, domain) {
1624		struct intel_iommu *iommu = g_iommus[idx];
1625		u16 did = domain->iommu_did[iommu->seq_id];
1626
1627		if (domain_use_first_level(domain))
1628			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1629		else
1630			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1631						 DMA_TLB_DSI_FLUSH);
1632
1633		if (!cap_caching_mode(iommu->cap))
1634			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1635					      0, MAX_AGAW_PFN_WIDTH);
1636	}
1637}
1638
1639static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1640{
1641	u32 pmen;
1642	unsigned long flags;
1643
1644	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1645		return;
1646
1647	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1648	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1649	pmen &= ~DMA_PMEN_EPM;
1650	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1651
1652	/* wait for the protected region status bit to clear */
1653	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1654		readl, !(pmen & DMA_PMEN_PRS), pmen);
1655
1656	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1657}
1658
1659static void iommu_enable_translation(struct intel_iommu *iommu)
1660{
1661	u32 sts;
1662	unsigned long flags;
1663
1664	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1665	iommu->gcmd |= DMA_GCMD_TE;
1666	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1667
1668	/* Make sure hardware complete it */
1669	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1670		      readl, (sts & DMA_GSTS_TES), sts);
1671
1672	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1673}
1674
1675static void iommu_disable_translation(struct intel_iommu *iommu)
1676{
1677	u32 sts;
1678	unsigned long flag;
1679
1680	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1681	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1682		return;
1683
1684	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685	iommu->gcmd &= ~DMA_GCMD_TE;
1686	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1687
1688	/* Make sure hardware complete it */
1689	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690		      readl, (!(sts & DMA_GSTS_TES)), sts);
1691
1692	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1693}
1694
1695static int iommu_init_domains(struct intel_iommu *iommu)
1696{
1697	u32 ndomains, nlongs;
1698	size_t size;
1699
1700	ndomains = cap_ndoms(iommu->cap);
1701	pr_debug("%s: Number of Domains supported <%d>\n",
1702		 iommu->name, ndomains);
1703	nlongs = BITS_TO_LONGS(ndomains);
1704
1705	spin_lock_init(&iommu->lock);
1706
1707	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1708	if (!iommu->domain_ids) {
1709		pr_err("%s: Allocating domain id array failed\n",
1710		       iommu->name);
1711		return -ENOMEM;
1712	}
1713
1714	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1715	iommu->domains = kzalloc(size, GFP_KERNEL);
1716
1717	if (iommu->domains) {
1718		size = 256 * sizeof(struct dmar_domain *);
1719		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1720	}
1721
1722	if (!iommu->domains || !iommu->domains[0]) {
1723		pr_err("%s: Allocating domain array failed\n",
1724		       iommu->name);
1725		kfree(iommu->domain_ids);
1726		kfree(iommu->domains);
1727		iommu->domain_ids = NULL;
1728		iommu->domains    = NULL;
1729		return -ENOMEM;
1730	}
1731
1732	/*
1733	 * If Caching mode is set, then invalid translations are tagged
1734	 * with domain-id 0, hence we need to pre-allocate it. We also
1735	 * use domain-id 0 as a marker for non-allocated domain-id, so
1736	 * make sure it is not used for a real domain.
1737	 */
1738	set_bit(0, iommu->domain_ids);
1739
1740	/*
1741	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1742	 * entry for first-level or pass-through translation modes should
1743	 * be programmed with a domain id different from those used for
1744	 * second-level or nested translation. We reserve a domain id for
1745	 * this purpose.
1746	 */
1747	if (sm_supported(iommu))
1748		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1749
1750	return 0;
1751}
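    /*
     * iommu->domains is the first level of the two-level domain pointer
     * array used by get_iommu_domain()/set_iommu_domain(): one slot per
     * 256 domain ids, with only slot 0 populated up front and the rest
     * allocated on demand. With cap_ndoms() == 65536, for example, 256
     * first-level slots are allocated here.
     */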
1752
1753static void disable_dmar_iommu(struct intel_iommu *iommu)
1754{
1755	struct device_domain_info *info, *tmp;
1756	unsigned long flags;
1757
1758	if (!iommu->domains || !iommu->domain_ids)
1759		return;
1760
1761	spin_lock_irqsave(&device_domain_lock, flags);
1762	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1763		if (info->iommu != iommu)
1764			continue;
1765
1766		if (!info->dev || !info->domain)
1767			continue;
1768
1769		__dmar_remove_one_dev_info(info);
1770	}
1771	spin_unlock_irqrestore(&device_domain_lock, flags);
1772
1773	if (iommu->gcmd & DMA_GCMD_TE)
1774		iommu_disable_translation(iommu);
1775}
1776
1777static void free_dmar_iommu(struct intel_iommu *iommu)
1778{
1779	if ((iommu->domains) && (iommu->domain_ids)) {
1780		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1781		int i;
1782
1783		for (i = 0; i < elems; i++)
1784			kfree(iommu->domains[i]);
1785		kfree(iommu->domains);
1786		kfree(iommu->domain_ids);
1787		iommu->domains = NULL;
1788		iommu->domain_ids = NULL;
1789	}
1790
1791	g_iommus[iommu->seq_id] = NULL;
1792
1793	/* free context mapping */
1794	free_context_table(iommu);
1795
1796#ifdef CONFIG_INTEL_IOMMU_SVM
1797	if (pasid_supported(iommu)) {
1798		if (ecap_prs(iommu->ecap))
1799			intel_svm_finish_prq(iommu);
1800	}
1801	if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1802		ioasid_unregister_allocator(&iommu->pasid_allocator);
1803
1804#endif
1805}
1806
1807/*
1808 * Check and return whether first level is used by default for
1809 * DMA translation.
1810 */
1811static bool first_level_by_default(void)
1812{
1813	struct dmar_drhd_unit *drhd;
1814	struct intel_iommu *iommu;
1815	static int first_level_support = -1;
1816
1817	if (likely(first_level_support != -1))
1818		return first_level_support;
1819
1820	first_level_support = 1;
1821
1822	rcu_read_lock();
1823	for_each_active_iommu(iommu, drhd) {
1824		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1825			first_level_support = 0;
1826			break;
1827		}
1828	}
1829	rcu_read_unlock();
1830
1831	return first_level_support;
1832}
1833
1834static struct dmar_domain *alloc_domain(int flags)
1835{
1836	struct dmar_domain *domain;
1837
1838	domain = alloc_domain_mem();
1839	if (!domain)
1840		return NULL;
1841
1842	memset(domain, 0, sizeof(*domain));
1843	domain->nid = NUMA_NO_NODE;
1844	domain->flags = flags;
1845	if (first_level_by_default())
1846		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1847	domain->has_iotlb_device = false;
1848	INIT_LIST_HEAD(&domain->devices);
1849
1850	return domain;
1851}
1852
1853/* Must be called with iommu->lock */
1854static int domain_attach_iommu(struct dmar_domain *domain,
1855			       struct intel_iommu *iommu)
1856{
1857	unsigned long ndomains;
1858	int num;
1859
1860	assert_spin_locked(&device_domain_lock);
1861	assert_spin_locked(&iommu->lock);
1862
1863	domain->iommu_refcnt[iommu->seq_id] += 1;
1864	domain->iommu_count += 1;
1865	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1866		ndomains = cap_ndoms(iommu->cap);
1867		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1868
1869		if (num >= ndomains) {
1870			pr_err("%s: No free domain ids\n", iommu->name);
1871			domain->iommu_refcnt[iommu->seq_id] -= 1;
1872			domain->iommu_count -= 1;
1873			return -ENOSPC;
1874		}
1875
1876		set_bit(num, iommu->domain_ids);
1877		set_iommu_domain(iommu, num, domain);
1878
1879		domain->iommu_did[iommu->seq_id] = num;
1880		domain->nid			 = iommu->node;
1881
1882		domain_update_iommu_cap(domain);
1883	}
1884
1885	return 0;
1886}
1887
1888static int domain_detach_iommu(struct dmar_domain *domain,
1889			       struct intel_iommu *iommu)
1890{
1891	int num, count;
1892
1893	assert_spin_locked(&device_domain_lock);
1894	assert_spin_locked(&iommu->lock);
1895
1896	domain->iommu_refcnt[iommu->seq_id] -= 1;
1897	count = --domain->iommu_count;
1898	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1899		num = domain->iommu_did[iommu->seq_id];
1900		clear_bit(num, iommu->domain_ids);
1901		set_iommu_domain(iommu, num, NULL);
1902
1903		domain_update_iommu_cap(domain);
1904		domain->iommu_did[iommu->seq_id] = 0;
1905	}
1906
1907	return count;
1908}
1909
1910static struct iova_domain reserved_iova_list;
1911static struct lock_class_key reserved_rbtree_key;
1912
1913static int dmar_init_reserved_ranges(void)
1914{
1915	struct pci_dev *pdev = NULL;
1916	struct iova *iova;
1917	int i;
1918
1919	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1920
1921	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1922		&reserved_rbtree_key);
1923
1924	/* IOAPIC ranges shouldn't be accessed by DMA */
1925	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1926		IOVA_PFN(IOAPIC_RANGE_END));
1927	if (!iova) {
1928		pr_err("Reserve IOAPIC range failed\n");
1929		return -ENODEV;
1930	}
1931
1932	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1933	for_each_pci_dev(pdev) {
1934		struct resource *r;
1935
1936		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1937			r = &pdev->resource[i];
1938			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1939				continue;
1940			iova = reserve_iova(&reserved_iova_list,
1941					    IOVA_PFN(r->start),
1942					    IOVA_PFN(r->end));
1943			if (!iova) {
1944				pci_err(pdev, "Reserve iova for %pR failed\n", r);
1945				return -ENODEV;
1946			}
1947		}
1948	}
1949	return 0;
1950}
1951
1952static inline int guestwidth_to_adjustwidth(int gaw)
1953{
1954	int agaw;
1955	int r = (gaw - 12) % 9;
1956
1957	if (r == 0)
1958		agaw = gaw;
1959	else
1960		agaw = gaw + 9 - r;
1961	if (agaw > 64)
1962		agaw = 64;
1963	return agaw;
1964}
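    /*
     * This rounds the guest address width up so that (gaw - 12) is a
     * multiple of the 9-bit stride, capped at 64. For example, gaw 48 is
     * returned unchanged, while gaw 40 is rounded up to 48.
     */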
1965
1966static void domain_exit(struct dmar_domain *domain)
1967{
1968
1969	/* Remove associated devices and clear attached or cached domains */
1970	domain_remove_dev_info(domain);
1971
1972	/* destroy iovas */
1973	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1974		put_iova_domain(&domain->iovad);
1975
1976	if (domain->pgd) {
1977		struct page *freelist;
1978
1979		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1980		dma_free_pagelist(freelist);
1981	}
1982
1983	free_domain_mem(domain);
1984}
1985
1986/*
1987 * Get the PASID directory size for scalable mode context entry.
1988 * Value of X in the PDTS field of a scalable mode context entry
1989 * indicates PASID directory with 2^(X + 7) entries.
1990 */
1991static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1992{
1993	int pds, max_pde;
1994
1995	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1996	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1997	if (pds < 7)
1998		return 0;
1999
2000	return pds - 7;
2001}
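    /*
     * For example, if max_pde works out to 1 << 14, find_first_bit()
     * returns 14 and the function returns 7, encoding a PASID directory
     * with 2^(7 + 7) = 16384 entries; anything smaller than 2^7 entries
     * is clamped to the minimum encoding of 0.
     */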
2002
2003/*
2004 * Set the RID_PASID field of a scalable mode context entry. The
2005 * IOMMU hardware will use the PASID value set in this field for
2006 * DMA translations of DMA requests without PASID.
2007 */
2008static inline void
2009context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2010{
2011	context->hi |= pasid & ((1 << 20) - 1);
2012}
2013
2014/*
2015 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2016 * entry.
2017 */
2018static inline void context_set_sm_dte(struct context_entry *context)
2019{
2020	context->lo |= (1 << 2);
2021}
2022
2023/*
2024 * Set the PRE(Page Request Enable) field of a scalable mode context
2025 * entry.
2026 */
2027static inline void context_set_sm_pre(struct context_entry *context)
2028{
2029	context->lo |= (1 << 4);
2030}
2031
2032/* Convert value to context PASID directory size field coding. */
2033#define context_pdts(pds)	(((pds) & 0x7) << 9)
2034
2035static int domain_context_mapping_one(struct dmar_domain *domain,
2036				      struct intel_iommu *iommu,
2037				      struct pasid_table *table,
2038				      u8 bus, u8 devfn)
2039{
2040	u16 did = domain->iommu_did[iommu->seq_id];
2041	int translation = CONTEXT_TT_MULTI_LEVEL;
2042	struct device_domain_info *info = NULL;
2043	struct context_entry *context;
2044	unsigned long flags;
2045	int ret;
2046
2047	WARN_ON(did == 0);
2048
2049	if (hw_pass_through && domain_type_is_si(domain))
2050		translation = CONTEXT_TT_PASS_THROUGH;
2051
2052	pr_debug("Set context mapping for %02x:%02x.%d\n",
2053		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2054
2055	BUG_ON(!domain->pgd);
2056
2057	spin_lock_irqsave(&device_domain_lock, flags);
2058	spin_lock(&iommu->lock);
2059
2060	ret = -ENOMEM;
2061	context = iommu_context_addr(iommu, bus, devfn, 1);
2062	if (!context)
2063		goto out_unlock;
2064
2065	ret = 0;
2066	if (context_present(context))
2067		goto out_unlock;
2068
2069	/*
2070	 * For kdump cases, old valid entries may be cached due to the
2071	 * in-flight DMA and copied pgtable, but there is no unmapping
2072	 * behaviour for them, thus we need an explicit cache flush for
2073	 * the newly-mapped device. For kdump, at this point, the device
2074	 * is supposed to finish reset at its driver probe stage, so no
2075	 * in-flight DMA will exist, and we don't need to worry anymore
2076	 * hereafter.
2077	 */
2078	if (context_copied(context)) {
2079		u16 did_old = context_domain_id(context);
2080
2081		if (did_old < cap_ndoms(iommu->cap)) {
2082			iommu->flush.flush_context(iommu, did_old,
2083						   (((u16)bus) << 8) | devfn,
2084						   DMA_CCMD_MASK_NOBIT,
2085						   DMA_CCMD_DEVICE_INVL);
2086			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2087						 DMA_TLB_DSI_FLUSH);
2088		}
2089	}
2090
2091	context_clear_entry(context);
2092
2093	if (sm_supported(iommu)) {
2094		unsigned long pds;
2095
2096		WARN_ON(!table);
2097
2098		/* Setup the PASID DIR pointer: */
2099		pds = context_get_sm_pds(table);
2100		context->lo = (u64)virt_to_phys(table->table) |
2101				context_pdts(pds);
2102
2103		/* Setup the RID_PASID field: */
2104		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2105
2106		/*
2107		 * Setup the Device-TLB enable bit and Page request
2108		 * Enable bit:
2109		 */
2110		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2111		if (info && info->ats_supported)
2112			context_set_sm_dte(context);
2113		if (info && info->pri_supported)
2114			context_set_sm_pre(context);
2115	} else {
2116		struct dma_pte *pgd = domain->pgd;
2117		int agaw;
2118
2119		context_set_domain_id(context, did);
2120
2121		if (translation != CONTEXT_TT_PASS_THROUGH) {
2122			/*
2123			 * Skip top levels of page tables for iommu which has
2124			 * less agaw than default. Unnecessary for PT mode.
2125			 */
2126			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2127				ret = -ENOMEM;
2128				pgd = phys_to_virt(dma_pte_addr(pgd));
2129				if (!dma_pte_present(pgd))
2130					goto out_unlock;
2131			}
2132
2133			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2134			if (info && info->ats_supported)
2135				translation = CONTEXT_TT_DEV_IOTLB;
2136			else
2137				translation = CONTEXT_TT_MULTI_LEVEL;
2138
2139			context_set_address_root(context, virt_to_phys(pgd));
2140			context_set_address_width(context, agaw);
2141		} else {
2142			/*
2143			 * In pass through mode, AW must be programmed to
2144			 * indicate the largest AGAW value supported by
2145			 * hardware. And ASR is ignored by hardware.
2146			 */
2147			context_set_address_width(context, iommu->msagaw);
2148		}
2149
2150		context_set_translation_type(context, translation);
2151	}
2152
2153	context_set_fault_enable(context);
2154	context_set_present(context);
2155	if (!ecap_coherent(iommu->ecap))
2156		clflush_cache_range(context, sizeof(*context));
2157
2158	/*
2159	 * It's a non-present to present mapping. If hardware doesn't cache
2160	 * non-present entries we only need to flush the write-buffer. If it
2161	 * _does_ cache non-present entries, then it does so in the special
2162	 * domain #0, which we have to flush:
2163	 */
2164	if (cap_caching_mode(iommu->cap)) {
2165		iommu->flush.flush_context(iommu, 0,
2166					   (((u16)bus) << 8) | devfn,
2167					   DMA_CCMD_MASK_NOBIT,
2168					   DMA_CCMD_DEVICE_INVL);
2169		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2170	} else {
2171		iommu_flush_write_buffer(iommu);
2172	}
2173	iommu_enable_dev_iotlb(info);
2174
2175	ret = 0;
2176
2177out_unlock:
2178	spin_unlock(&iommu->lock);
2179	spin_unlock_irqrestore(&device_domain_lock, flags);
2180
2181	return ret;
2182}
2183
2184struct domain_context_mapping_data {
2185	struct dmar_domain *domain;
2186	struct intel_iommu *iommu;
2187	struct pasid_table *table;
2188};
2189
2190static int domain_context_mapping_cb(struct pci_dev *pdev,
2191				     u16 alias, void *opaque)
2192{
2193	struct domain_context_mapping_data *data = opaque;
2194
2195	return domain_context_mapping_one(data->domain, data->iommu,
2196					  data->table, PCI_BUS_NUM(alias),
2197					  alias & 0xff);
2198}
2199
2200static int
2201domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2202{
2203	struct domain_context_mapping_data data;
2204	struct pasid_table *table;
2205	struct intel_iommu *iommu;
2206	u8 bus, devfn;
2207
2208	iommu = device_to_iommu(dev, &bus, &devfn);
2209	if (!iommu)
2210		return -ENODEV;
2211
2212	table = intel_pasid_get_table(dev);
2213
2214	if (!dev_is_pci(dev))
2215		return domain_context_mapping_one(domain, iommu, table,
2216						  bus, devfn);
2217
2218	data.domain = domain;
2219	data.iommu = iommu;
2220	data.table = table;
2221
2222	return pci_for_each_dma_alias(to_pci_dev(dev),
2223				      &domain_context_mapping_cb, &data);
2224}
2225
2226static int domain_context_mapped_cb(struct pci_dev *pdev,
2227				    u16 alias, void *opaque)
2228{
2229	struct intel_iommu *iommu = opaque;
2230
2231	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2232}
2233
2234static int domain_context_mapped(struct device *dev)
2235{
2236	struct intel_iommu *iommu;
2237	u8 bus, devfn;
2238
2239	iommu = device_to_iommu(dev, &bus, &devfn);
2240	if (!iommu)
2241		return -ENODEV;
2242
2243	if (!dev_is_pci(dev))
2244		return device_context_mapped(iommu, bus, devfn);
2245
2246	return !pci_for_each_dma_alias(to_pci_dev(dev),
2247				       domain_context_mapped_cb, iommu);
2248}
2249
2250/* Returns the number of VTD pages, but aligned to the MM page size */
2251static inline unsigned long aligned_nrpages(unsigned long host_addr,
2252					    size_t size)
2253{
2254	host_addr &= ~PAGE_MASK;
2255	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2256}
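    /*
     * For example, with 4KiB pages a buffer starting at offset 0x234
     * within a page and spanning 0x1000 bytes covers
     * PAGE_ALIGN(0x1234) >> 12 == 2 VT-d pages, even though its raw
     * length is only one page.
     */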
2257
2258/* Return largest possible superpage level for a given mapping */
2259static inline int hardware_largepage_caps(struct dmar_domain *domain,
2260					  unsigned long iov_pfn,
2261					  unsigned long phy_pfn,
2262					  unsigned long pages)
2263{
2264	int support, level = 1;
2265	unsigned long pfnmerge;
2266
2267	support = domain->iommu_superpage;
2268
2269	/* To use a large page, the virtual *and* physical addresses
2270	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2271	   of them will mean we have to use smaller pages. So just
2272	   merge them and check both at once. */
2273	pfnmerge = iov_pfn | phy_pfn;
2274
2275	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2276		pages >>= VTD_STRIDE_SHIFT;
2277		if (!pages)
2278			break;
2279		pfnmerge >>= VTD_STRIDE_SHIFT;
2280		level++;
2281		support--;
2282	}
2283	return level;
2284}
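    /*
     * For example, when the domain supports superpages and both iov_pfn
     * and phy_pfn have their low nine bits clear (i.e. are 2MiB-aligned)
     * with at least 512 pages left to map, at least level 2 is returned
     * and a 2MiB (or larger) superpage can be used; any misalignment or
     * a shorter remainder drops the mapping back to 4KiB pages (level 1).
     */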
2285
2286static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2287			    struct scatterlist *sg, unsigned long phys_pfn,
2288			    unsigned long nr_pages, int prot)
2289{
2290	struct dma_pte *first_pte = NULL, *pte = NULL;
2291	phys_addr_t pteval;
2292	unsigned long sg_res = 0;
2293	unsigned int largepage_lvl = 0;
2294	unsigned long lvl_pages = 0;
2295	u64 attr;
2296
2297	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2298
2299	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2300		return -EINVAL;
2301
2302	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2303	if (domain_use_first_level(domain))
2304		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2305
2306	if (!sg) {
2307		sg_res = nr_pages;
2308		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2309	}
2310
2311	while (nr_pages > 0) {
2312		uint64_t tmp;
2313
2314		if (!sg_res) {
2315			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2316
2317			sg_res = aligned_nrpages(sg->offset, sg->length);
2318			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2319			sg->dma_length = sg->length;
2320			pteval = (sg_phys(sg) - pgoff) | attr;
2321			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2322		}
2323
2324		if (!pte) {
2325			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2326
2327			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2328			if (!pte)
2329				return -ENOMEM;
2330			/* It is a large page */
2331			if (largepage_lvl > 1) {
2332				unsigned long nr_superpages, end_pfn;
2333
2334				pteval |= DMA_PTE_LARGE_PAGE;
2335				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2336
2337				nr_superpages = sg_res / lvl_pages;
2338				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2339
2340				/*
2341				 * Ensure that old small page tables are
2342				 * removed to make room for superpage(s).
2343				 * We're adding new large pages, so make sure
2344				 * we don't remove their parent tables.
2345				 */
2346				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2347						       largepage_lvl + 1);
2348			} else {
2349				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2350			}
2351
2352		}
2353		/* We don't need lock here, nobody else
2354		 * touches the iova range
2355		 */
2356		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2357		if (tmp) {
2358			static int dumps = 5;
2359			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2360				iov_pfn, tmp, (unsigned long long)pteval);
2361			if (dumps) {
2362				dumps--;
2363				debug_dma_dump_mappings(NULL);
2364			}
2365			WARN_ON(1);
2366		}
2367
2368		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2369
2370		BUG_ON(nr_pages < lvl_pages);
2371		BUG_ON(sg_res < lvl_pages);
2372
2373		nr_pages -= lvl_pages;
2374		iov_pfn += lvl_pages;
2375		phys_pfn += lvl_pages;
2376		pteval += lvl_pages * VTD_PAGE_SIZE;
2377		sg_res -= lvl_pages;
2378
2379		/* If the next PTE would be the first in a new page, then we
2380		   need to flush the cache on the entries we've just written.
2381		   And then we'll need to recalculate 'pte', so clear it and
2382		   let it get set again in the if (!pte) block above.
2383
2384		   If we're done (!nr_pages) we need to flush the cache too.
2385
2386		   Also if we've been setting superpages, we may need to
2387		   recalculate 'pte' and switch back to smaller pages for the
2388		   end of the mapping, if the trailing size is not enough to
2389		   use another superpage (i.e. sg_res < lvl_pages). */
2390		pte++;
2391		if (!nr_pages || first_pte_in_page(pte) ||
2392		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2393			domain_flush_cache(domain, first_pte,
2394					   (void *)pte - (void *)first_pte);
2395			pte = NULL;
2396		}
2397
2398		if (!sg_res && nr_pages)
2399			sg = sg_next(sg);
2400	}
2401	return 0;
2402}
2403
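/*
 * Wrapper around __domain_mapping(): perform the actual page-table
 * update first, then notify every IOMMU attached to the domain about
 * the newly mapped range.
 */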
2404static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2405			  struct scatterlist *sg, unsigned long phys_pfn,
2406			  unsigned long nr_pages, int prot)
2407{
2408	int iommu_id, ret;
2409	struct intel_iommu *iommu;
2410
2411	/* Do the real mapping first */
2412	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2413	if (ret)
2414		return ret;
2415
2416	for_each_domain_iommu(iommu_id, domain) {
2417		iommu = g_iommus[iommu_id];
2418		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2419	}
2420
2421	return 0;
2422}
2423
2424static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2425				    struct scatterlist *sg, unsigned long nr_pages,
2426				    int prot)
2427{
2428	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2429}
2430
2431static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2432				     unsigned long phys_pfn, unsigned long nr_pages,
2433				     int prot)
2434{
2435	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2436}
2437
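/*
 * Clear the context entry for one device (bus/devfn) and invalidate
 * the context-cache and IOTLB entries tagged with its old domain ID.
 */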
2438static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2439{
2440	unsigned long flags;
2441	struct context_entry *context;
2442	u16 did_old;
2443
2444	if (!iommu)
2445		return;
2446
2447	spin_lock_irqsave(&iommu->lock, flags);
2448	context = iommu_context_addr(iommu, bus, devfn, 0);
2449	if (!context) {
2450		spin_unlock_irqrestore(&iommu->lock, flags);
2451		return;
2452	}
2453	did_old = context_domain_id(context);
2454	context_clear_entry(context);
2455	__iommu_flush_cache(iommu, context, sizeof(*context));
2456	spin_unlock_irqrestore(&iommu->lock, flags);
2457	iommu->flush.flush_context(iommu,
2458				   did_old,
2459				   (((u16)bus) << 8) | devfn,
2460				   DMA_CCMD_MASK_NOBIT,
2461				   DMA_CCMD_DEVICE_INVL);
2462	iommu->flush.flush_iotlb(iommu,
2463				 did_old,
2464				 0,
2465				 0,
2466				 DMA_TLB_DSI_FLUSH);
2467}
2468
2469static inline void unlink_domain_info(struct device_domain_info *info)
2470{
2471	assert_spin_locked(&device_domain_lock);
2472	list_del(&info->link);
2473	list_del(&info->global);
2474	if (info->dev)
2475		dev_iommu_priv_set(info->dev, NULL);
2476}
2477
2478static void domain_remove_dev_info(struct dmar_domain *domain)
2479{
2480	struct device_domain_info *info, *tmp;
2481	unsigned long flags;
2482
2483	spin_lock_irqsave(&device_domain_lock, flags);
2484	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2485		__dmar_remove_one_dev_info(info);
2486	spin_unlock_irqrestore(&device_domain_lock, flags);
2487}
2488
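/*
 * Look up the dmar_domain a device is currently attached to. Returns
 * NULL if the device has a deferred attachment pending or no domain
 * at all.
 */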
2489struct dmar_domain *find_domain(struct device *dev)
2490{
2491	struct device_domain_info *info;
2492
2493	if (unlikely(attach_deferred(dev)))
2494		return NULL;
2495
2496	/* No lock here, assumes no domain exit in normal case */
2497	info = get_domain_info(dev);
2498	if (likely(info))
2499		return info->domain;
2500
2501	return NULL;
2502}
2503
2504static void do_deferred_attach(struct device *dev)
2505{
2506	struct iommu_domain *domain;
2507
2508	dev_iommu_priv_set(dev, NULL);
2509	domain = iommu_get_domain_for_dev(dev);
2510	if (domain)
2511		intel_iommu_attach_device(domain, dev);
2512}
2513
2514static inline struct device_domain_info *
2515dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2516{
2517	struct device_domain_info *info;
2518
2519	list_for_each_entry(info, &device_domain_list, global)
2520		if (info->segment == segment && info->bus == bus &&
2521		    info->devfn == devfn)
2522			return info;
2523
2524	return NULL;
2525}
2526
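/*
 * Set up first-level (scalable-mode) translation for @pasid on @dev,
 * skipping top page-table levels when the IOMMU supports a smaller
 * AGAW than the domain. Only 4- and 5-level tables are accepted.
 */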
2527static int domain_setup_first_level(struct intel_iommu *iommu,
2528				    struct dmar_domain *domain,
2529				    struct device *dev,
2530				    int pasid)
2531{
2532	int flags = PASID_FLAG_SUPERVISOR_MODE;
2533	struct dma_pte *pgd = domain->pgd;
2534	int agaw, level;
2535
2536	/*
2537	 * Skip top levels of page tables for an IOMMU which has
2538	 * less agaw than the default. Unnecessary for PT mode.
2539	 */
2540	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2541		pgd = phys_to_virt(dma_pte_addr(pgd));
2542		if (!dma_pte_present(pgd))
2543			return -ENOMEM;
2544	}
2545
2546	level = agaw_to_level(agaw);
2547	if (level != 4 && level != 5)
2548		return -EINVAL;
2549
2550	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2551
2552	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2553					     domain->iommu_did[iommu->seq_id],
2554					     flags);
2555}
2556
2557static bool dev_is_real_dma_subdevice(struct device *dev)
2558{
2559	return dev && dev_is_pci(dev) &&
2560	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2561}
2562
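/*
 * Allocate and register device_domain_info for @dev and attach the
 * device to @domain on @iommu. If the device already has a domain,
 * that one is returned instead and the caller must free the domain it
 * passed in. In scalable mode this also allocates the PASID table and
 * sets up the RID2PASID entry before mapping the context entry.
 */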
2563static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2564						    int bus, int devfn,
2565						    struct device *dev,
2566						    struct dmar_domain *domain)
2567{
2568	struct dmar_domain *found = NULL;
2569	struct device_domain_info *info;
2570	unsigned long flags;
2571	int ret;
2572
2573	info = alloc_devinfo_mem();
2574	if (!info)
2575		return NULL;
2576
2577	if (!dev_is_real_dma_subdevice(dev)) {
2578		info->bus = bus;
2579		info->devfn = devfn;
2580		info->segment = iommu->segment;
2581	} else {
2582		struct pci_dev *pdev = to_pci_dev(dev);
2583
2584		info->bus = pdev->bus->number;
2585		info->devfn = pdev->devfn;
2586		info->segment = pci_domain_nr(pdev->bus);
2587	}
2588
2589	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2590	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2591	info->ats_qdep = 0;
2592	info->dev = dev;
2593	info->domain = domain;
2594	info->iommu = iommu;
2595	info->pasid_table = NULL;
2596	info->auxd_enabled = 0;
2597	INIT_LIST_HEAD(&info->auxiliary_domains);
2598
2599	if (dev && dev_is_pci(dev)) {
2600		struct pci_dev *pdev = to_pci_dev(info->dev);
2601
2602		if (ecap_dev_iotlb_support(iommu->ecap) &&
2603		    pci_ats_supported(pdev) &&
2604		    dmar_find_matched_atsr_unit(pdev))
2605			info->ats_supported = 1;
2606
2607		if (sm_supported(iommu)) {
2608			if (pasid_supported(iommu)) {
2609				int features = pci_pasid_features(pdev);
2610				if (features >= 0)
2611					info->pasid_supported = features | 1;
2612			}
2613
2614			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2615			    pci_pri_supported(pdev))
2616				info->pri_supported = 1;
2617		}
2618	}
2619
2620	spin_lock_irqsave(&device_domain_lock, flags);
2621	if (dev)
2622		found = find_domain(dev);
2623
2624	if (!found) {
2625		struct device_domain_info *info2;
2626		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2627						       info->devfn);
2628		if (info2) {
2629			found      = info2->domain;
2630			info2->dev = dev;
2631		}
2632	}
2633
2634	if (found) {
2635		spin_unlock_irqrestore(&device_domain_lock, flags);
2636		free_devinfo_mem(info);
2637		/* Caller must free the original domain */
2638		return found;
2639	}
2640
2641	spin_lock(&iommu->lock);
2642	ret = domain_attach_iommu(domain, iommu);
2643	spin_unlock(&iommu->lock);
2644
2645	if (ret) {
2646		spin_unlock_irqrestore(&device_domain_lock, flags);
2647		free_devinfo_mem(info);
2648		return NULL;
2649	}
2650
2651	list_add(&info->link, &domain->devices);
2652	list_add(&info->global, &device_domain_list);
2653	if (dev)
2654		dev_iommu_priv_set(dev, info);
2655	spin_unlock_irqrestore(&device_domain_lock, flags);
2656
2657	/* PASID table is mandatory for a PCI device in scalable mode. */
2658	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2659		ret = intel_pasid_alloc_table(dev);
2660		if (ret) {
2661			dev_err(dev, "PASID table allocation failed\n");
2662			dmar_remove_one_dev_info(dev);
2663			return NULL;
2664		}
2665
2666		/* Set up the PASID entry for requests without PASID: */
2667		spin_lock_irqsave(&iommu->lock, flags);
2668		if (hw_pass_through && domain_type_is_si(domain))
2669			ret = intel_pasid_setup_pass_through(iommu, domain,
2670					dev, PASID_RID2PASID);
2671		else if (domain_use_first_level(domain))
2672			ret = domain_setup_first_level(iommu, domain, dev,
2673					PASID_RID2PASID);
2674		else
2675			ret = intel_pasid_setup_second_level(iommu, domain,
2676					dev, PASID_RID2PASID);
2677		spin_unlock_irqrestore(&iommu->lock, flags);
2678		if (ret) {
2679			dev_err(dev, "Setup RID2PASID failed\n");
2680			dmar_remove_one_dev_info(dev);
2681			return NULL;
2682		}
2683	}
2684
2685	if (dev && domain_context_mapping(domain, dev)) {
2686		dev_err(dev, "Domain context map failed\n");
2687		dmar_remove_one_dev_info(dev);
2688		return NULL;
2689	}
2690
2691	return domain;
2692}
2693
2694static int iommu_domain_identity_map(struct dmar_domain *domain,
2695				     unsigned long first_vpfn,
2696				     unsigned long last_vpfn)
2697{
2698	/*
2699	 * The RMRR range might overlap with the physical memory range;
2700	 * clear it first.
2701	 */
2702	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2703
2704	return __domain_mapping(domain, first_vpfn, NULL,
2705				first_vpfn, last_vpfn - first_vpfn + 1,
2706				DMA_PTE_READ|DMA_PTE_WRITE);
2707}
2708
2709static int md_domain_init(struct dmar_domain *domain, int guest_width);
2710
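/*
 * Build the static identity (si) domain. Unless hardware pass-through
 * is used, every usable memory range and every RMRR is identity
 * mapped so that devices placed in si_domain keep working.
 */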
2711static int __init si_domain_init(int hw)
2712{
2713	struct dmar_rmrr_unit *rmrr;
2714	struct device *dev;
2715	int i, nid, ret;
2716
2717	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2718	if (!si_domain)
2719		return -EFAULT;
2720
2721	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2722		domain_exit(si_domain);
2723		return -EFAULT;
2724	}
2725
2726	if (hw)
2727		return 0;
2728
2729	for_each_online_node(nid) {
2730		unsigned long start_pfn, end_pfn;
2731		int i;
2732
2733		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2734			ret = iommu_domain_identity_map(si_domain,
2735					mm_to_dma_pfn(start_pfn),
2736					mm_to_dma_pfn(end_pfn));
2737			if (ret)
2738				return ret;
2739		}
2740	}
2741
2742	/*
2743	 * Identity map the RMRRs so that devices with RMRRs can also use
2744	 * the si_domain.
2745	 */
2746	for_each_rmrr_units(rmrr) {
2747		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2748					  i, dev) {
2749			unsigned long long start = rmrr->base_address;
2750			unsigned long long end = rmrr->end_address;
2751
2752			if (WARN_ON(end < start ||
2753				    end >> agaw_to_width(si_domain->agaw)))
2754				continue;
2755
2756			ret = iommu_domain_identity_map(si_domain,
2757					mm_to_dma_pfn(start >> PAGE_SHIFT),
2758					mm_to_dma_pfn(end >> PAGE_SHIFT));
2759			if (ret)
2760				return ret;
2761		}
2762	}
2763
2764	return 0;
2765}
2766
2767static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2768{
2769	struct dmar_domain *ndomain;
2770	struct intel_iommu *iommu;
2771	u8 bus, devfn;
2772
2773	iommu = device_to_iommu(dev, &bus, &devfn);
2774	if (!iommu)
2775		return -ENODEV;
2776
2777	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2778	if (ndomain != domain)
2779		return -EBUSY;
2780
2781	return 0;
2782}
2783
2784static bool device_has_rmrr(struct device *dev)
2785{
2786	struct dmar_rmrr_unit *rmrr;
2787	struct device *tmp;
2788	int i;
2789
2790	rcu_read_lock();
2791	for_each_rmrr_units(rmrr) {
2792		/*
2793		 * Return TRUE if this RMRR contains the device that
2794		 * is passed in.
2795		 */
2796		for_each_active_dev_scope(rmrr->devices,
2797					  rmrr->devices_cnt, i, tmp)
2798			if (tmp == dev ||
2799			    is_downstream_to_pci_bridge(dev, tmp)) {
2800				rcu_read_unlock();
2801				return true;
2802			}
2803	}
2804	rcu_read_unlock();
2805	return false;
2806}
2807
2808/**
2809 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2810 * is relaxable (i.e. is allowed not to be enforced under some conditions)
2811 * @dev: device handle
2812 *
2813 * We assume that PCI USB devices with RMRRs have them largely
2814 * for historical reasons and that the RMRR space is not actively used post
2815 * boot.  This exclusion may change if vendors begin to abuse it.
2816 *
2817 * The same exception is made for graphics devices, with the requirement that
2818 * any use of the RMRR regions will be torn down before assigning the device
2819 * to a guest.
2820 *
2821 * Return: true if the RMRR is relaxable, false otherwise
2822 */
2823static bool device_rmrr_is_relaxable(struct device *dev)
2824{
2825	struct pci_dev *pdev;
2826
2827	if (!dev_is_pci(dev))
2828		return false;
2829
2830	pdev = to_pci_dev(dev);
2831	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2832		return true;
2833	else
2834		return false;
2835}
2836
2837/*
2838 * There are a couple cases where we need to restrict the functionality of
2839 * devices associated with RMRRs.  The first is when evaluating a device for
2840 * identity mapping because problems exist when devices are moved in and out
2841 * of domains and their respective RMRR information is lost.  This means that
2842 * a device with associated RMRRs will never be in a "passthrough" domain.
2843 * The second is use of the device through the IOMMU API.  This interface
2844 * expects to have full control of the IOVA space for the device.  We cannot
2845 * satisfy both the requirement that RMRR access is maintained and have an
2846 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2847 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2848 * We therefore prevent devices associated with an RMRR from participating in
2849 * the IOMMU API, which eliminates them from device assignment.
2850 *
2851 * In both cases, devices which have relaxable RMRRs are not concerned by this
2852 * restriction. See device_rmrr_is_relaxable comment.
2853 */
2854static bool device_is_rmrr_locked(struct device *dev)
2855{
2856	if (!device_has_rmrr(dev))
2857		return false;
2858
2859	if (device_rmrr_is_relaxable(dev))
2860		return false;
2861
2862	return true;
2863}
2864
2865/*
2866 * Return the required default domain type for a specific device.
2867 *
2868 * @dev: the device in query
2869 * @startup: true if this is during early boot
2870 *
2871 * Returns:
2872 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2873 *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2874 *  - 0: both identity and dynamic domains work for this device
2875 */
2876static int device_def_domain_type(struct device *dev)
2877{
2878	if (dev_is_pci(dev)) {
2879		struct pci_dev *pdev = to_pci_dev(dev);
2880
2881		/*
2882		 * Prevent any device marked as untrusted from getting
2883		 * placed into the static identity mapping domain.
2884		 */
2885		if (pdev->untrusted)
2886			return IOMMU_DOMAIN_DMA;
2887
2888		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2889			return IOMMU_DOMAIN_IDENTITY;
2890
2891		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2892			return IOMMU_DOMAIN_IDENTITY;
2893	}
2894
2895	return 0;
2896}
2897
2898static void intel_iommu_init_qi(struct intel_iommu *iommu)
2899{
2900	/*
2901	 * Start from a sane IOMMU hardware state.
2902	 * If queued invalidation was already initialized by us
2903	 * (for example, while enabling interrupt remapping), then
2904	 * things are already rolling from a sane state.
2905	 */
2906	if (!iommu->qi) {
2907		/*
2908		 * Clear any previous faults.
2909		 */
2910		dmar_fault(-1, iommu);
2911		/*
2912		 * Disable queued invalidation if supported and already enabled
2913		 * before OS handover.
2914		 */
2915		dmar_disable_qi(iommu);
2916	}
2917
2918	if (dmar_enable_qi(iommu)) {
2919		/*
2920		 * Queued Invalidate not enabled, use Register Based Invalidate
2921		 */
2922		iommu->flush.flush_context = __iommu_flush_context;
2923		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2924		pr_info("%s: Using Register based invalidation\n",
2925			iommu->name);
2926	} else {
2927		iommu->flush.flush_context = qi_flush_context;
2928		iommu->flush.flush_iotlb = qi_flush_iotlb;
2929		pr_info("%s: Using Queued invalidation\n", iommu->name);
2930	}
2931}
2932
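/*
 * Copy one bus worth of context entries from the previous kernel's
 * tables. Extended root entries carry two context tables per bus,
 * hence the doubled indexing when 'ext' is set. Copied entries are
 * marked and have PASIDs disabled, as explained below.
 */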
2933static int copy_context_table(struct intel_iommu *iommu,
2934			      struct root_entry *old_re,
2935			      struct context_entry **tbl,
2936			      int bus, bool ext)
2937{
2938	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2939	struct context_entry *new_ce = NULL, ce;
2940	struct context_entry *old_ce = NULL;
2941	struct root_entry re;
2942	phys_addr_t old_ce_phys;
2943
2944	tbl_idx = ext ? bus * 2 : bus;
2945	memcpy(&re, old_re, sizeof(re));
2946
2947	for (devfn = 0; devfn < 256; devfn++) {
2948		/* First calculate the correct index */
2949		idx = (ext ? devfn * 2 : devfn) % 256;
2950
2951		if (idx == 0) {
2952			/* First save what we may have and clean up */
2953			if (new_ce) {
2954				tbl[tbl_idx] = new_ce;
2955				__iommu_flush_cache(iommu, new_ce,
2956						    VTD_PAGE_SIZE);
2957				pos = 1;
2958			}
2959
2960			if (old_ce)
2961				memunmap(old_ce);
2962
2963			ret = 0;
2964			if (devfn < 0x80)
2965				old_ce_phys = root_entry_lctp(&re);
2966			else
2967				old_ce_phys = root_entry_uctp(&re);
2968
2969			if (!old_ce_phys) {
2970				if (ext && devfn == 0) {
2971					/* No LCTP, try UCTP */
2972					devfn = 0x7f;
2973					continue;
2974				} else {
2975					goto out;
2976				}
2977			}
2978
2979			ret = -ENOMEM;
2980			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2981					MEMREMAP_WB);
2982			if (!old_ce)
2983				goto out;
2984
2985			new_ce = alloc_pgtable_page(iommu->node);
2986			if (!new_ce)
2987				goto out_unmap;
2988
2989			ret = 0;
2990		}
2991
2992		/* Now copy the context entry */
2993		memcpy(&ce, old_ce + idx, sizeof(ce));
2994
2995		if (!__context_present(&ce))
2996			continue;
2997
2998		did = context_domain_id(&ce);
2999		if (did >= 0 && did < cap_ndoms(iommu->cap))
3000			set_bit(did, iommu->domain_ids);
3001
3002		/*
3003		 * We need a marker for copied context entries. This
3004		 * marker needs to work for the old format as well as
3005		 * for extended context entries.
3006		 *
3007		 * Bit 67 of the context entry is used. In the old
3008		 * format this bit is available to software, in the
3009		 * extended format it is the PGE bit, but PGE is ignored
3010		 * by HW if PASIDs are disabled (and thus still
3011		 * available).
3012		 *
3013		 * So disable PASIDs first and then mark the entry
3014		 * copied. This means that we don't copy PASID
3015		 * translations from the old kernel, but this is fine as
3016		 * faults there are not fatal.
3017		 */
3018		context_clear_pasid_enable(&ce);
3019		context_set_copied(&ce);
3020
3021		new_ce[idx] = ce;
3022	}
3023
3024	tbl[tbl_idx + pos] = new_ce;
3025
3026	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3027
3028out_unmap:
3029	memunmap(old_ce);
3030
3031out:
3032	return ret;
3033}
3034
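/*
 * Copy the root/context tables installed by the previous kernel (the
 * kdump case) so that ongoing DMA from the previous kernel keeps
 * hitting valid translations while this kernel takes over.
 */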
3035static int copy_translation_tables(struct intel_iommu *iommu)
3036{
3037	struct context_entry **ctxt_tbls;
3038	struct root_entry *old_rt;
3039	phys_addr_t old_rt_phys;
3040	int ctxt_table_entries;
3041	unsigned long flags;
3042	u64 rtaddr_reg;
3043	int bus, ret;
3044	bool new_ext, ext;
3045
3046	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3047	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3048	new_ext    = !!ecap_ecs(iommu->ecap);
3049
3050	/*
3051	 * but disabling translation would open a window for data
3052	 * but disabling translation means to open a window for data
3053	 * corruption. So bail out and don't copy anything if we would
3054	 * have to change the bit.
3055	 */
3056	if (new_ext != ext)
3057		return -EINVAL;
3058
3059	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3060	if (!old_rt_phys)
3061		return -EINVAL;
3062
3063	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3064	if (!old_rt)
3065		return -ENOMEM;
3066
3067	/* This is too big for the stack - allocate it from slab */
3068	ctxt_table_entries = ext ? 512 : 256;
3069	ret = -ENOMEM;
3070	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3071	if (!ctxt_tbls)
3072		goto out_unmap;
3073
3074	for (bus = 0; bus < 256; bus++) {
3075		ret = copy_context_table(iommu, &old_rt[bus],
3076					 ctxt_tbls, bus, ext);
3077		if (ret) {
3078			pr_err("%s: Failed to copy context table for bus %d\n",
3079				iommu->name, bus);
3080			continue;
3081		}
3082	}
3083
3084	spin_lock_irqsave(&iommu->lock, flags);
3085
3086	/* Context tables are copied, now write them to the root_entry table */
3087	for (bus = 0; bus < 256; bus++) {
3088		int idx = ext ? bus * 2 : bus;
3089		u64 val;
3090
3091		if (ctxt_tbls[idx]) {
3092			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3093			iommu->root_entry[bus].lo = val;
3094		}
3095
3096		if (!ext || !ctxt_tbls[idx + 1])
3097			continue;
3098
3099		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3100		iommu->root_entry[bus].hi = val;
3101	}
3102
3103	spin_unlock_irqrestore(&iommu->lock, flags);
3104
3105	kfree(ctxt_tbls);
3106
3107	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3108
3109	ret = 0;
3110
3111out_unmap:
3112	memunmap(old_rt);
3113
3114	return ret;
3115}
3116
3117#ifdef CONFIG_INTEL_IOMMU_SVM
3118static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3119{
3120	struct intel_iommu *iommu = data;
3121	ioasid_t ioasid;
3122
3123	if (!iommu)
3124		return INVALID_IOASID;
3125	/*
3126	 * The VT-d virtual command interface always uses the full 20-bit
3127	 * PASID range. The host can partition the guest PASID range based
3128	 * on policies, but that is out of the guest's control.
3129	 */
3130	if (min < PASID_MIN || max > intel_pasid_max_id)
3131		return INVALID_IOASID;
3132
3133	if (vcmd_alloc_pasid(iommu, &ioasid))
3134		return INVALID_IOASID;
3135
3136	return ioasid;
3137}
3138
3139static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3140{
3141	struct intel_iommu *iommu = data;
3142
3143	if (!iommu)
3144		return;
3145	/*
3146	 * Sanity checking of the ioasid owner is done at the upper layer,
3147	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
3148	 */
3149	if (ioasid_find(NULL, ioasid, NULL)) {
3150		pr_alert("Cannot free active IOASID %d\n", ioasid);
3151		return;
3152	}
3153	vcmd_free_pasid(iommu, ioasid);
3154}
3155
3156static void register_pasid_allocator(struct intel_iommu *iommu)
3157{
3158	/*
3159	 * If we are running in the host, there is no need for a custom
3160	 * allocator since PASIDs are allocated system-wide by the host.
3161	 */
3162	if (!cap_caching_mode(iommu->cap))
3163		return;
3164
3165	if (!sm_supported(iommu)) {
3166		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3167		return;
3168	}
3169
3170	/*
3171	 * Register a custom PASID allocator if we are running in a guest;
3172	 * guest PASIDs must be obtained via the virtual command interface.
3173	 * There can be multiple vIOMMUs in each guest but only one allocator
3174	 * is active. All vIOMMU allocators will eventually be calling the same
3175	 * host allocator.
3176	 */
3177	if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3178		return;
3179
3180	pr_info("Register custom PASID allocator\n");
3181	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3182	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3183	iommu->pasid_allocator.pdata = (void *)iommu;
3184	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3185		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3186		/*
3187		 * Disable scalable mode on this IOMMU if there
3188		 * is no custom allocator. Mixing SM-capable vIOMMUs
3189		 * and non-SM vIOMMUs is not supported.
3190		 */
3191		intel_iommu_sm = 0;
3192	}
3193}
3194#endif
3195
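/*
 * One-time initialization of every DMAR unit: allocate the global
 * IOMMU array, set up queued invalidation and root/context tables
 * (copying them from the previous kernel when translation is already
 * enabled), build the static identity domain, and finally enable
 * fault and page-request interrupts.
 */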
3196static int __init init_dmars(void)
3197{
3198	struct dmar_drhd_unit *drhd;
3199	struct intel_iommu *iommu;
3200	int ret;
3201
3202	/*
3203	 * for each drhd
3204	 *    allocate root
3205	 *    initialize and program root entry to not present
3206	 * endfor
3207	 */
3208	for_each_drhd_unit(drhd) {
3209		/*
3210		 * No lock needed, as this is only incremented in the single-
3211		 * threaded kernel __init code path; all other accesses are
3212		 * read only.
3213		 */
3214		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3215			g_num_of_iommus++;
3216			continue;
3217		}
3218		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3219	}
3220
3221	/* Preallocate enough resources for IOMMU hot-addition */
3222	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3223		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3224
3225	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3226			GFP_KERNEL);
3227	if (!g_iommus) {
3228		pr_err("Allocating global iommu array failed\n");
3229		ret = -ENOMEM;
3230		goto error;
3231	}
3232
3233	for_each_iommu(iommu, drhd) {
3234		if (drhd->ignored) {
3235			iommu_disable_translation(iommu);
3236			continue;
3237		}
3238
3239		/*
3240		 * Find the max PASID size of all IOMMUs in the system.
3241		 * We need to ensure the system pasid table is no bigger
3242		 * than the smallest supported.
3243		 */
3244		if (pasid_supported(iommu)) {
3245			u32 temp = 2 << ecap_pss(iommu->ecap);
3246
3247			intel_pasid_max_id = min_t(u32, temp,
3248						   intel_pasid_max_id);
3249		}
3250
3251		g_iommus[iommu->seq_id] = iommu;
3252
3253		intel_iommu_init_qi(iommu);
3254
3255		ret = iommu_init_domains(iommu);
3256		if (ret)
3257			goto free_iommu;
3258
3259		init_translation_status(iommu);
3260
3261		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3262			iommu_disable_translation(iommu);
3263			clear_translation_pre_enabled(iommu);
3264			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3265				iommu->name);
3266		}
3267
3268		/*
3269		 * TBD:
3270		 * We could share the same root and context tables
3271		 * among all IOMMUs. Need to split this later.
3272		 */
3273		ret = iommu_alloc_root_entry(iommu);
3274		if (ret)
3275			goto free_iommu;
3276
3277		if (translation_pre_enabled(iommu)) {
3278			pr_info("Translation already enabled - trying to copy translation structures\n");
3279
3280			ret = copy_translation_tables(iommu);
3281			if (ret) {
3282				/*
3283				 * We found the IOMMU with translation
3284				 * enabled - but failed to copy over the
3285				 * old root-entry table. Try to proceed
3286				 * by disabling translation now and
3287				 * allocating a clean root-entry table.
3288				 * This might cause DMAR faults, but
3289				 * probably the dump will still succeed.
3290				 */
3291				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3292				       iommu->name);
3293				iommu_disable_translation(iommu);
3294				clear_translation_pre_enabled(iommu);
3295			} else {
3296				pr_info("Copied translation tables from previous kernel for %s\n",
3297					iommu->name);
3298			}
3299		}
3300
3301		if (!ecap_pass_through(iommu->ecap))
3302			hw_pass_through = 0;
3303		intel_svm_check(iommu);
3304	}
3305
3306	/*
3307	 * Now that QI is enabled on all IOMMUs, set the root entry and flush
3308	 * caches. This is required on some Intel X58 chipsets; otherwise the
3309	 * flush_context function will loop forever and the boot hangs.
3310	 */
3311	for_each_active_iommu(iommu, drhd) {
3312		iommu_flush_write_buffer(iommu);
3313#ifdef CONFIG_INTEL_IOMMU_SVM
3314		register_pasid_allocator(iommu);
3315#endif
3316		iommu_set_root_entry(iommu);
3317		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3318		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3319	}
3320
3321#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3322	dmar_map_gfx = 0;
3323#endif
3324
3325	if (!dmar_map_gfx)
3326		iommu_identity_mapping |= IDENTMAP_GFX;
3327
3328	check_tylersburg_isoch();
3329
3330	ret = si_domain_init(hw_pass_through);
3331	if (ret)
3332		goto free_iommu;
3333
3334	/*
3335	 * for each drhd
3336	 *   enable fault log
3337	 *   global invalidate context cache
3338	 *   global invalidate iotlb
3339	 *   enable translation
3340	 */
3341	for_each_iommu(iommu, drhd) {
3342		if (drhd->ignored) {
3343			/*
3344			 * we always have to disable PMRs or DMA may fail on
3345			 * this device
3346			 */
3347			if (force_on)
3348				iommu_disable_protect_mem_regions(iommu);
3349			continue;
3350		}
3351
3352		iommu_flush_write_buffer(iommu);
3353
3354#ifdef CONFIG_INTEL_IOMMU_SVM
3355		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3356			/*
3357			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3358			 * could cause a lock race condition.
3359			 */
3360			up_write(&dmar_global_lock);
3361			ret = intel_svm_enable_prq(iommu);
3362			down_write(&dmar_global_lock);
3363			if (ret)
3364				goto free_iommu;
3365		}
3366#endif
3367		ret = dmar_set_interrupt(iommu);
3368		if (ret)
3369			goto free_iommu;
3370	}
3371
3372	return 0;
3373
3374free_iommu:
3375	for_each_active_iommu(iommu, drhd) {
3376		disable_dmar_iommu(iommu);
3377		free_dmar_iommu(iommu);
3378	}
3379
3380	kfree(g_iommus);
3381
3382error:
3383	return ret;
3384}
3385
3386/* This takes a number of _MM_ pages, not VTD pages */
3387static unsigned long intel_alloc_iova(struct device *dev,
3388				     struct dmar_domain *domain,
3389				     unsigned long nrpages, uint64_t dma_mask)
3390{
3391	unsigned long iova_pfn;
3392
3393	/*
3394	 * Restrict dma_mask to the width that the iommu can handle.
3395	 * First-level translation restricts the input-address to a
3396	 * canonical address (i.e., address bits 63:N have the same
3397	 * value as address bit [N-1], where N is 48-bits with 4-level
3398	 * paging and 57-bits with 5-level paging). Hence, skip bit
3399	 * [N-1].
3400	 */
3401	if (domain_use_first_level(domain))
3402		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3403				 dma_mask);
3404	else
3405		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3406				 dma_mask);
3407
3408	/* Ensure we reserve the whole size-aligned region */
3409	nrpages = __roundup_pow_of_two(nrpages);
3410
3411	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3412		/*
3413		 * First try to allocate an I/O virtual address in
3414		 * DMA_BIT_MASK(32) and, if that fails, then try allocating
3415		 * from the higher range.
3416		 */
3417		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3418					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3419		if (iova_pfn)
3420			return iova_pfn;
3421	}
3422	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3423				   IOVA_PFN(dma_mask), true);
3424	if (unlikely(!iova_pfn)) {
3425		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3426			     nrpages);
3427		return 0;
3428	}
3429
3430	return iova_pfn;
3431}
3432
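/*
 * Core of the DMA-API map path: allocate an IOVA range covering
 * @paddr/@size, map it in the device's domain with permissions
 * derived from the DMA direction, and return the resulting DMA
 * address.
 */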
3433static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3434				     size_t size, int dir, u64 dma_mask)
3435{
3436	struct dmar_domain *domain;
3437	phys_addr_t start_paddr;
3438	unsigned long iova_pfn;
3439	int prot = 0;
3440	int ret;
3441	struct intel_iommu *iommu;
3442	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3443
3444	BUG_ON(dir == DMA_NONE);
3445
3446	if (unlikely(attach_deferred(dev)))
3447		do_deferred_attach(dev);
3448
3449	domain = find_domain(dev);
3450	if (!domain)
3451		return DMA_MAPPING_ERROR;
3452
3453	iommu = domain_get_iommu(domain);
3454	size = aligned_nrpages(paddr, size);
3455
3456	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3457	if (!iova_pfn)
3458		goto error;
3459
3460	/*
3461	 * Check if DMAR supports zero-length reads on write-only
3462	 * mappings.
3463	 */
3464	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3465			!cap_zlr(iommu->cap))
3466		prot |= DMA_PTE_READ;
3467	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3468		prot |= DMA_PTE_WRITE;
3469	/*
3470	 * paddr - (paddr + size) might span a partial page, so map the whole
3471	 * page.  Note: if two parts of one page are mapped separately, we
3472	 * might end up with two guest addresses mapping to the same host
3473	 * paddr, but this is not a big problem.
3474	 */
3475	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3476				 mm_to_dma_pfn(paddr_pfn), size, prot);
3477	if (ret)
3478		goto error;
3479
3480	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3481	start_paddr += paddr & ~PAGE_MASK;
3482
3483	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3484
3485	return start_paddr;
3486
3487error:
3488	if (iova_pfn)
3489		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3490	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3491		size, (unsigned long long)paddr, dir);
3492	return DMA_MAPPING_ERROR;
3493}
3494
3495static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3496				 unsigned long offset, size_t size,
3497				 enum dma_data_direction dir,
3498				 unsigned long attrs)
3499{
3500	return __intel_map_single(dev, page_to_phys(page) + offset,
3501				  size, dir, *dev->dma_mask);
3502}
3503
3504static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3505				     size_t size, enum dma_data_direction dir,
3506				     unsigned long attrs)
3507{
3508	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3509}
3510
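/*
 * DMA-API unmap helper: tear down the IOVA range backing @dev_addr
 * and either flush the IOTLB synchronously (strict mode, untrusted
 * devices, no flush queue) or defer the flush and IOVA release to the
 * flush queue.
 */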
3511static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3512{
3513	struct dmar_domain *domain;
3514	unsigned long start_pfn, last_pfn;
3515	unsigned long nrpages;
3516	unsigned long iova_pfn;
3517	struct intel_iommu *iommu;
3518	struct page *freelist;
3519	struct pci_dev *pdev = NULL;
3520
3521	domain = find_domain(dev);
3522	BUG_ON(!domain);
3523
3524	iommu = domain_get_iommu(domain);
3525
3526	iova_pfn = IOVA_PFN(dev_addr);
3527
3528	nrpages = aligned_nrpages(dev_addr, size);
3529	start_pfn = mm_to_dma_pfn(iova_pfn);
3530	last_pfn = start_pfn + nrpages - 1;
3531
3532	if (dev_is_pci(dev))
3533		pdev = to_pci_dev(dev);
3534
3535	freelist = domain_unmap(domain, start_pfn, last_pfn);
3536	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3537			!has_iova_flush_queue(&domain->iovad)) {
3538		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3539				      nrpages, !freelist, 0);
3540		/* free iova */
3541		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3542		dma_free_pagelist(freelist);
3543	} else {
3544		queue_iova(&domain->iovad, iova_pfn, nrpages,
3545			   (unsigned long)freelist);
3546		/*
3547		 * Queue up the release of the unmap to save the ~1/6th of the
3548		 * CPU used up by the IOTLB flush operation...
3549		 */
3550	}
3551
3552	trace_unmap_single(dev, dev_addr, size);
3553}
3554
3555static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3556			     size_t size, enum dma_data_direction dir,
3557			     unsigned long attrs)
3558{
3559	intel_unmap(dev, dev_addr, size);
3560}
3561
3562static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3563		size_t size, enum dma_data_direction dir, unsigned long attrs)
3564{
3565	intel_unmap(dev, dev_addr, size);
3566}
3567
3568static void *intel_alloc_coherent(struct device *dev, size_t size,
3569				  dma_addr_t *dma_handle, gfp_t flags,
3570				  unsigned long attrs)
3571{
3572	struct page *page = NULL;
3573	int order;
3574
3575	if (unlikely(attach_deferred(dev)))
3576		do_deferred_attach(dev);
3577
3578	size = PAGE_ALIGN(size);
3579	order = get_order(size);
3580
3581	if (gfpflags_allow_blocking(flags)) {
3582		unsigned int count = size >> PAGE_SHIFT;
3583
3584		page = dma_alloc_from_contiguous(dev, count, order,
3585						 flags & __GFP_NOWARN);
3586	}
3587
3588	if (!page)
3589		page = alloc_pages(flags, order);
3590	if (!page)
3591		return NULL;
3592	memset(page_address(page), 0, size);
3593
3594	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3595					 DMA_BIDIRECTIONAL,
3596					 dev->coherent_dma_mask);
3597	if (*dma_handle != DMA_MAPPING_ERROR)
3598		return page_address(page);
3599	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3600		__free_pages(page, order);
3601
3602	return NULL;
3603}
3604
3605static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3606				dma_addr_t dma_handle, unsigned long attrs)
3607{
3608	int order;
3609	struct page *page = virt_to_page(vaddr);
3610
3611	size = PAGE_ALIGN(size);
3612	order = get_order(size);
3613
3614	intel_unmap(dev, dma_handle, size);
3615	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3616		__free_pages(page, order);
3617}
3618
3619static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3620			   int nelems, enum dma_data_direction dir,
3621			   unsigned long attrs)
3622{
3623	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3624	unsigned long nrpages = 0;
3625	struct scatterlist *sg;
3626	int i;
3627
3628	for_each_sg(sglist, sg, nelems, i) {
3629		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3630	}
3631
3632	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3633
3634	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3635}
3636
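/*
 * Map a scatterlist with a single IOVA allocation that covers all
 * segments; __domain_mapping() fills in the PTEs and writes each
 * segment's dma_address/dma_length as it goes.
 */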
3637static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3638			enum dma_data_direction dir, unsigned long attrs)
3639{
3640	int i;
3641	struct dmar_domain *domain;
3642	size_t size = 0;
3643	int prot = 0;
3644	unsigned long iova_pfn;
3645	int ret;
3646	struct scatterlist *sg;
3647	unsigned long start_vpfn;
3648	struct intel_iommu *iommu;
3649
3650	BUG_ON(dir == DMA_NONE);
3651
3652	if (unlikely(attach_deferred(dev)))
3653		do_deferred_attach(dev);
3654
3655	domain = find_domain(dev);
3656	if (!domain)
3657		return 0;
3658
3659	iommu = domain_get_iommu(domain);
3660
3661	for_each_sg(sglist, sg, nelems, i)
3662		size += aligned_nrpages(sg->offset, sg->length);
3663
3664	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3665				*dev->dma_mask);
3666	if (!iova_pfn) {
3667		sglist->dma_length = 0;
3668		return 0;
3669	}
3670
3671	/*
3672	 * Check if DMAR supports zero-length reads on write-only
3673	 * mappings.
3674	 */
3675	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3676			!cap_zlr(iommu->cap))
3677		prot |= DMA_PTE_READ;
3678	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3679		prot |= DMA_PTE_WRITE;
3680
3681	start_vpfn = mm_to_dma_pfn(iova_pfn);
3682
3683	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3684	if (unlikely(ret)) {
3685		dma_pte_free_pagetable(domain, start_vpfn,
3686				       start_vpfn + size - 1,
3687				       agaw_to_level(domain->agaw) + 1);
3688		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3689		return 0;
3690	}
3691
3692	for_each_sg(sglist, sg, nelems, i)
3693		trace_map_sg(dev, i + 1, nelems, sg);
3694
3695	return nelems;
3696}
3697
3698static u64 intel_get_required_mask(struct device *dev)
3699{
3700	return DMA_BIT_MASK(32);
3701}
3702
3703static const struct dma_map_ops intel_dma_ops = {
3704	.alloc = intel_alloc_coherent,
3705	.free = intel_free_coherent,
3706	.map_sg = intel_map_sg,
3707	.unmap_sg = intel_unmap_sg,
3708	.map_page = intel_map_page,
3709	.unmap_page = intel_unmap_page,
3710	.map_resource = intel_map_resource,
3711	.unmap_resource = intel_unmap_resource,
3712	.dma_supported = dma_direct_supported,
3713	.mmap = dma_common_mmap,
3714	.get_sgtable = dma_common_get_sgtable,
3715	.get_required_mask = intel_get_required_mask,
3716};
3717
3718static void
3719bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3720		   enum dma_data_direction dir, enum dma_sync_target target)
3721{
3722	struct dmar_domain *domain;
3723	phys_addr_t tlb_addr;
3724
3725	domain = find_domain(dev);
3726	if (WARN_ON(!domain))
3727		return;
3728
3729	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3730	if (is_swiotlb_buffer(tlb_addr))
3731		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3732}
3733
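/*
 * Bounce-buffered map path: buffers that are not VTD_PAGE_SIZE
 * aligned are first bounced through swiotlb (with the padding area
 * zeroed) so that no unrelated data shares the mapped page; the
 * possibly-bounced physical address is then mapped like any other.
 */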
3734static dma_addr_t
3735bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3736		  enum dma_data_direction dir, unsigned long attrs,
3737		  u64 dma_mask)
3738{
3739	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3740	struct dmar_domain *domain;
3741	struct intel_iommu *iommu;
3742	unsigned long iova_pfn;
3743	unsigned long nrpages;
3744	phys_addr_t tlb_addr;
3745	int prot = 0;
3746	int ret;
3747
3748	if (unlikely(attach_deferred(dev)))
3749		do_deferred_attach(dev);
3750
3751	domain = find_domain(dev);
3752
3753	if (WARN_ON(dir == DMA_NONE || !domain))
3754		return DMA_MAPPING_ERROR;
3755
3756	iommu = domain_get_iommu(domain);
3757	if (WARN_ON(!iommu))
3758		return DMA_MAPPING_ERROR;
3759
3760	nrpages = aligned_nrpages(0, size);
3761	iova_pfn = intel_alloc_iova(dev, domain,
3762				    dma_to_mm_pfn(nrpages), dma_mask);
3763	if (!iova_pfn)
3764		return DMA_MAPPING_ERROR;
3765
3766	/*
3767	 * Check if DMAR supports zero-length reads on write-only
3768	 * mappings.
3769	 */
3770	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3771			!cap_zlr(iommu->cap))
3772		prot |= DMA_PTE_READ;
3773	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3774		prot |= DMA_PTE_WRITE;
3775
3776	/*
3777	 * If both the physical buffer start address and size are
3778	 * page aligned, we don't need to use a bounce page.
3779	 */
3780	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3781		tlb_addr = swiotlb_tbl_map_single(dev,
3782				__phys_to_dma(dev, io_tlb_start),
3783				paddr, size, aligned_size, dir, attrs);
3784		if (tlb_addr == DMA_MAPPING_ERROR) {
3785			goto swiotlb_error;
3786		} else {
3787			/* Clean up the padding area. */
3788			void *padding_start = phys_to_virt(tlb_addr);
3789			size_t padding_size = aligned_size;
3790
3791			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3792			    (dir == DMA_TO_DEVICE ||
3793			     dir == DMA_BIDIRECTIONAL)) {
3794				padding_start += size;
3795				padding_size -= size;
3796			}
3797
3798			memset(padding_start, 0, padding_size);
3799		}
3800	} else {
3801		tlb_addr = paddr;
3802	}
3803
3804	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3805				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3806	if (ret)
3807		goto mapping_error;
3808
3809	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3810
3811	return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3812
3813mapping_error:
3814	if (is_swiotlb_buffer(tlb_addr))
3815		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3816					 aligned_size, dir, attrs);
3817swiotlb_error:
3818	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3819	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3820		size, (unsigned long long)paddr, dir);
3821
3822	return DMA_MAPPING_ERROR;
3823}
3824
3825static void
3826bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3827		    enum dma_data_direction dir, unsigned long attrs)
3828{
3829	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3830	struct dmar_domain *domain;
3831	phys_addr_t tlb_addr;
3832
3833	domain = find_domain(dev);
3834	if (WARN_ON(!domain))
3835		return;
3836
3837	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3838	if (WARN_ON(!tlb_addr))
3839		return;
3840
3841	intel_unmap(dev, dev_addr, size);
3842	if (is_swiotlb_buffer(tlb_addr))
3843		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3844					 aligned_size, dir, attrs);
3845
3846	trace_bounce_unmap_single(dev, dev_addr, size);
3847}
3848
3849static dma_addr_t
3850bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3851		size_t size, enum dma_data_direction dir, unsigned long attrs)
3852{
3853	return bounce_map_single(dev, page_to_phys(page) + offset,
3854				 size, dir, attrs, *dev->dma_mask);
3855}
3856
3857static dma_addr_t
3858bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3859		    enum dma_data_direction dir, unsigned long attrs)
3860{
3861	return bounce_map_single(dev, phys_addr, size,
3862				 dir, attrs, *dev->dma_mask);
3863}
3864
3865static void
3866bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3867		  enum dma_data_direction dir, unsigned long attrs)
3868{
3869	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3870}
3871
3872static void
3873bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3874		      enum dma_data_direction dir, unsigned long attrs)
3875{
3876	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3877}
3878
3879static void
3880bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3881		enum dma_data_direction dir, unsigned long attrs)
3882{
3883	struct scatterlist *sg;
3884	int i;
3885
3886	for_each_sg(sglist, sg, nelems, i)
3887		bounce_unmap_page(dev, sg->dma_address,
3888				  sg_dma_len(sg), dir, attrs);
3889}
3890
3891static int
3892bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3893	      enum dma_data_direction dir, unsigned long attrs)
3894{
3895	int i;
3896	struct scatterlist *sg;
3897
3898	for_each_sg(sglist, sg, nelems, i) {
3899		sg->dma_address = bounce_map_page(dev, sg_page(sg),
3900						  sg->offset, sg->length,
3901						  dir, attrs);
3902		if (sg->dma_address == DMA_MAPPING_ERROR)
3903			goto out_unmap;
3904		sg_dma_len(sg) = sg->length;
3905	}
3906
3907	for_each_sg(sglist, sg, nelems, i)
3908		trace_bounce_map_sg(dev, i + 1, nelems, sg);
3909
3910	return nelems;
3911
3912out_unmap:
3913	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3914	return 0;
3915}
3916
3917static void
3918bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3919			   size_t size, enum dma_data_direction dir)
3920{
3921	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3922}
3923
3924static void
3925bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3926			      size_t size, enum dma_data_direction dir)
3927{
3928	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3929}
3930
3931static void
3932bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3933		       int nelems, enum dma_data_direction dir)
3934{
3935	struct scatterlist *sg;
3936	int i;
3937
3938	for_each_sg(sglist, sg, nelems, i)
3939		bounce_sync_single(dev, sg_dma_address(sg),
3940				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
3941}
3942
3943static void
3944bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3945			  int nelems, enum dma_data_direction dir)
3946{
3947	struct scatterlist *sg;
3948	int i;
3949
3950	for_each_sg(sglist, sg, nelems, i)
3951		bounce_sync_single(dev, sg_dma_address(sg),
3952				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3953}
3954
3955static const struct dma_map_ops bounce_dma_ops = {
3956	.alloc			= intel_alloc_coherent,
3957	.free			= intel_free_coherent,
3958	.map_sg			= bounce_map_sg,
3959	.unmap_sg		= bounce_unmap_sg,
3960	.map_page		= bounce_map_page,
3961	.unmap_page		= bounce_unmap_page,
3962	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
3963	.sync_single_for_device	= bounce_sync_single_for_device,
3964	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
3965	.sync_sg_for_device	= bounce_sync_sg_for_device,
3966	.map_resource		= bounce_map_resource,
3967	.unmap_resource		= bounce_unmap_resource,
3968	.dma_supported		= dma_direct_supported,
3969};
3970
3971static inline int iommu_domain_cache_init(void)
3972{
3973	int ret = 0;
3974
3975	iommu_domain_cache = kmem_cache_create("iommu_domain",
3976					 sizeof(struct dmar_domain),
3977					 0,
3978					 SLAB_HWCACHE_ALIGN,
3979
3980					 NULL);
3981	if (!iommu_domain_cache) {
3982		pr_err("Couldn't create iommu_domain cache\n");
3983		ret = -ENOMEM;
3984	}
3985
3986	return ret;
3987}
3988
3989static inline int iommu_devinfo_cache_init(void)
3990{
3991	int ret = 0;
3992
3993	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3994					 sizeof(struct device_domain_info),
3995					 0,
3996					 SLAB_HWCACHE_ALIGN,
3997					 NULL);
3998	if (!iommu_devinfo_cache) {
3999		pr_err("Couldn't create devinfo cache\n");
4000		ret = -ENOMEM;
4001	}
4002
4003	return ret;
4004}
4005
4006static int __init iommu_init_mempool(void)
4007{
4008	int ret;
4009	ret = iova_cache_get();
4010	if (ret)
4011		return ret;
4012
4013	ret = iommu_domain_cache_init();
4014	if (ret)
4015		goto domain_error;
4016
4017	ret = iommu_devinfo_cache_init();
4018	if (!ret)
4019		return ret;
4020
4021	kmem_cache_destroy(iommu_domain_cache);
4022domain_error:
4023	iova_cache_put();
4024
4025	return -ENOMEM;
4026}
4027
4028static void __init iommu_exit_mempool(void)
4029{
4030	kmem_cache_destroy(iommu_devinfo_cache);
4031	kmem_cache_destroy(iommu_domain_cache);
4032	iova_cache_put();
4033}
4034
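/*
 * Mark DRHD units that can be ignored: units with an empty device
 * scope, and, when gfx mapping is disabled, units that cover nothing
 * but graphics devices.
 */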
4035static void __init init_no_remapping_devices(void)
4036{
4037	struct dmar_drhd_unit *drhd;
4038	struct device *dev;
4039	int i;
4040
4041	for_each_drhd_unit(drhd) {
4042		if (!drhd->include_all) {
4043			for_each_active_dev_scope(drhd->devices,
4044						  drhd->devices_cnt, i, dev)
4045				break;
4046			/* ignore DMAR unit if no devices exist */
4047			if (i == drhd->devices_cnt)
4048				drhd->ignored = 1;
4049		}
4050	}
4051
4052	for_each_active_drhd_unit(drhd) {
4053		if (drhd->include_all)
4054			continue;
4055
4056		for_each_active_dev_scope(drhd->devices,
4057					  drhd->devices_cnt, i, dev)
4058			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4059				break;
4060		if (i < drhd->devices_cnt)
4061			continue;
4062
4063		/* This IOMMU has *only* gfx devices. Either bypass it or
4064		 * set the gfx_mapped flag, as appropriate. */
4065		drhd->gfx_dedicated = 1;
4066		if (!dmar_map_gfx)
4067			drhd->ignored = 1;
4068	}
4069}
4070
4071#ifdef CONFIG_SUSPEND
4072static int init_iommu_hw(void)
4073{
4074	struct dmar_drhd_unit *drhd;
4075	struct intel_iommu *iommu = NULL;
4076
4077	for_each_active_iommu(iommu, drhd)
4078		if (iommu->qi)
4079			dmar_reenable_qi(iommu);
4080
4081	for_each_iommu(iommu, drhd) {
4082		if (drhd->ignored) {
4083			/*
4084			 * we always have to disable PMRs or DMA may fail on
4085			 * this device
4086			 */
4087			if (force_on)
4088				iommu_disable_protect_mem_regions(iommu);
4089			continue;
4090		}
4091
4092		iommu_flush_write_buffer(iommu);
4093
4094		iommu_set_root_entry(iommu);
4095
4096		iommu->flush.flush_context(iommu, 0, 0, 0,
4097					   DMA_CCMD_GLOBAL_INVL);
4098		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4099		iommu_enable_translation(iommu);
4100		iommu_disable_protect_mem_regions(iommu);
4101	}
4102
4103	return 0;
4104}
4105
4106static void iommu_flush_all(void)
4107{
4108	struct dmar_drhd_unit *drhd;
4109	struct intel_iommu *iommu;
4110
4111	for_each_active_iommu(iommu, drhd) {
4112		iommu->flush.flush_context(iommu, 0, 0, 0,
4113					   DMA_CCMD_GLOBAL_INVL);
4114		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4115					 DMA_TLB_GLOBAL_FLUSH);
4116	}
4117}
4118
4119static int iommu_suspend(void)
4120{
4121	struct dmar_drhd_unit *drhd;
4122	struct intel_iommu *iommu = NULL;
4123	unsigned long flag;
4124
4125	for_each_active_iommu(iommu, drhd) {
4126		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4127						 GFP_ATOMIC);
4128		if (!iommu->iommu_state)
4129			goto nomem;
4130	}
4131
4132	iommu_flush_all();
4133
4134	for_each_active_iommu(iommu, drhd) {
4135		iommu_disable_translation(iommu);
4136
4137		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4138
4139		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4140			readl(iommu->reg + DMAR_FECTL_REG);
4141		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4142			readl(iommu->reg + DMAR_FEDATA_REG);
4143		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4144			readl(iommu->reg + DMAR_FEADDR_REG);
4145		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4146			readl(iommu->reg + DMAR_FEUADDR_REG);
4147
4148		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4149	}
4150	return 0;
4151
4152nomem:
4153	for_each_active_iommu(iommu, drhd)
4154		kfree(iommu->iommu_state);
4155
4156	return -ENOMEM;
4157}
4158
4159static void iommu_resume(void)
4160{
4161	struct dmar_drhd_unit *drhd;
4162	struct intel_iommu *iommu = NULL;
4163	unsigned long flag;
4164
4165	if (init_iommu_hw()) {
4166		if (force_on)
4167			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4168		else
4169			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4170		return;
4171	}
4172
4173	for_each_active_iommu(iommu, drhd) {
4174
4175		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4176
4177		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4178			iommu->reg + DMAR_FECTL_REG);
4179		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4180			iommu->reg + DMAR_FEDATA_REG);
4181		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4182			iommu->reg + DMAR_FEADDR_REG);
4183		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4184			iommu->reg + DMAR_FEUADDR_REG);
4185
4186		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4187	}
4188
4189	for_each_active_iommu(iommu, drhd)
4190		kfree(iommu->iommu_state);
4191}
4192
4193static struct syscore_ops iommu_syscore_ops = {
4194	.resume		= iommu_resume,
4195	.suspend	= iommu_suspend,
4196};
4197
4198static void __init init_iommu_pm_ops(void)
4199{
4200	register_syscore_ops(&iommu_syscore_ops);
4201}
4202
4203#else
4204static inline void init_iommu_pm_ops(void) {}
4205#endif	/* CONFIG_SUSPEND */
4206
4207static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4208{
4209	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4210	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4211	    rmrr->end_address <= rmrr->base_address ||
4212	    arch_rmrr_sanity_check(rmrr))
4213		return -EINVAL;
4214
4215	return 0;
4216}
4217
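/*
 * Parse one ACPI RMRR structure, warn (and taint) on malformed
 * ranges, and queue it on dmar_rmrr_units for later device-scope
 * matching.
 */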
4218int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4219{
4220	struct acpi_dmar_reserved_memory *rmrr;
4221	struct dmar_rmrr_unit *rmrru;
4222
4223	rmrr = (struct acpi_dmar_reserved_memory *)header;
4224	if (rmrr_sanity_check(rmrr)) {
4225		pr_warn(FW_BUG
4226			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4227			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4228			   rmrr->base_address, rmrr->end_address,
4229			   dmi_get_system_info(DMI_BIOS_VENDOR),
4230			   dmi_get_system_info(DMI_BIOS_VERSION),
4231			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4232		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4233	}
4234
4235	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4236	if (!rmrru)
4237		goto out;
4238
4239	rmrru->hdr = header;
4240
4241	rmrru->base_address = rmrr->base_address;
4242	rmrru->end_address = rmrr->end_address;
4243
4244	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4245				((void *)rmrr) + rmrr->header.length,
4246				&rmrru->devices_cnt);
4247	if (rmrru->devices_cnt && rmrru->devices == NULL)
4248		goto free_rmrru;
4249
4250	list_add(&rmrru->list, &dmar_rmrr_units);
4251
4252	return 0;
4253free_rmrru:
4254	kfree(rmrru);
4255out:
4256	return -ENOMEM;
4257}
4258
4259static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4260{
4261	struct dmar_atsr_unit *atsru;
4262	struct acpi_dmar_atsr *tmp;
4263
4264	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4265				dmar_rcu_check()) {
4266		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4267		if (atsr->segment != tmp->segment)
4268			continue;
4269		if (atsr->header.length != tmp->header.length)
4270			continue;
4271		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4272			return atsru;
4273	}
4274
4275	return NULL;
4276}
4277
4278int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4279{
4280	struct acpi_dmar_atsr *atsr;
4281	struct dmar_atsr_unit *atsru;
4282
4283	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4284		return 0;
4285
4286	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4287	atsru = dmar_find_atsr(atsr);
4288	if (atsru)
4289		return 0;
4290
4291	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4292	if (!atsru)
4293		return -ENOMEM;
4294
4295	/*
4296	 * If memory is allocated from slab by the ACPI _DSM method, we need to
4297	 * copy the memory content because the memory buffer will be freed
4298	 * on return.
4299	 */
4300	atsru->hdr = (void *)(atsru + 1);
4301	memcpy(atsru->hdr, hdr, hdr->length);
4302	atsru->include_all = atsr->flags & 0x1;
4303	if (!atsru->include_all) {
4304		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4305				(void *)atsr + atsr->header.length,
4306				&atsru->devices_cnt);
4307		if (atsru->devices_cnt && atsru->devices == NULL) {
4308			kfree(atsru);
4309			return -ENOMEM;
4310		}
4311	}
4312
4313	list_add_rcu(&atsru->list, &dmar_atsr_units);
4314
4315	return 0;
4316}
4317
4318static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4319{
4320	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4321	kfree(atsru);
4322}
4323
4324int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4325{
4326	struct acpi_dmar_atsr *atsr;
4327	struct dmar_atsr_unit *atsru;
4328
4329	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4330	atsru = dmar_find_atsr(atsr);
4331	if (atsru) {
4332		list_del_rcu(&atsru->list);
4333		synchronize_rcu();
4334		intel_iommu_free_atsr(atsru);
4335	}
4336
4337	return 0;
4338}
4339
4340int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4341{
4342	int i;
4343	struct device *dev;
4344	struct acpi_dmar_atsr *atsr;
4345	struct dmar_atsr_unit *atsru;
4346
4347	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4348	atsru = dmar_find_atsr(atsr);
4349	if (!atsru)
4350		return 0;
4351
4352	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4353		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4354					  i, dev)
4355			return -EBUSY;
4356	}
4357
4358	return 0;
4359}
4360
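/*
 * Bring a hot-added DMAR unit online: verify the new IOMMU supports the
 * features already in use (pass-through, snooping, super pages), set up
 * its domain IDs, root entry, invalidation queue and interrupt, then
 * enable translation and disable its protected memory regions.
 */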
4361static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4362{
4363	int sp, ret;
4364	struct intel_iommu *iommu = dmaru->iommu;
4365
4366	if (g_iommus[iommu->seq_id])
4367		return 0;
4368
4369	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4370		pr_warn("%s: Doesn't support hardware pass through.\n",
4371			iommu->name);
4372		return -ENXIO;
4373	}
4374	if (!ecap_sc_support(iommu->ecap) &&
4375	    domain_update_iommu_snooping(iommu)) {
4376		pr_warn("%s: Doesn't support snooping.\n",
4377			iommu->name);
4378		return -ENXIO;
4379	}
4380	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4381	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4382		pr_warn("%s: Doesn't support large page.\n",
4383			iommu->name);
4384		return -ENXIO;
4385	}
4386
4387	/*
4388	 * Disable translation if already enabled prior to OS handover.
4389	 */
4390	if (iommu->gcmd & DMA_GCMD_TE)
4391		iommu_disable_translation(iommu);
4392
4393	g_iommus[iommu->seq_id] = iommu;
4394	ret = iommu_init_domains(iommu);
4395	if (ret == 0)
4396		ret = iommu_alloc_root_entry(iommu);
4397	if (ret)
4398		goto out;
4399
4400	intel_svm_check(iommu);
4401
4402	if (dmaru->ignored) {
4403		/*
4404		 * we always have to disable PMRs or DMA may fail on this device
4405		 */
4406		if (force_on)
4407			iommu_disable_protect_mem_regions(iommu);
4408		return 0;
4409	}
4410
4411	intel_iommu_init_qi(iommu);
4412	iommu_flush_write_buffer(iommu);
4413
4414#ifdef CONFIG_INTEL_IOMMU_SVM
4415	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4416		ret = intel_svm_enable_prq(iommu);
4417		if (ret)
4418			goto disable_iommu;
4419	}
4420#endif
4421	ret = dmar_set_interrupt(iommu);
4422	if (ret)
4423		goto disable_iommu;
4424
4425	iommu_set_root_entry(iommu);
4426	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4427	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4428	iommu_enable_translation(iommu);
4429
4430	iommu_disable_protect_mem_regions(iommu);
4431	return 0;
4432
4433disable_iommu:
4434	disable_dmar_iommu(iommu);
4435out:
4436	free_dmar_iommu(iommu);
4437	return ret;
4438}
4439
4440int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4441{
4442	int ret = 0;
4443	struct intel_iommu *iommu = dmaru->iommu;
4444
4445	if (!intel_iommu_enabled)
4446		return 0;
4447	if (iommu == NULL)
4448		return -EINVAL;
4449
4450	if (insert) {
4451		ret = intel_iommu_add(dmaru);
4452	} else {
4453		disable_dmar_iommu(iommu);
4454		free_dmar_iommu(iommu);
4455	}
4456
4457	return ret;
4458}
4459
4460static void intel_iommu_free_dmars(void)
4461{
4462	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4463	struct dmar_atsr_unit *atsru, *atsr_n;
4464
4465	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4466		list_del(&rmrru->list);
4467		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4468		kfree(rmrru);
4469	}
4470
4471	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4472		list_del(&atsru->list);
4473		intel_iommu_free_atsr(atsru);
4474	}
4475}
4476
4477int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4478{
4479	int i, ret = 1;
4480	struct pci_bus *bus;
4481	struct pci_dev *bridge = NULL;
4482	struct device *tmp;
4483	struct acpi_dmar_atsr *atsr;
4484	struct dmar_atsr_unit *atsru;
4485
4486	dev = pci_physfn(dev);
4487	for (bus = dev->bus; bus; bus = bus->parent) {
4488		bridge = bus->self;
4489		/* If it's an integrated device, allow ATS */
4490		if (!bridge)
4491			return 1;
4492		/* Connected via non-PCIe: no ATS */
4493		if (!pci_is_pcie(bridge) ||
4494		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4495			return 0;
4496		/* If we found the root port, look it up in the ATSR */
4497		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4498			break;
4499	}
4500
4501	rcu_read_lock();
4502	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4503		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4504		if (atsr->segment != pci_domain_nr(dev->bus))
4505			continue;
4506
4507		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4508			if (tmp == &bridge->dev)
4509				goto out;
4510
4511		if (atsru->include_all)
4512			goto out;
4513	}
4514	ret = 0;
4515out:
4516	rcu_read_unlock();
4517
4518	return ret;
4519}
4520
4521int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4522{
4523	int ret;
4524	struct dmar_rmrr_unit *rmrru;
4525	struct dmar_atsr_unit *atsru;
4526	struct acpi_dmar_atsr *atsr;
4527	struct acpi_dmar_reserved_memory *rmrr;
4528
4529	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4530		return 0;
4531
4532	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4533		rmrr = container_of(rmrru->hdr,
4534				    struct acpi_dmar_reserved_memory, header);
4535		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4536			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4537				((void *)rmrr) + rmrr->header.length,
4538				rmrr->segment, rmrru->devices,
4539				rmrru->devices_cnt);
4540			if (ret < 0)
4541				return ret;
4542		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4543			dmar_remove_dev_scope(info, rmrr->segment,
4544				rmrru->devices, rmrru->devices_cnt);
4545		}
4546	}
4547
4548	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4549		if (atsru->include_all)
4550			continue;
4551
4552		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4553		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4554			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4555					(void *)atsr + atsr->header.length,
4556					atsr->segment, atsru->devices,
4557					atsru->devices_cnt);
4558			if (ret > 0)
4559				break;
4560			else if (ret < 0)
4561				return ret;
4562		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4563			if (dmar_remove_dev_scope(info, atsr->segment,
4564					atsru->devices, atsru->devices_cnt))
4565				break;
4566		}
4567	}
4568
4569	return 0;
4570}
4571
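/*
 * Memory hotplug notifier: extend the si_domain identity map when a new
 * range goes online, and unmap the range plus flush the IOTLBs when it
 * goes offline or the online operation is cancelled.
 */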
4572static int intel_iommu_memory_notifier(struct notifier_block *nb,
4573				       unsigned long val, void *v)
4574{
4575	struct memory_notify *mhp = v;
4576	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4577	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4578			mhp->nr_pages - 1);
4579
4580	switch (val) {
4581	case MEM_GOING_ONLINE:
4582		if (iommu_domain_identity_map(si_domain,
4583					      start_vpfn, last_vpfn)) {
4584			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4585				start_vpfn, last_vpfn);
4586			return NOTIFY_BAD;
4587		}
4588		break;
4589
4590	case MEM_OFFLINE:
4591	case MEM_CANCEL_ONLINE:
4592		{
4593			struct dmar_drhd_unit *drhd;
4594			struct intel_iommu *iommu;
4595			struct page *freelist;
4596
4597			freelist = domain_unmap(si_domain,
4598						start_vpfn, last_vpfn);
4599
4600			rcu_read_lock();
4601			for_each_active_iommu(iommu, drhd)
4602				iommu_flush_iotlb_psi(iommu, si_domain,
4603					start_vpfn, mhp->nr_pages,
4604					!freelist, 0);
4605			rcu_read_unlock();
4606			dma_free_pagelist(freelist);
4607		}
4608		break;
4609	}
4610
4611	return NOTIFY_OK;
4612}
4613
4614static struct notifier_block intel_iommu_memory_nb = {
4615	.notifier_call = intel_iommu_memory_notifier,
4616	.priority = 0
4617};
4618
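/*
 * CPU hotplug teardown: drop the IOVAs cached on the dead CPU for every
 * DMA-API domain on every IOMMU.
 */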
4619static void free_all_cpu_cached_iovas(unsigned int cpu)
4620{
4621	int i;
4622
4623	for (i = 0; i < g_num_of_iommus; i++) {
4624		struct intel_iommu *iommu = g_iommus[i];
4625		struct dmar_domain *domain;
4626		int did;
4627
4628		if (!iommu)
4629			continue;
4630
4631		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4632			domain = get_iommu_domain(iommu, (u16)did);
4633
4634			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4635				continue;
4636
4637			free_cpu_cached_iovas(cpu, &domain->iovad);
4638		}
4639	}
4640}
4641
4642static int intel_iommu_cpu_dead(unsigned int cpu)
4643{
4644	free_all_cpu_cached_iovas(cpu);
4645	return 0;
4646}
4647
4648static void intel_disable_iommus(void)
4649{
4650	struct intel_iommu *iommu = NULL;
4651	struct dmar_drhd_unit *drhd;
4652
4653	for_each_iommu(iommu, drhd)
4654		iommu_disable_translation(iommu);
4655}
4656
4657void intel_iommu_shutdown(void)
4658{
4659	struct dmar_drhd_unit *drhd;
4660	struct intel_iommu *iommu = NULL;
4661
4662	if (no_iommu || dmar_disabled)
4663		return;
4664
4665	down_write(&dmar_global_lock);
4666
4667	/* Disable PMRs explicitly here. */
4668	for_each_iommu(iommu, drhd)
4669		iommu_disable_protect_mem_regions(iommu);
4670
4671	/* Make sure the IOMMUs are switched off */
4672	intel_disable_iommus();
4673
4674	up_write(&dmar_global_lock);
4675}
4676
4677static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4678{
4679	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4680
4681	return container_of(iommu_dev, struct intel_iommu, iommu);
4682}
4683
4684static ssize_t intel_iommu_show_version(struct device *dev,
4685					struct device_attribute *attr,
4686					char *buf)
4687{
4688	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4689	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4690	return sprintf(buf, "%d:%d\n",
4691		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4692}
4693static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4694
4695static ssize_t intel_iommu_show_address(struct device *dev,
4696					struct device_attribute *attr,
4697					char *buf)
4698{
4699	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4700	return sprintf(buf, "%llx\n", iommu->reg_phys);
4701}
4702static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4703
4704static ssize_t intel_iommu_show_cap(struct device *dev,
4705				    struct device_attribute *attr,
4706				    char *buf)
4707{
4708	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4709	return sprintf(buf, "%llx\n", iommu->cap);
4710}
4711static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4712
4713static ssize_t intel_iommu_show_ecap(struct device *dev,
4714				    struct device_attribute *attr,
4715				    char *buf)
4716{
4717	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718	return sprintf(buf, "%llx\n", iommu->ecap);
4719}
4720static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4721
4722static ssize_t intel_iommu_show_ndoms(struct device *dev,
4723				      struct device_attribute *attr,
4724				      char *buf)
4725{
4726	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4728}
4729static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4730
4731static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4732					   struct device_attribute *attr,
4733					   char *buf)
4734{
4735	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4736	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4737						  cap_ndoms(iommu->cap)));
4738}
4739static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4740
4741static struct attribute *intel_iommu_attrs[] = {
4742	&dev_attr_version.attr,
4743	&dev_attr_address.attr,
4744	&dev_attr_cap.attr,
4745	&dev_attr_ecap.attr,
4746	&dev_attr_domains_supported.attr,
4747	&dev_attr_domains_used.attr,
4748	NULL,
4749};
4750
4751static struct attribute_group intel_iommu_group = {
4752	.name = "intel-iommu",
4753	.attrs = intel_iommu_attrs,
4754};
4755
4756const struct attribute_group *intel_iommu_groups[] = {
4757	&intel_iommu_group,
4758	NULL,
4759};
4760
4761static inline bool has_external_pci(void)
4762{
4763	struct pci_dev *pdev = NULL;
4764
4765	for_each_pci_dev(pdev)
4766		if (pdev->external_facing)
4767			return true;
4768
4769	return false;
4770}
4771
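/*
 * Force the IOMMU on when the platform opted in to DMA protection via
 * the DMAR table and external-facing PCI devices are present, even if
 * the user disabled it on the command line.
 */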
4772static int __init platform_optin_force_iommu(void)
4773{
4774	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4775		return 0;
4776
4777	if (no_iommu || dmar_disabled)
4778		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4779
4780	/*
4781	 * If Intel-IOMMU is disabled by default, we will apply identity
4782	 * map for all devices except those marked as being untrusted.
4783	 */
4784	if (dmar_disabled)
4785		iommu_set_default_passthrough(false);
4786
4787	dmar_disabled = 0;
4788	no_iommu = 0;
4789
4790	return 1;
4791}
4792
4793static int __init probe_acpi_namespace_devices(void)
4794{
4795	struct dmar_drhd_unit *drhd;
4796	/* To avoid a -Wunused-but-set-variable warning. */
4797	struct intel_iommu *iommu __maybe_unused;
4798	struct device *dev;
4799	int i, ret = 0;
4800
4801	for_each_active_iommu(iommu, drhd) {
4802		for_each_active_dev_scope(drhd->devices,
4803					  drhd->devices_cnt, i, dev) {
4804			struct acpi_device_physical_node *pn;
4805			struct iommu_group *group;
4806			struct acpi_device *adev;
4807
4808			if (dev->bus != &acpi_bus_type)
4809				continue;
4810
4811			adev = to_acpi_device(dev);
4812			mutex_lock(&adev->physical_node_lock);
4813			list_for_each_entry(pn,
4814					    &adev->physical_node_list, node) {
4815				group = iommu_group_get(pn->dev);
4816				if (group) {
4817					iommu_group_put(group);
4818					continue;
4819				}
4820
4821				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4822				ret = iommu_probe_device(pn->dev);
4823				if (ret)
4824					break;
4825			}
4826			mutex_unlock(&adev->physical_node_lock);
4827
4828			if (ret)
4829				return ret;
4830		}
4831	}
4832
4833	return 0;
4834}
4835
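/*
 * Main initialization entry point: parse the DMAR table, bring up every
 * DMAR unit, register sysfs/IOMMU-core hooks and notifiers, and finally
 * enable DMA remapping on all active IOMMUs.
 */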
4836int __init intel_iommu_init(void)
4837{
4838	int ret = -ENODEV;
4839	struct dmar_drhd_unit *drhd;
4840	struct intel_iommu *iommu;
4841
4842	/*
4843	 * Intel IOMMU is required for a TXT/tboot launch or platform
4844	 * opt in, so enforce that.
4845	 */
4846	force_on = tboot_force_iommu() || platform_optin_force_iommu();
4847
4848	if (iommu_init_mempool()) {
4849		if (force_on)
4850			panic("tboot: Failed to initialize iommu memory\n");
4851		return -ENOMEM;
4852	}
4853
4854	down_write(&dmar_global_lock);
4855	if (dmar_table_init()) {
4856		if (force_on)
4857			panic("tboot: Failed to initialize DMAR table\n");
4858		goto out_free_dmar;
4859	}
4860
4861	if (dmar_dev_scope_init() < 0) {
4862		if (force_on)
4863			panic("tboot: Failed to initialize DMAR device scope\n");
4864		goto out_free_dmar;
4865	}
4866
4867	up_write(&dmar_global_lock);
4868
4869	/*
4870	 * The bus notifier takes the dmar_global_lock, so lockdep will
4871	 * complain later when we register it under the lock.
4872	 */
4873	dmar_register_bus_notifier();
4874
4875	down_write(&dmar_global_lock);
4876
4877	if (!no_iommu)
4878		intel_iommu_debugfs_init();
4879
4880	if (no_iommu || dmar_disabled) {
4881		/*
4882		 * We exit the function here to ensure IOMMU's remapping and
4883		 * mempool aren't setup, which means that the IOMMU's PMRs
4884		 * won't be disabled via the call to init_dmars(). So disable
4885		 * it explicitly here. The PMRs were setup by tboot prior to
4886		 * calling SENTER, but the kernel is expected to reset/tear
4887		 * down the PMRs.
4888		 */
4889		if (intel_iommu_tboot_noforce) {
4890			for_each_iommu(iommu, drhd)
4891				iommu_disable_protect_mem_regions(iommu);
4892		}
4893
4894		/*
4895		 * Make sure the IOMMUs are switched off, even when we
4896		 * boot into a kexec kernel and the previous kernel left
4897		 * them enabled
4898		 */
4899		intel_disable_iommus();
4900		goto out_free_dmar;
4901	}
4902
4903	if (list_empty(&dmar_rmrr_units))
4904		pr_info("No RMRR found\n");
4905
4906	if (list_empty(&dmar_atsr_units))
4907		pr_info("No ATSR found\n");
4908
4909	if (dmar_init_reserved_ranges()) {
4910		if (force_on)
4911			panic("tboot: Failed to reserve iommu ranges\n");
4912		goto out_free_reserved_range;
4913	}
4914
4915	if (dmar_map_gfx)
4916		intel_iommu_gfx_mapped = 1;
4917
4918	init_no_remapping_devices();
4919
4920	ret = init_dmars();
4921	if (ret) {
4922		if (force_on)
4923			panic("tboot: Failed to initialize DMARs\n");
4924		pr_err("Initialization failed\n");
4925		goto out_free_reserved_range;
4926	}
4927	up_write(&dmar_global_lock);
4928
4929	init_iommu_pm_ops();
4930
4931	down_read(&dmar_global_lock);
4932	for_each_active_iommu(iommu, drhd) {
4933		iommu_device_sysfs_add(&iommu->iommu, NULL,
4934				       intel_iommu_groups,
4935				       "%s", iommu->name);
4936		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4937		iommu_device_register(&iommu->iommu);
4938	}
4939	up_read(&dmar_global_lock);
4940
4941	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4942	if (si_domain && !hw_pass_through)
4943		register_memory_notifier(&intel_iommu_memory_nb);
4944	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4945			  intel_iommu_cpu_dead);
4946
4947	down_read(&dmar_global_lock);
4948	if (probe_acpi_namespace_devices())
4949		pr_warn("ACPI name space devices didn't probe correctly\n");
4950
4951	/* Finally, we enable the DMA remapping hardware. */
4952	for_each_iommu(iommu, drhd) {
4953		if (!drhd->ignored && !translation_pre_enabled(iommu))
4954			iommu_enable_translation(iommu);
4955
4956		iommu_disable_protect_mem_regions(iommu);
4957	}
4958	up_read(&dmar_global_lock);
4959
4960	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4961
4962	intel_iommu_enabled = 1;
4963
4964	return 0;
4965
4966out_free_reserved_range:
4967	put_iova_domain(&reserved_iova_list);
4968out_free_dmar:
4969	intel_iommu_free_dmars();
4970	up_write(&dmar_global_lock);
4971	iommu_exit_mempool();
4972	return ret;
4973}
4974
4975static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4976{
4977	struct intel_iommu *iommu = opaque;
4978
4979	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4980	return 0;
4981}
4982
4983/*
4984 * NB - intel-iommu lacks any sort of reference counting for the users of
4985 * dependent devices.  If multiple endpoints have intersecting dependent
4986 * devices, unbinding the driver from any one of them will possibly leave
4987 * the others unable to operate.
4988 */
4989static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4990{
4991	if (!iommu || !dev || !dev_is_pci(dev))
4992		return;
4993
4994	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4995}
4996
4997static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4998{
4999	struct dmar_domain *domain;
5000	struct intel_iommu *iommu;
5001	unsigned long flags;
5002
5003	assert_spin_locked(&device_domain_lock);
5004
5005	if (WARN_ON(!info))
5006		return;
5007
5008	iommu = info->iommu;
5009	domain = info->domain;
5010
5011	if (info->dev) {
5012		if (dev_is_pci(info->dev) && sm_supported(iommu))
5013			intel_pasid_tear_down_entry(iommu, info->dev,
5014					PASID_RID2PASID, false);
5015
5016		iommu_disable_dev_iotlb(info);
5017		if (!dev_is_real_dma_subdevice(info->dev))
5018			domain_context_clear(iommu, info->dev);
5019		intel_pasid_free_table(info->dev);
5020	}
5021
5022	unlink_domain_info(info);
5023
5024	spin_lock_irqsave(&iommu->lock, flags);
5025	domain_detach_iommu(domain, iommu);
5026	spin_unlock_irqrestore(&iommu->lock, flags);
5027
5028	free_devinfo_mem(info);
5029}
5030
5031static void dmar_remove_one_dev_info(struct device *dev)
5032{
5033	struct device_domain_info *info;
5034	unsigned long flags;
5035
5036	spin_lock_irqsave(&device_domain_lock, flags);
5037	info = get_domain_info(dev);
5038	if (info)
5039		__dmar_remove_one_dev_info(info);
5040	spin_unlock_irqrestore(&device_domain_lock, flags);
5041}
5042
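/*
 * Initialize a domain allocated through the IOMMU API: derive the AGAW
 * from the requested guest address width and allocate the top level
 * page directory.
 */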
5043static int md_domain_init(struct dmar_domain *domain, int guest_width)
5044{
5045	int adjust_width;
5046
5047	/* calculate AGAW */
5048	domain->gaw = guest_width;
5049	adjust_width = guestwidth_to_adjustwidth(guest_width);
5050	domain->agaw = width_to_agaw(adjust_width);
5051
5052	domain->iommu_coherency = 0;
5053	domain->iommu_snooping = 0;
5054	domain->iommu_superpage = 0;
5055	domain->max_addr = 0;
5056
5057	/* always allocate the top pgd */
5058	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5059	if (!domain->pgd)
5060		return -ENOMEM;
5061	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5062	return 0;
5063}
5064
5065static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5066{
5067	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5068	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5069
5070	if (!intel_iommu_strict &&
5071	    init_iova_flush_queue(&dmar_domain->iovad,
5072				  iommu_flush_iova, iova_entry_free))
5073		pr_info("iova flush queue initialization failed\n");
5074}
5075
5076static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5077{
5078	struct dmar_domain *dmar_domain;
5079	struct iommu_domain *domain;
5080
5081	switch (type) {
5082	case IOMMU_DOMAIN_DMA:
5083	case IOMMU_DOMAIN_UNMANAGED:
5084		dmar_domain = alloc_domain(0);
5085		if (!dmar_domain) {
5086			pr_err("Can't allocate dmar_domain\n");
5087			return NULL;
5088		}
5089		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5090			pr_err("Domain initialization failed\n");
5091			domain_exit(dmar_domain);
5092			return NULL;
5093		}
5094
5095		if (type == IOMMU_DOMAIN_DMA)
5096			intel_init_iova_domain(dmar_domain);
5097
5098		domain_update_iommu_cap(dmar_domain);
5099
5100		domain = &dmar_domain->domain;
5101		domain->geometry.aperture_start = 0;
5102		domain->geometry.aperture_end   =
5103				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5104		domain->geometry.force_aperture = true;
5105
5106		return domain;
5107	case IOMMU_DOMAIN_IDENTITY:
5108		return &si_domain->domain;
5109	default:
5110		return NULL;
5111	}
5112
5113	return NULL;
5114}
5115
5116static void intel_iommu_domain_free(struct iommu_domain *domain)
5117{
5118	if (domain != &si_domain->domain)
5119		domain_exit(to_dmar_domain(domain));
5120}
5121
5122/*
5123 * Check whether a @domain could be attached to the @dev through the
5124 * aux-domain attach/detach APIs.
5125 */
5126static inline bool
5127is_aux_domain(struct device *dev, struct iommu_domain *domain)
5128{
5129	struct device_domain_info *info = get_domain_info(dev);
5130
5131	return info && info->auxd_enabled &&
5132			domain->type == IOMMU_DOMAIN_UNMANAGED;
5133}
5134
5135static void auxiliary_link_device(struct dmar_domain *domain,
5136				  struct device *dev)
5137{
5138	struct device_domain_info *info = get_domain_info(dev);
5139
5140	assert_spin_locked(&device_domain_lock);
5141	if (WARN_ON(!info))
5142		return;
5143
5144	domain->auxd_refcnt++;
5145	list_add(&domain->auxd, &info->auxiliary_domains);
5146}
5147
5148static void auxiliary_unlink_device(struct dmar_domain *domain,
5149				    struct device *dev)
5150{
5151	struct device_domain_info *info = get_domain_info(dev);
5152
5153	assert_spin_locked(&device_domain_lock);
5154	if (WARN_ON(!info))
5155		return;
5156
5157	list_del(&domain->auxd);
5158	domain->auxd_refcnt--;
5159
5160	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5161		ioasid_free(domain->default_pasid);
5162}
5163
5164static int aux_domain_add_dev(struct dmar_domain *domain,
5165			      struct device *dev)
5166{
5167	int ret;
5168	unsigned long flags;
5169	struct intel_iommu *iommu;
5170
5171	iommu = device_to_iommu(dev, NULL, NULL);
5172	if (!iommu)
5173		return -ENODEV;
5174
5175	if (domain->default_pasid <= 0) {
5176		int pasid;
5177
5178		/* No private data needed for the default pasid */
5179		pasid = ioasid_alloc(NULL, PASID_MIN,
5180				     pci_max_pasids(to_pci_dev(dev)) - 1,
5181				     NULL);
5182		if (pasid == INVALID_IOASID) {
5183			pr_err("Can't allocate default pasid\n");
5184			return -ENODEV;
5185		}
5186		domain->default_pasid = pasid;
5187	}
5188
5189	spin_lock_irqsave(&device_domain_lock, flags);
5190	/*
5191	 * iommu->lock must be held to attach domain to iommu and setup the
5192	 * pasid entry for second level translation.
5193	 */
5194	spin_lock(&iommu->lock);
5195	ret = domain_attach_iommu(domain, iommu);
5196	if (ret)
5197		goto attach_failed;
5198
5199	/* Setup the PASID entry for mediated devices: */
5200	if (domain_use_first_level(domain))
5201		ret = domain_setup_first_level(iommu, domain, dev,
5202					       domain->default_pasid);
5203	else
5204		ret = intel_pasid_setup_second_level(iommu, domain, dev,
5205						     domain->default_pasid);
5206	if (ret)
5207		goto table_failed;
5208	spin_unlock(&iommu->lock);
5209
5210	auxiliary_link_device(domain, dev);
5211
5212	spin_unlock_irqrestore(&device_domain_lock, flags);
5213
5214	return 0;
5215
5216table_failed:
5217	domain_detach_iommu(domain, iommu);
5218attach_failed:
5219	spin_unlock(&iommu->lock);
5220	spin_unlock_irqrestore(&device_domain_lock, flags);
5221	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5222		ioasid_free(domain->default_pasid);
5223
5224	return ret;
5225}
5226
5227static void aux_domain_remove_dev(struct dmar_domain *domain,
5228				  struct device *dev)
5229{
5230	struct device_domain_info *info;
5231	struct intel_iommu *iommu;
5232	unsigned long flags;
5233
5234	if (!is_aux_domain(dev, &domain->domain))
5235		return;
5236
5237	spin_lock_irqsave(&device_domain_lock, flags);
5238	info = get_domain_info(dev);
5239	iommu = info->iommu;
5240
5241	auxiliary_unlink_device(domain, dev);
5242
5243	spin_lock(&iommu->lock);
5244	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5245	domain_detach_iommu(domain, iommu);
5246	spin_unlock(&iommu->lock);
5247
5248	spin_unlock_irqrestore(&device_domain_lock, flags);
5249}
5250
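/*
 * Before attaching @dev, check that its IOMMU can address everything the
 * domain has mapped so far, then trim the domain's page table to the
 * number of levels this IOMMU supports.
 */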
5251static int prepare_domain_attach_device(struct iommu_domain *domain,
5252					struct device *dev)
5253{
5254	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5255	struct intel_iommu *iommu;
5256	int addr_width;
5257
5258	iommu = device_to_iommu(dev, NULL, NULL);
5259	if (!iommu)
5260		return -ENODEV;
5261
5262	/* check if this iommu agaw is sufficient for max mapped address */
5263	addr_width = agaw_to_width(iommu->agaw);
5264	if (addr_width > cap_mgaw(iommu->cap))
5265		addr_width = cap_mgaw(iommu->cap);
5266
5267	if (dmar_domain->max_addr > (1LL << addr_width)) {
5268		dev_err(dev, "%s: iommu width (%d) is not "
5269		        "sufficient for the mapped address (%llx)\n",
5270		        __func__, addr_width, dmar_domain->max_addr);
5271		return -EFAULT;
5272	}
5273	dmar_domain->gaw = addr_width;
5274
5275	/*
5276	 * Knock out extra levels of page tables if necessary
5277	 */
5278	while (iommu->agaw < dmar_domain->agaw) {
5279		struct dma_pte *pte;
5280
5281		pte = dmar_domain->pgd;
5282		if (dma_pte_present(pte)) {
5283			dmar_domain->pgd = (struct dma_pte *)
5284				phys_to_virt(dma_pte_addr(pte));
5285			free_pgtable_page(pte);
5286		}
5287		dmar_domain->agaw--;
5288	}
5289
5290	return 0;
5291}
5292
5293static int intel_iommu_attach_device(struct iommu_domain *domain,
5294				     struct device *dev)
5295{
5296	int ret;
5297
5298	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5299	    device_is_rmrr_locked(dev)) {
5300		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5301		return -EPERM;
5302	}
5303
5304	if (is_aux_domain(dev, domain))
5305		return -EPERM;
5306
5307	/* normally dev is not mapped */
5308	if (unlikely(domain_context_mapped(dev))) {
5309		struct dmar_domain *old_domain;
5310
5311		old_domain = find_domain(dev);
5312		if (old_domain)
5313			dmar_remove_one_dev_info(dev);
5314	}
5315
5316	ret = prepare_domain_attach_device(domain, dev);
5317	if (ret)
5318		return ret;
5319
5320	return domain_add_dev_info(to_dmar_domain(domain), dev);
5321}
5322
5323static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5324					 struct device *dev)
5325{
5326	int ret;
5327
5328	if (!is_aux_domain(dev, domain))
5329		return -EPERM;
5330
5331	ret = prepare_domain_attach_device(domain, dev);
5332	if (ret)
5333		return ret;
5334
5335	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5336}
5337
5338static void intel_iommu_detach_device(struct iommu_domain *domain,
5339				      struct device *dev)
5340{
5341	dmar_remove_one_dev_info(dev);
5342}
5343
5344static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5345					  struct device *dev)
5346{
5347	aux_domain_remove_dev(to_dmar_domain(domain), dev);
5348}
5349
5350/*
5351 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5352 * VT-d granularity. Invalidation is typically included in the unmap operation
5353 * as a result of DMA or VFIO unmap. However, for assigned devices guest
5354 * owns the first level page tables. Invalidations of translation caches in the
5355 * guest are trapped and passed down to the host.
5356 *
5357 * vIOMMU in the guest will only expose first level page tables, therefore
5358 * we do not support IOTLB granularity for request without PASID (second level).
5359 * we do not support IOTLB granularity for requests without PASID (second level).
5360 * For example, to find the VT-d granularity encoding for IOTLB
5361 * type and page selective granularity within PASID:
5362 * X: indexed by iommu cache type
5363 * Y: indexed by enum iommu_inv_granularity
5364 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5365 */
5366
5367static const int
5368inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5369	/*
5370	 * PASID based IOTLB invalidation: PASID selective (per PASID),
5371	 * page selective (address granularity)
5372	 */
5373	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5374	/* PASID based dev TLBs */
5375	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5376	/* PASID cache */
5377	{-EINVAL, -EINVAL, -EINVAL}
5378};
5379
5380static inline int to_vtd_granularity(int type, int granu)
5381{
5382	return inv_type_granu_table[type][granu];
5383}
5384
5385static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5386{
5387	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5388
5389	/* VT-d encodes the size as 2^size 4K pages: 0 for 4K, 9 for 2MB, etc.
5390	 * The IOMMU cache invalidate API passes granu_size in bytes and the
5391	 * number of contiguous granules of that size.
5392	 */
5393	return order_base_2(nr_pages);
5394}
5395
5396#ifdef CONFIG_INTEL_IOMMU_SVM
5397static int
5398intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5399			   struct iommu_cache_invalidate_info *inv_info)
5400{
5401	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5402	struct device_domain_info *info;
5403	struct intel_iommu *iommu;
5404	unsigned long flags;
5405	int cache_type;
5406	u8 bus, devfn;
5407	u16 did, sid;
5408	int ret = 0;
5409	u64 size = 0;
5410
5411	if (!inv_info || !dmar_domain ||
5412	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5413		return -EINVAL;
5414
5415	if (!dev || !dev_is_pci(dev))
5416		return -ENODEV;
5417
5418	iommu = device_to_iommu(dev, &bus, &devfn);
5419	if (!iommu)
5420		return -ENODEV;
5421
5422	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5423		return -EINVAL;
5424
5425	spin_lock_irqsave(&device_domain_lock, flags);
5426	spin_lock(&iommu->lock);
5427	info = get_domain_info(dev);
5428	if (!info) {
5429		ret = -EINVAL;
5430		goto out_unlock;
5431	}
5432	did = dmar_domain->iommu_did[iommu->seq_id];
5433	sid = PCI_DEVID(bus, devfn);
5434
5435	/* Size is only valid in address selective invalidation */
5436	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5437		size = to_vtd_size(inv_info->addr_info.granule_size,
5438				   inv_info->addr_info.nb_granules);
5439
5440	for_each_set_bit(cache_type,
5441			 (unsigned long *)&inv_info->cache,
5442			 IOMMU_CACHE_INV_TYPE_NR) {
5443		int granu = 0;
5444		u64 pasid = 0;
5445		u64 addr = 0;
5446
5447		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5448		if (granu == -EINVAL) {
5449			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5450					   cache_type, inv_info->granularity);
5451			break;
5452		}
5453
5454		/*
5455		 * PASID is stored in different locations based on the
5456		 * granularity.
5457		 */
5458		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5459		    (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5460			pasid = inv_info->pasid_info.pasid;
5461		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5462			 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5463			pasid = inv_info->addr_info.pasid;
5464
5465		switch (BIT(cache_type)) {
5466		case IOMMU_CACHE_INV_TYPE_IOTLB:
5467			/* HW will ignore LSB bits based on address mask */
5468			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5469			    size &&
5470			    (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5471				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5472						   inv_info->addr_info.addr, size);
5473			}
5474
5475			/*
5476			 * If granu is PASID-selective, address is ignored.
5477			 * We use npages = -1 to indicate that.
5478			 */
5479			qi_flush_piotlb(iommu, did, pasid,
5480					mm_to_dma_pfn(inv_info->addr_info.addr),
5481					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5482					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5483
5484			if (!info->ats_enabled)
5485				break;
5486			/*
5487			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5488			 * in the guest may assume IOTLB flush is inclusive,
5489			 * which is more efficient.
5490			 */
5491			fallthrough;
5492		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5493			/*
5494			 * PASID based device TLB invalidation does not support
5495			 * IOMMU_INV_GRANU_PASID granularity but only supports
5496			 * IOMMU_INV_GRANU_ADDR.
5497			 * The equivalent of that is we set the size to be the
5498			 * entire range of 64 bit. User only provides PASID info
5499			 * without address info. So we set addr to 0.
5500			 */
5501			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5502				size = 64 - VTD_PAGE_SHIFT;
5503				addr = 0;
5504			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5505				addr = inv_info->addr_info.addr;
5506			}
5507
5508			if (info->ats_enabled)
5509				qi_flush_dev_iotlb_pasid(iommu, sid,
5510						info->pfsid, pasid,
5511						info->ats_qdep, addr,
5512						size);
5513			else
5514				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5515			break;
5516		default:
5517			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5518					    cache_type);
5519			ret = -EINVAL;
5520		}
5521	}
5522out_unlock:
5523	spin_unlock(&iommu->lock);
5524	spin_unlock_irqrestore(&device_domain_lock, flags);
5525
5526	return ret;
5527}
5528#endif
5529
5530static int intel_iommu_map(struct iommu_domain *domain,
5531			   unsigned long iova, phys_addr_t hpa,
5532			   size_t size, int iommu_prot, gfp_t gfp)
5533{
5534	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5535	u64 max_addr;
5536	int prot = 0;
5537	int ret;
5538
5539	if (iommu_prot & IOMMU_READ)
5540		prot |= DMA_PTE_READ;
5541	if (iommu_prot & IOMMU_WRITE)
5542		prot |= DMA_PTE_WRITE;
5543	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5544		prot |= DMA_PTE_SNP;
5545
5546	max_addr = iova + size;
5547	if (dmar_domain->max_addr < max_addr) {
5548		u64 end;
5549
5550		/* check if minimum agaw is sufficient for mapped address */
5551		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5552		if (end < max_addr) {
5553			pr_err("%s: iommu width (%d) is not "
5554			       "sufficient for the mapped address (%llx)\n",
5555			       __func__, dmar_domain->gaw, max_addr);
5556			return -EFAULT;
5557		}
5558		dmar_domain->max_addr = max_addr;
5559	}
5560	/* Round up size to next multiple of PAGE_SIZE, if it and
5561	   the low bits of hpa would take us onto the next page */
5562	size = aligned_nrpages(hpa, size);
5563	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5564				 hpa >> VTD_PAGE_SHIFT, size, prot);
5565	return ret;
5566}
5567
5568static size_t intel_iommu_unmap(struct iommu_domain *domain,
5569				unsigned long iova, size_t size,
5570				struct iommu_iotlb_gather *gather)
5571{
5572	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573	struct page *freelist = NULL;
5574	unsigned long start_pfn, last_pfn;
5575	unsigned int npages;
5576	int iommu_id, level = 0;
5577
5578	/* Cope with horrid API which requires us to unmap more than the
5579	   size argument if it happens to be a large-page mapping. */
5580	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5581
5582	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5583		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5584
5585	start_pfn = iova >> VTD_PAGE_SHIFT;
5586	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5587
5588	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5589
5590	npages = last_pfn - start_pfn + 1;
5591
5592	for_each_domain_iommu(iommu_id, dmar_domain)
5593		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5594				      start_pfn, npages, !freelist, 0);
5595
5596	dma_free_pagelist(freelist);
5597
5598	if (dmar_domain->max_addr == iova + size)
5599		dmar_domain->max_addr = iova;
5600
5601	return size;
5602}
5603
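/*
 * Walk the domain page table and return the physical address that @iova
 * translates to, or 0 if no mapping is present.
 */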
5604static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5605					    dma_addr_t iova)
5606{
5607	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5608	struct dma_pte *pte;
5609	int level = 0;
5610	u64 phys = 0;
5611
5612	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5613	if (pte && dma_pte_present(pte))
5614		phys = dma_pte_addr(pte) +
5615			(iova & (BIT_MASK(level_to_offset_bits(level) +
5616						VTD_PAGE_SHIFT) - 1));
5617
5618	return phys;
5619}
5620
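/*
 * The three helpers below report a feature as supported only when every
 * active IOMMU in the system supports it (scalable mode, PASID and
 * nested translation respectively).
 */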
5621static inline bool scalable_mode_support(void)
5622{
5623	struct dmar_drhd_unit *drhd;
5624	struct intel_iommu *iommu;
5625	bool ret = true;
5626
5627	rcu_read_lock();
5628	for_each_active_iommu(iommu, drhd) {
5629		if (!sm_supported(iommu)) {
5630			ret = false;
5631			break;
5632		}
5633	}
5634	rcu_read_unlock();
5635
5636	return ret;
5637}
5638
5639static inline bool iommu_pasid_support(void)
5640{
5641	struct dmar_drhd_unit *drhd;
5642	struct intel_iommu *iommu;
5643	bool ret = true;
5644
5645	rcu_read_lock();
5646	for_each_active_iommu(iommu, drhd) {
5647		if (!pasid_supported(iommu)) {
5648			ret = false;
5649			break;
5650		}
5651	}
5652	rcu_read_unlock();
5653
5654	return ret;
5655}
5656
5657static inline bool nested_mode_support(void)
5658{
5659	struct dmar_drhd_unit *drhd;
5660	struct intel_iommu *iommu;
5661	bool ret = true;
5662
5663	rcu_read_lock();
5664	for_each_active_iommu(iommu, drhd) {
5665		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5666			ret = false;
5667			break;
5668		}
5669	}
5670	rcu_read_unlock();
5671
5672	return ret;
5673}
5674
5675static bool intel_iommu_capable(enum iommu_cap cap)
5676{
5677	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5678		return domain_update_iommu_snooping(NULL) == 1;
5679	if (cap == IOMMU_CAP_INTR_REMAP)
5680		return irq_remapping_enabled == 1;
5681
5682	return false;
5683}
5684
5685static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5686{
5687	struct intel_iommu *iommu;
5688
5689	iommu = device_to_iommu(dev, NULL, NULL);
5690	if (!iommu)
5691		return ERR_PTR(-ENODEV);
5692
5693	if (translation_pre_enabled(iommu))
5694		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5695
5696	return &iommu->iommu;
5697}
5698
5699static void intel_iommu_release_device(struct device *dev)
5700{
5701	struct intel_iommu *iommu;
5702
5703	iommu = device_to_iommu(dev, NULL, NULL);
5704	if (!iommu)
5705		return;
5706
5707	dmar_remove_one_dev_info(dev);
5708
5709	set_dma_ops(dev, NULL);
5710}
5711
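/*
 * Choose the DMA ops once the default domain is known: bounce-buffered
 * ops when device_needs_bounce(), the Intel IOMMU DMA ops for devices in
 * a DMA domain, and the default direct ops otherwise.
 */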
5712static void intel_iommu_probe_finalize(struct device *dev)
5713{
5714	struct iommu_domain *domain;
5715
5716	domain = iommu_get_domain_for_dev(dev);
5717	if (device_needs_bounce(dev))
5718		set_dma_ops(dev, &bounce_dma_ops);
5719	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5720		set_dma_ops(dev, &intel_dma_ops);
5721	else
5722		set_dma_ops(dev, NULL);
5723}
5724
5725static void intel_iommu_get_resv_regions(struct device *device,
5726					 struct list_head *head)
5727{
5728	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5729	struct iommu_resv_region *reg;
5730	struct dmar_rmrr_unit *rmrr;
5731	struct device *i_dev;
5732	int i;
5733
5734	down_read(&dmar_global_lock);
5735	for_each_rmrr_units(rmrr) {
5736		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5737					  i, i_dev) {
5738			struct iommu_resv_region *resv;
5739			enum iommu_resv_type type;
5740			size_t length;
5741
5742			if (i_dev != device &&
5743			    !is_downstream_to_pci_bridge(device, i_dev))
5744				continue;
5745
5746			length = rmrr->end_address - rmrr->base_address + 1;
5747
5748			type = device_rmrr_is_relaxable(device) ?
5749				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5750
5751			resv = iommu_alloc_resv_region(rmrr->base_address,
5752						       length, prot, type);
5753			if (!resv)
5754				break;
5755
5756			list_add_tail(&resv->list, head);
5757		}
5758	}
5759	up_read(&dmar_global_lock);
5760
5761#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5762	if (dev_is_pci(device)) {
5763		struct pci_dev *pdev = to_pci_dev(device);
5764
5765		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5766			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5767						   IOMMU_RESV_DIRECT_RELAXABLE);
5768			if (reg)
5769				list_add_tail(&reg->list, head);
5770		}
5771	}
5772#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5773
5774	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5775				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5776				      0, IOMMU_RESV_MSI);
5777	if (!reg)
5778		return;
5779	list_add_tail(&reg->list, head);
5780}
5781
5782int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5783{
5784	struct device_domain_info *info;
5785	struct context_entry *context;
5786	struct dmar_domain *domain;
5787	unsigned long flags;
5788	u64 ctx_lo;
5789	int ret;
5790
5791	domain = find_domain(dev);
5792	if (!domain)
5793		return -EINVAL;
5794
5795	spin_lock_irqsave(&device_domain_lock, flags);
5796	spin_lock(&iommu->lock);
5797
5798	ret = -EINVAL;
5799	info = get_domain_info(dev);
5800	if (!info || !info->pasid_supported)
5801		goto out;
5802
5803	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5804	if (WARN_ON(!context))
5805		goto out;
5806
5807	ctx_lo = context[0].lo;
5808
5809	if (!(ctx_lo & CONTEXT_PASIDE)) {
5810		ctx_lo |= CONTEXT_PASIDE;
5811		context[0].lo = ctx_lo;
5812		wmb();
5813		iommu->flush.flush_context(iommu,
5814					   domain->iommu_did[iommu->seq_id],
5815					   PCI_DEVID(info->bus, info->devfn),
5816					   DMA_CCMD_MASK_NOBIT,
5817					   DMA_CCMD_DEVICE_INVL);
5818	}
5819
5820	/* Enable PASID support in the device, if it wasn't already */
5821	if (!info->pasid_enabled)
5822		iommu_enable_dev_iotlb(info);
5823
5824	ret = 0;
5825
5826 out:
5827	spin_unlock(&iommu->lock);
5828	spin_unlock_irqrestore(&device_domain_lock, flags);
5829
5830	return ret;
5831}
5832
5833static void intel_iommu_apply_resv_region(struct device *dev,
5834					  struct iommu_domain *domain,
5835					  struct iommu_resv_region *region)
5836{
5837	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5838	unsigned long start, end;
5839
5840	start = IOVA_PFN(region->start);
5841	end   = IOVA_PFN(region->start + region->length - 1);
5842
5843	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5844}
5845
5846static struct iommu_group *intel_iommu_device_group(struct device *dev)
5847{
5848	if (dev_is_pci(dev))
5849		return pci_device_group(dev);
5850	return generic_device_group(dev);
5851}
5852
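/*
 * Enable auxiliary domain support on @dev: requires a scalable-mode,
 * PASID-capable IOMMU; PASID is enabled for the device and auxd_enabled
 * is set in its device_domain_info.
 */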
5853static int intel_iommu_enable_auxd(struct device *dev)
5854{
5855	struct device_domain_info *info;
5856	struct intel_iommu *iommu;
5857	unsigned long flags;
5858	int ret;
5859
5860	iommu = device_to_iommu(dev, NULL, NULL);
5861	if (!iommu || dmar_disabled)
5862		return -EINVAL;
5863
5864	if (!sm_supported(iommu) || !pasid_supported(iommu))
5865		return -EINVAL;
5866
5867	ret = intel_iommu_enable_pasid(iommu, dev);
5868	if (ret)
5869		return -ENODEV;
5870
5871	spin_lock_irqsave(&device_domain_lock, flags);
5872	info = get_domain_info(dev);
5873	info->auxd_enabled = 1;
5874	spin_unlock_irqrestore(&device_domain_lock, flags);
5875
5876	return 0;
5877}
5878
5879static int intel_iommu_disable_auxd(struct device *dev)
5880{
5881	struct device_domain_info *info;
5882	unsigned long flags;
5883
5884	spin_lock_irqsave(&device_domain_lock, flags);
5885	info = get_domain_info(dev);
5886	if (!WARN_ON(!info))
5887		info->auxd_enabled = 0;
5888	spin_unlock_irqrestore(&device_domain_lock, flags);
5889
5890	return 0;
5891}
5892
5893/*
5894 * A PCI express designated vendor specific extended capability is defined
5895 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5896 * so that system software and tools can detect endpoint devices that
5897 * support Intel Scalable I/O Virtualization without a host driver dependency.
5898 *
5899 * Returns the address of the matching extended capability structure within
5900 * the device's PCI configuration space or 0 if the device does not support
5901 * it.
5902 */
5903static int siov_find_pci_dvsec(struct pci_dev *pdev)
5904{
5905	int pos;
5906	u16 vendor, id;
5907
5908	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5909	while (pos) {
5910		pci_read_config_word(pdev, pos + 4, &vendor);
5911		pci_read_config_word(pdev, pos + 8, &id);
5912		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5913			return pos;
5914
5915		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5916	}
5917
5918	return 0;
5919}
5920
5921static bool
5922intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5923{
5924	if (feat == IOMMU_DEV_FEAT_AUX) {
5925		int ret;
5926
5927		if (!dev_is_pci(dev) || dmar_disabled ||
5928		    !scalable_mode_support() || !iommu_pasid_support())
5929			return false;
5930
5931		ret = pci_pasid_features(to_pci_dev(dev));
5932		if (ret < 0)
5933			return false;
5934
5935		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5936	}
5937
5938	if (feat == IOMMU_DEV_FEAT_SVA) {
5939		struct device_domain_info *info = get_domain_info(dev);
5940
5941		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5942			info->pasid_supported && info->pri_supported &&
5943			info->ats_supported;
5944	}
5945
5946	return false;
5947}
5948
5949static int
5950intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5951{
5952	if (feat == IOMMU_DEV_FEAT_AUX)
5953		return intel_iommu_enable_auxd(dev);
5954
5955	if (feat == IOMMU_DEV_FEAT_SVA) {
5956		struct device_domain_info *info = get_domain_info(dev);
5957
5958		if (!info)
5959			return -EINVAL;
5960
5961		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5962			return 0;
5963	}
5964
5965	return -ENODEV;
5966}
5967
5968static int
5969intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5970{
5971	if (feat == IOMMU_DEV_FEAT_AUX)
5972		return intel_iommu_disable_auxd(dev);
5973
5974	return -ENODEV;
5975}
5976
5977static bool
5978intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5979{
5980	struct device_domain_info *info = get_domain_info(dev);
5981
5982	if (feat == IOMMU_DEV_FEAT_AUX)
5983		return scalable_mode_support() && info && info->auxd_enabled;
5984
5985	return false;
5986}
5987
5988static int
5989intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5990{
5991	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5992
5993	return dmar_domain->default_pasid > 0 ?
5994			dmar_domain->default_pasid : -EINVAL;
5995}
5996
5997static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5998					   struct device *dev)
5999{
6000	return attach_deferred(dev);
6001}
6002
6003static int
6004intel_iommu_domain_set_attr(struct iommu_domain *domain,
6005			    enum iommu_attr attr, void *data)
6006{
6007	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6008	unsigned long flags;
6009	int ret = 0;
6010
6011	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6012		return -EINVAL;
6013
6014	switch (attr) {
6015	case DOMAIN_ATTR_NESTING:
6016		spin_lock_irqsave(&device_domain_lock, flags);
6017		if (nested_mode_support() &&
6018		    list_empty(&dmar_domain->devices)) {
6019			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6020			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6021		} else {
6022			ret = -ENODEV;
6023		}
6024		spin_unlock_irqrestore(&device_domain_lock, flags);
6025		break;
6026	default:
6027		ret = -EINVAL;
6028		break;
6029	}
6030
6031	return ret;
6032}
6033
6034/*
6035 * Check that the device does not live on an external facing PCI port that is
6036 * marked as untrusted. Such devices should not be able to apply quirks and
6037 * thus not be able to bypass the IOMMU restrictions.
6038 */
6039static bool risky_device(struct pci_dev *pdev)
6040{
6041	if (pdev->untrusted) {
6042		pci_info(pdev,
6043			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6044			 pdev->vendor, pdev->device);
6045		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6046		return true;
6047	}
6048	return false;
6049}
6050
6051const struct iommu_ops intel_iommu_ops = {
6052	.capable		= intel_iommu_capable,
6053	.domain_alloc		= intel_iommu_domain_alloc,
6054	.domain_free		= intel_iommu_domain_free,
6055	.domain_set_attr	= intel_iommu_domain_set_attr,
6056	.attach_dev		= intel_iommu_attach_device,
6057	.detach_dev		= intel_iommu_detach_device,
6058	.aux_attach_dev		= intel_iommu_aux_attach_device,
6059	.aux_detach_dev		= intel_iommu_aux_detach_device,
6060	.aux_get_pasid		= intel_iommu_aux_get_pasid,
6061	.map			= intel_iommu_map,
6062	.unmap			= intel_iommu_unmap,
6063	.iova_to_phys		= intel_iommu_iova_to_phys,
6064	.probe_device		= intel_iommu_probe_device,
6065	.probe_finalize		= intel_iommu_probe_finalize,
6066	.release_device		= intel_iommu_release_device,
6067	.get_resv_regions	= intel_iommu_get_resv_regions,
6068	.put_resv_regions	= generic_iommu_put_resv_regions,
6069	.apply_resv_region	= intel_iommu_apply_resv_region,
6070	.device_group		= intel_iommu_device_group,
6071	.dev_has_feat		= intel_iommu_dev_has_feat,
6072	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
6073	.dev_enable_feat	= intel_iommu_dev_enable_feat,
6074	.dev_disable_feat	= intel_iommu_dev_disable_feat,
6075	.is_attach_deferred	= intel_iommu_is_attach_deferred,
6076	.def_domain_type	= device_def_domain_type,
6077	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
6078#ifdef CONFIG_INTEL_IOMMU_SVM
6079	.cache_invalidate	= intel_iommu_sva_invalidate,
6080	.sva_bind_gpasid	= intel_svm_bind_gpasid,
6081	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
6082	.sva_bind		= intel_svm_bind,
6083	.sva_unbind		= intel_svm_unbind,
6084	.sva_get_pasid		= intel_svm_get_pasid,
6085	.page_response		= intel_svm_page_response,
6086#endif
6087};
6088
6089static void quirk_iommu_igfx(struct pci_dev *dev)
6090{
6091	if (risky_device(dev))
6092		return;
6093
6094	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6095	dmar_map_gfx = 0;
6096}
6097
6098/* G4x/GM45 integrated gfx dmar support is totally busted. */
6099DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6100DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6101DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6102DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6103DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6104DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6105DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6106
6107/* Broadwell igfx malfunctions with dmar */
6108DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6109DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6110DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6111DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6112DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6113DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6114DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6115DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6116DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6117DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6118DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6119DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6120DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6121DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6122DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6123DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6124DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6125DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6126DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6127DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6128DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6129DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6130DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6131DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6132
6133static void quirk_iommu_rwbf(struct pci_dev *dev)
6134{
6135	if (risky_device(dev))
6136		return;
6137
6138	/*
6139	 * Mobile 4 Series Chipset neglects to set RWBF capability,
6140	 * but needs it. Same seems to hold for the desktop versions.
6141	 */
6142	pci_info(dev, "Forcing write-buffer flush capability\n");
6143	rwbf_quirk = 1;
6144}
6145
6146DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6147DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6148DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6149DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6150DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6151DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6153
6154#define GGC 0x52
6155#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
6156#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
6157#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
6158#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
6159#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
6160#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
6161#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
6162#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6163
6164static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6165{
6166	unsigned short ggc;
6167
6168	if (risky_device(dev))
6169		return;
6170
6171	if (pci_read_config_word(dev, GGC, &ggc))
6172		return;
6173
6174	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6175		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6176		dmar_map_gfx = 0;
6177	} else if (dmar_map_gfx) {
6178		/* we have to ensure the gfx device is idle before we flush */
6179		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6180		intel_iommu_strict = 1;
6181	}
6182}
6183DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6184DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6185DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6186DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6187
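/*
 * For the integrated graphics devices matched below, skip disabling
 * translation (the TE bit) by setting iommu_skip_te_disable.
 */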
6188static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6189{
6190	unsigned short ver;
6191
6192	if (!IS_GFX_DEVICE(dev))
6193		return;
6194
6195	ver = (dev->device >> 8) & 0xff;
6196	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6197	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6198	    ver != 0x9a)
6199		return;
6200
6201	if (risky_device(dev))
6202		return;
6203
6204	pci_info(dev, "Skip IOMMU disabling for graphics\n");
6205	iommu_skip_te_disable = 1;
6206}
6207DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6208
6209/* On Tylersburg chipsets, some BIOSes have been known to enable the
6210   ISOCH DMAR unit for the Azalia sound device, but not give it any
6211   TLB entries, which causes it to deadlock. Check for that.  We do
6212   this in a function called from init_dmars(), instead of in a PCI
6213   quirk, because we don't want to print the obnoxious "BIOS broken"
6214   message if VT-d is actually disabled.
6215*/
6216static void __init check_tylersburg_isoch(void)
6217{
6218	struct pci_dev *pdev;
6219	uint32_t vtisochctrl;
6220
6221	/* If there's no Azalia in the system anyway, forget it. */
6222	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6223	if (!pdev)
6224		return;
6225
6226	if (risky_device(pdev)) {
6227		pci_dev_put(pdev);
6228		return;
6229	}
6230
6231	pci_dev_put(pdev);
6232
6233	/* System Management Registers. Might be hidden, in which case
6234	   we can't do the sanity check. But that's OK, because the
6235	   known-broken BIOSes _don't_ actually hide it, so far. */
6236	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6237	if (!pdev)
6238		return;
6239
6240	if (risky_device(pdev)) {
6241		pci_dev_put(pdev);
6242		return;
6243	}
6244
6245	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6246		pci_dev_put(pdev);
6247		return;
6248	}
6249
6250	pci_dev_put(pdev);
6251
6252	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6253	if (vtisochctrl & 1)
6254		return;
6255
6256	/* Drop all bits other than the number of TLB entries */
6257	vtisochctrl &= 0x1c;
6258
6259	/* If we have the recommended number of TLB entries (16), fine. */
6260	if (vtisochctrl == 0x10)
6261		return;
6262
6263	/* Zero TLB entries? You get to ride the short bus to school. */
6264	if (!vtisochctrl) {
6265		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6266		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6267		     dmi_get_system_info(DMI_BIOS_VENDOR),
6268		     dmi_get_system_info(DMI_BIOS_VERSION),
6269		     dmi_get_system_info(DMI_PRODUCT_VERSION));
6270		iommu_identity_mapping |= IDENTMAP_AZALIA;
6271		return;
6272	}
6273
6274	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6275	       vtisochctrl);
6276}