   1/*
   2 * Copyright © 2006-2014 Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * Authors: David Woodhouse <dwmw2@infradead.org>,
  14 *          Ashok Raj <ashok.raj@intel.com>,
  15 *          Shaohua Li <shaohua.li@intel.com>,
  16 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17 *          Fenghua Yu <fenghua.yu@intel.com>
  18 *          Joerg Roedel <jroedel@suse.de>
  19 */
  20
  21#define pr_fmt(fmt)     "DMAR: " fmt
  22
  23#include <linux/init.h>
  24#include <linux/bitmap.h>
  25#include <linux/debugfs.h>
  26#include <linux/export.h>
  27#include <linux/slab.h>
  28#include <linux/irq.h>
  29#include <linux/interrupt.h>
  30#include <linux/spinlock.h>
  31#include <linux/pci.h>
  32#include <linux/dmar.h>
  33#include <linux/dma-mapping.h>
  34#include <linux/dma-direct.h>
  35#include <linux/mempool.h>
  36#include <linux/memory.h>
  37#include <linux/cpu.h>
  38#include <linux/timer.h>
  39#include <linux/io.h>
  40#include <linux/iova.h>
  41#include <linux/iommu.h>
  42#include <linux/intel-iommu.h>
  43#include <linux/syscore_ops.h>
  44#include <linux/tboot.h>
  45#include <linux/dmi.h>
  46#include <linux/pci-ats.h>
  47#include <linux/memblock.h>
  48#include <linux/dma-contiguous.h>
  50#include <linux/crash_dump.h>
  51#include <asm/irq_remapping.h>
  52#include <asm/cacheflush.h>
  53#include <asm/iommu.h>
  54
  55#include "irq_remapping.h"
  56
  57#define ROOT_SIZE		VTD_PAGE_SIZE
  58#define CONTEXT_SIZE		VTD_PAGE_SIZE
  59
  60#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  61#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  62#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  63#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  64
  65#define IOAPIC_RANGE_START	(0xfee00000)
  66#define IOAPIC_RANGE_END	(0xfeefffff)
  67#define IOVA_START_ADDR		(0x1000)
  68
  69#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  70
  71#define MAX_AGAW_WIDTH 64
  72#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  73
  74#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  75#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  76
  77/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  78   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  79#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
  80				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  81#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
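
/*
 * Editor's illustrative note (not part of the original file): with
 * VTD_PAGE_SHIFT == 12 and a 48-bit guest address width,
 *
 *   __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1 = 0xFFFFFFFFF
 *
 * and DOMAIN_MAX_PFN(48) clamps that to what fits in an unsigned long,
 * which on 64-bit builds is still 0xFFFFFFFFF.
 */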
  82
  83/* IO virtual address start page frame number */
  84#define IOVA_START_PFN		(1)
  85
  86#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
  87
  88/* page table handling */
  89#define LEVEL_STRIDE		(9)
  90#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
  91
  92/*
  93 * This bitmap is used to advertise the page sizes our hardware supports
  94 * to the IOMMU core, which will then use this information to split
  95 * physically contiguous memory regions it is mapping into page sizes
  96 * that we support.
  97 *
  98 * Traditionally the IOMMU core just handed us the mappings directly,
  99 * after making sure the size is a power-of-two multiple of 4KiB and that the
 100 * mapping has natural alignment.
 101 *
 102 * To retain this behavior, we currently advertise that we support
 103 * all page sizes that are a power-of-two multiple of 4KiB.
 104 *
 105 * If at some point we'd like to utilize the IOMMU core's new behavior,
 106 * we could change this to advertise the real page sizes we support.
 107 */
 108#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
 109
 110static inline int agaw_to_level(int agaw)
 111{
 112	return agaw + 2;
 113}
 114
 115static inline int agaw_to_width(int agaw)
 116{
 117	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118}
 119
 120static inline int width_to_agaw(int width)
 121{
 122	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123}
 124
 125static inline unsigned int level_to_offset_bits(int level)
 126{
 127	return (level - 1) * LEVEL_STRIDE;
 128}
 129
 130static inline int pfn_level_offset(unsigned long pfn, int level)
 131{
 132	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133}
 134
 135static inline unsigned long level_mask(int level)
 136{
 137	return -1UL << level_to_offset_bits(level);
 138}
 139
 140static inline unsigned long level_size(int level)
 141{
 142	return 1UL << level_to_offset_bits(level);
 143}
 144
 145static inline unsigned long align_to_level(unsigned long pfn, int level)
 146{
 147	return (pfn + level_size(level) - 1) & level_mask(level);
 148}
 149
 150static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151{
 152	return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153}
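
/*
 * Editor's illustrative note (not part of the original file): worked
 * numbers for the helpers above, assuming a 48-bit adjusted guest
 * address width and VTD_PAGE_SHIFT == 12:
 *
 *   width_to_agaw(48)        = DIV_ROUND_UP(48 - 30, 9) = 2
 *   agaw_to_level(2)         = 4      (a four-level page table)
 *   agaw_to_width(2)         = min(30 + 2 * 9, 64) = 48
 *   level_to_offset_bits(2)  = 9
 *   pfn_level_offset(pfn, 2) = (pfn >> 9) & 0x1ff
 *   level_size(2)            = 512    (4KiB pages, i.e. one 2MiB block)
 *   lvl_to_nr_pages(2)       = 512
 */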
 154
 155/* VT-d pages must never be _larger_ than MM pages. Otherwise things
 156   are never going to work. */
 157static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158{
 159	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160}
 161
 162static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163{
 164	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165}
 166static inline unsigned long page_to_dma_pfn(struct page *pg)
 167{
 168	return mm_to_dma_pfn(page_to_pfn(pg));
 169}
 170static inline unsigned long virt_to_dma_pfn(void *p)
 171{
 172	return page_to_dma_pfn(virt_to_page(p));
 173}
 174
 175/* global iommu list, set NULL for ignored DMAR units */
 176static struct intel_iommu **g_iommus;
 177
 178static void __init check_tylersburg_isoch(void);
 179static int rwbf_quirk;
 180
 181/*
 182 * set to 1 to panic the kernel if VT-d can't be enabled successfully
 183 * (used when kernel is launched w/ TXT)
 184 */
 185static int force_on = 0;
 186int intel_iommu_tboot_noforce;
 187
 188/*
 189 * 0: Present
 190 * 1-11: Reserved
 191 * 12-63: Context Ptr (12 - (haw-1))
 192 * 64-127: Reserved
 193 */
 194struct root_entry {
 195	u64	lo;
 196	u64	hi;
 197};
 198#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 199
 200/*
 201 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 202 * if marked present.
 203 */
 204static phys_addr_t root_entry_lctp(struct root_entry *re)
 205{
 206	if (!(re->lo & 1))
 207		return 0;
 208
 209	return re->lo & VTD_PAGE_MASK;
 210}
 211
 212/*
 213 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 214 * if marked present.
 215 */
 216static phys_addr_t root_entry_uctp(struct root_entry *re)
 217{
 218	if (!(re->hi & 1))
 219		return 0;
 220
 221	return re->hi & VTD_PAGE_MASK;
 222}
 223/*
 224 * low 64 bits:
 225 * 0: present
 226 * 1: fault processing disable
 227 * 2-3: translation type
 228 * 12-63: address space root
 229 * high 64 bits:
 230 * 0-2: address width
 231 * 3-6: available
 232 * 8-23: domain id
 233 */
 234struct context_entry {
 235	u64 lo;
 236	u64 hi;
 237};
 238
 239static inline void context_clear_pasid_enable(struct context_entry *context)
 240{
 241	context->lo &= ~(1ULL << 11);
 242}
 243
 244static inline bool context_pasid_enabled(struct context_entry *context)
 245{
 246	return !!(context->lo & (1ULL << 11));
 247}
 248
 249static inline void context_set_copied(struct context_entry *context)
 250{
 251	context->hi |= (1ull << 3);
 252}
 253
 254static inline bool context_copied(struct context_entry *context)
 255{
 256	return !!(context->hi & (1ULL << 3));
 257}
 258
 259static inline bool __context_present(struct context_entry *context)
 260{
 261	return (context->lo & 1);
 262}
 263
 264static inline bool context_present(struct context_entry *context)
 265{
 266	return context_pasid_enabled(context) ?
 267	     __context_present(context) :
 268	     __context_present(context) && !context_copied(context);
 269}
 270
 271static inline void context_set_present(struct context_entry *context)
 272{
 273	context->lo |= 1;
 274}
 275
 276static inline void context_set_fault_enable(struct context_entry *context)
 277{
 278	context->lo &= (((u64)-1) << 2) | 1;
 279}
 280
 281static inline void context_set_translation_type(struct context_entry *context,
 282						unsigned long value)
 283{
 284	context->lo &= (((u64)-1) << 4) | 3;
 285	context->lo |= (value & 3) << 2;
 286}
 287
 288static inline void context_set_address_root(struct context_entry *context,
 289					    unsigned long value)
 290{
 291	context->lo &= ~VTD_PAGE_MASK;
 292	context->lo |= value & VTD_PAGE_MASK;
 293}
 294
 295static inline void context_set_address_width(struct context_entry *context,
 296					     unsigned long value)
 297{
 298	context->hi |= value & 7;
 299}
 300
 301static inline void context_set_domain_id(struct context_entry *context,
 302					 unsigned long value)
 303{
 304	context->hi |= (value & ((1 << 16) - 1)) << 8;
 305}
 306
 307static inline int context_domain_id(struct context_entry *c)
 308{
 309	return((c->hi >> 8) & 0xffff);
 310}
 311
 312static inline void context_clear_entry(struct context_entry *context)
 313{
 314	context->lo = 0;
 315	context->hi = 0;
 316}
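
/*
 * Editor's illustrative sketch (not part of the original file): how the
 * accessors above combine to build a present context entry. The driver
 * does this for real in domain_context_mapping_one() further down; the
 * helper name and the #if 0 guard here are purely illustrative, while
 * CONTEXT_TT_MULTI_LEVEL and virt_to_phys() are the usual kernel/VT-d
 * definitions.
 */
#if 0
static void example_build_context_entry(struct context_entry *context,
					struct dma_pte *pgd, u16 did, int agaw)
{
	context_clear_entry(context);
	context_set_domain_id(context, did);
	/* point the entry at the domain's page-table root */
	context_set_address_root(context, virt_to_phys(pgd));
	context_set_address_width(context, agaw);
	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(context);
	context_set_present(context);
}
#endif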
 317
 318/*
 319 * 0: readable
 320 * 1: writable
 321 * 2-6: reserved
 322 * 7: super page
 323 * 8-10: available
 324 * 11: snoop behavior
 325 * 12-63: Host physical address
 326 */
 327struct dma_pte {
 328	u64 val;
 329};
 330
 331static inline void dma_clear_pte(struct dma_pte *pte)
 332{
 333	pte->val = 0;
 334}
 335
 336static inline u64 dma_pte_addr(struct dma_pte *pte)
 337{
 338#ifdef CONFIG_64BIT
 339	return pte->val & VTD_PAGE_MASK;
 340#else
 341	/* Must have a full atomic 64-bit read */
 342	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 343#endif
 344}
 345
 346static inline bool dma_pte_present(struct dma_pte *pte)
 347{
 348	return (pte->val & 3) != 0;
 349}
 350
 351static inline bool dma_pte_superpage(struct dma_pte *pte)
 352{
 353	return (pte->val & DMA_PTE_LARGE_PAGE);
 354}
 355
 356static inline int first_pte_in_page(struct dma_pte *pte)
 357{
 358	return !((unsigned long)pte & ~VTD_PAGE_MASK);
 359}
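
/*
 * Editor's illustrative note (not part of the original file): assuming the
 * usual bit definitions from the Intel IOMMU header (DMA_PTE_READ = 1,
 * DMA_PTE_WRITE = 2, DMA_PTE_LARGE_PAGE = 1 << 7), a leaf PTE mapping a
 * 2MiB superpage at host physical address 0x40000000 with read/write
 * permission would hold
 *
 *   pte->val = 0x40000000 | DMA_PTE_READ | DMA_PTE_WRITE
 *            | DMA_PTE_LARGE_PAGE = 0x40000083
 *
 * for which dma_pte_present() and dma_pte_superpage() are true and
 * dma_pte_addr() returns 0x40000000.
 */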
 360
 361/*
 362 * This domain is a statically identity mapping domain.
 363 *	1. This domain creates a static 1:1 mapping to all usable memory.
 364 *	2. It maps to each iommu if successful.
 365 *	3. Each iommu maps to this domain if successful.
 366 */
 367static struct dmar_domain *si_domain;
 368static int hw_pass_through = 1;
 369
 370/*
 371 * Domain represents a virtual machine; more than one device
 372 * across iommus may be owned by one domain, e.g. a kvm guest.
 373 */
 374#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)
 375
 376/* si_domain contains multiple devices */
 377#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)
 378
 379#define for_each_domain_iommu(idx, domain)			\
 380	for (idx = 0; idx < g_num_of_iommus; idx++)		\
 381		if (domain->iommu_refcnt[idx])
 382
 383struct dmar_domain {
 384	int	nid;			/* node id */
 385
 386	unsigned	iommu_refcnt[DMAR_UNITS_SUPPORTED];
 387					/* Refcount of devices per iommu */
 388
 389
 390	u16		iommu_did[DMAR_UNITS_SUPPORTED];
 391					/* Domain ids per IOMMU. Use u16 since
 392					 * domain ids are 16 bit wide according
 393					 * to VT-d spec, section 9.3 */
 394
 395	bool has_iotlb_device;
 396	struct list_head devices;	/* all devices' list */
 397	struct iova_domain iovad;	/* iova's that belong to this domain */
 398
 399	struct dma_pte	*pgd;		/* virtual address */
 400	int		gaw;		/* max guest address width */
 401
 402	/* adjusted guest address width, 0 is level 2 30-bit */
 403	int		agaw;
 404
 405	int		flags;		/* flags to find out type of domain */
 406
 407	int		iommu_coherency;/* indicate coherency of iommu access */
 408	int		iommu_snooping; /* indicate snooping control feature*/
 409	int		iommu_count;	/* reference count of iommu */
 410	int		iommu_superpage;/* Level of superpages supported:
 411					   0 == 4KiB (no superpages), 1 == 2MiB,
 412					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 413	u64		max_addr;	/* maximum mapped address */
 414
 415	struct iommu_domain domain;	/* generic domain data structure for
 416					   iommu core */
 417};
 418
 419/* PCI domain-device relationship */
 420struct device_domain_info {
 421	struct list_head link;	/* link to domain siblings */
 422	struct list_head global; /* link to global list */
 423	u8 bus;			/* PCI bus number */
 424	u8 devfn;		/* PCI devfn number */
 425	u8 pasid_supported:3;
 426	u8 pasid_enabled:1;
 427	u8 pri_supported:1;
 428	u8 pri_enabled:1;
 429	u8 ats_supported:1;
 430	u8 ats_enabled:1;
 431	u8 ats_qdep;
 432	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
 433	struct intel_iommu *iommu; /* IOMMU used by this device */
 434	struct dmar_domain *domain; /* pointer to domain */
 435};
 436
 437struct dmar_rmrr_unit {
 438	struct list_head list;		/* list of rmrr units	*/
 439	struct acpi_dmar_header *hdr;	/* ACPI header		*/
 440	u64	base_address;		/* reserved base address*/
 441	u64	end_address;		/* reserved end address */
 442	struct dmar_dev_scope *devices;	/* target devices */
 443	int	devices_cnt;		/* target device count */
 444	struct iommu_resv_region *resv; /* reserved region handle */
 445};
 446
 447struct dmar_atsr_unit {
 448	struct list_head list;		/* list of ATSR units */
 449	struct acpi_dmar_header *hdr;	/* ACPI header */
 450	struct dmar_dev_scope *devices;	/* target devices */
 451	int devices_cnt;		/* target device count */
 452	u8 include_all:1;		/* include all ports */
 453};
 454
 455static LIST_HEAD(dmar_atsr_units);
 456static LIST_HEAD(dmar_rmrr_units);
 457
 458#define for_each_rmrr_units(rmrr) \
 459	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 460
 461/* number of registered IOMMUs, used to size and index g_iommus */
 462static int g_num_of_iommus;
 463
 464static void domain_exit(struct dmar_domain *domain);
 465static void domain_remove_dev_info(struct dmar_domain *domain);
 466static void dmar_remove_one_dev_info(struct dmar_domain *domain,
 467				     struct device *dev);
 468static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 469static void domain_context_clear(struct intel_iommu *iommu,
 470				 struct device *dev);
 471static int domain_detach_iommu(struct dmar_domain *domain,
 472			       struct intel_iommu *iommu);
 473
 474#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 475int dmar_disabled = 0;
 476#else
 477int dmar_disabled = 1;
 478#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 479
 480int intel_iommu_enabled = 0;
 481EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 482
 483static int dmar_map_gfx = 1;
 484static int dmar_forcedac;
 485static int intel_iommu_strict;
 486static int intel_iommu_superpage = 1;
 487static int intel_iommu_ecs = 1;
 488static int intel_iommu_pasid28;
 489static int iommu_identity_mapping;
 490
 491#define IDENTMAP_ALL		1
 492#define IDENTMAP_GFX		2
 493#define IDENTMAP_AZALIA		4
 494
 495/* Broadwell and Skylake have broken ECS support — normal so-called "second
 496 * level" translation of DMA requests-without-PASID doesn't actually happen
 497 * unless you also set the NESTE bit in an extended context-entry. Which of
 498 * course means that SVM doesn't work because it's trying to do nested
 499 * translation of the physical addresses it finds in the process page tables,
 500 * through the IOVA->phys mapping found in the "second level" page tables.
 501 *
 502 * The VT-d specification was retroactively amended to change the definition
 503 * of the capability bits and pretend that Broadwell/Skylake never happened...
 504 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 505 * for some reason it was the PASID capability bit which was redefined (from
 506 * bit 28 on BDW/SKL to bit 40 in future).
 507 *
 508 * So our test for ECS needs to eschew those implementations which set the old
 509 * PASID capability bit 28, since those are the ones on which ECS is broken.
 510 * Unless we are working around the 'pasid28' limitations, that is, by putting
 511 * the device into passthrough mode for normal DMA and thus masking the bug.
 512 */
 513#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 514			    (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 515/* PASID support is thus enabled if ECS is enabled and *either* of the old
 516 * or new capability bits are set. */
 517#define pasid_enabled(iommu) (ecs_enabled(iommu) &&			\
 518			      (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 519
 520int intel_iommu_gfx_mapped;
 521EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 522
 523#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 524static DEFINE_SPINLOCK(device_domain_lock);
 525static LIST_HEAD(device_domain_list);
 526
 527const struct iommu_ops intel_iommu_ops;
 528
 529static bool translation_pre_enabled(struct intel_iommu *iommu)
 530{
 531	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 532}
 533
 534static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 535{
 536	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 537}
 538
 539static void init_translation_status(struct intel_iommu *iommu)
 540{
 541	u32 gsts;
 542
 543	gsts = readl(iommu->reg + DMAR_GSTS_REG);
 544	if (gsts & DMA_GSTS_TES)
 545		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 546}
 547
 548/* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
 549static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 550{
 551	return container_of(dom, struct dmar_domain, domain);
 552}
 553
 554static int __init intel_iommu_setup(char *str)
 555{
 556	if (!str)
 557		return -EINVAL;
 558	while (*str) {
 559		if (!strncmp(str, "on", 2)) {
 560			dmar_disabled = 0;
 561			pr_info("IOMMU enabled\n");
 562		} else if (!strncmp(str, "off", 3)) {
 563			dmar_disabled = 1;
 564			pr_info("IOMMU disabled\n");
 565		} else if (!strncmp(str, "igfx_off", 8)) {
 566			dmar_map_gfx = 0;
 567			pr_info("Disable GFX device mapping\n");
 568		} else if (!strncmp(str, "forcedac", 8)) {
 569			pr_info("Forcing DAC for PCI devices\n");
 570			dmar_forcedac = 1;
 571		} else if (!strncmp(str, "strict", 6)) {
 572			pr_info("Disable batched IOTLB flush\n");
 573			intel_iommu_strict = 1;
 574		} else if (!strncmp(str, "sp_off", 6)) {
 575			pr_info("Disable supported super page\n");
 576			intel_iommu_superpage = 0;
 577		} else if (!strncmp(str, "ecs_off", 7)) {
 578			printk(KERN_INFO
 579				"Intel-IOMMU: disable extended context table support\n");
 580			intel_iommu_ecs = 0;
 581		} else if (!strncmp(str, "pasid28", 7)) {
 582			printk(KERN_INFO
 583				"Intel-IOMMU: enable pre-production PASID support\n");
 584			intel_iommu_pasid28 = 1;
 585			iommu_identity_mapping |= IDENTMAP_GFX;
 586		} else if (!strncmp(str, "tboot_noforce", 13)) {
 587			printk(KERN_INFO
 588				"Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 589			intel_iommu_tboot_noforce = 1;
 590		}
 591
 592		str += strcspn(str, ",");
 593		while (*str == ',')
 594			str++;
 595	}
 596	return 0;
 597}
 598__setup("intel_iommu=", intel_iommu_setup);
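
/*
 * Editor's illustrative note (not part of the original file): the parser
 * above takes a comma-separated option list on the kernel command line,
 * for example
 *
 *   intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage support.
 */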
 599
 600static struct kmem_cache *iommu_domain_cache;
 601static struct kmem_cache *iommu_devinfo_cache;
 602
 603static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 604{
 605	struct dmar_domain **domains;
 606	int idx = did >> 8;
 607
 608	domains = iommu->domains[idx];
 609	if (!domains)
 610		return NULL;
 611
 612	return domains[did & 0xff];
 613}
 614
 615static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 616			     struct dmar_domain *domain)
 617{
 618	struct dmar_domain **domains;
 619	int idx = did >> 8;
 620
 621	if (!iommu->domains[idx]) {
 622		size_t size = 256 * sizeof(struct dmar_domain *);
 623		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 624	}
 625
 626	domains = iommu->domains[idx];
 627	if (WARN_ON(!domains))
 628		return;
 629	else
 630		domains[did & 0xff] = domain;
 631}
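
/*
 * Editor's illustrative note (not part of the original file): the two
 * helpers above treat iommu->domains as a two-level table of 256-entry
 * pages indexed by domain id, so did 0x1234 lives at
 * iommu->domains[0x12][0x34]; the second-level page is allocated lazily
 * on first use.
 */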
 632
 633static inline void *alloc_pgtable_page(int node)
 634{
 635	struct page *page;
 636	void *vaddr = NULL;
 637
 638	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 639	if (page)
 640		vaddr = page_address(page);
 641	return vaddr;
 642}
 643
 644static inline void free_pgtable_page(void *vaddr)
 645{
 646	free_page((unsigned long)vaddr);
 647}
 648
 649static inline void *alloc_domain_mem(void)
 650{
 651	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 652}
 653
 654static void free_domain_mem(void *vaddr)
 655{
 656	kmem_cache_free(iommu_domain_cache, vaddr);
 657}
 658
 659static inline void * alloc_devinfo_mem(void)
 660{
 661	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 662}
 663
 664static inline void free_devinfo_mem(void *vaddr)
 665{
 666	kmem_cache_free(iommu_devinfo_cache, vaddr);
 667}
 668
 669static inline int domain_type_is_vm(struct dmar_domain *domain)
 670{
 671	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 672}
 673
 674static inline int domain_type_is_si(struct dmar_domain *domain)
 675{
 676	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 677}
 678
 679static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 680{
 681	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 682				DOMAIN_FLAG_STATIC_IDENTITY);
 683}
 684
 685static inline int domain_pfn_supported(struct dmar_domain *domain,
 686				       unsigned long pfn)
 687{
 688	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 689
 690	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 691}
 692
 693static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 694{
 695	unsigned long sagaw;
 696	int agaw = -1;
 697
 698	sagaw = cap_sagaw(iommu->cap);
 699	for (agaw = width_to_agaw(max_gaw);
 700	     agaw >= 0; agaw--) {
 701		if (test_bit(agaw, &sagaw))
 702			break;
 703	}
 704
 705	return agaw;
 706}
 707
 708/*
 709 * Calculate max SAGAW for each iommu.
 710 */
 711int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 712{
 713	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 714}
 715
 716/*
 717 * Calculate agaw for each iommu.
 718 * "SAGAW" may be different across iommus; use a default agaw and fall
 719 * back to a smaller supported agaw for iommus that don't support the default.
 720 */
 721int iommu_calculate_agaw(struct intel_iommu *iommu)
 722{
 723	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 724}
 725
 726/* This function only returns a single iommu in a domain */
 727static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 728{
 729	int iommu_id;
 730
 731	/* si_domain and vm domain should not get here. */
 732	BUG_ON(domain_type_is_vm_or_si(domain));
 733	for_each_domain_iommu(iommu_id, domain)
 734		break;
 735
 736	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 737		return NULL;
 738
 739	return g_iommus[iommu_id];
 740}
 741
 742static void domain_update_iommu_coherency(struct dmar_domain *domain)
 743{
 744	struct dmar_drhd_unit *drhd;
 745	struct intel_iommu *iommu;
 746	bool found = false;
 747	int i;
 748
 749	domain->iommu_coherency = 1;
 750
 751	for_each_domain_iommu(i, domain) {
 752		found = true;
 753		if (!ecap_coherent(g_iommus[i]->ecap)) {
 754			domain->iommu_coherency = 0;
 755			break;
 756		}
 757	}
 758	if (found)
 759		return;
 760
 761	/* No hardware attached; use lowest common denominator */
 762	rcu_read_lock();
 763	for_each_active_iommu(iommu, drhd) {
 764		if (!ecap_coherent(iommu->ecap)) {
 765			domain->iommu_coherency = 0;
 766			break;
 767		}
 768	}
 769	rcu_read_unlock();
 770}
 771
 772static int domain_update_iommu_snooping(struct intel_iommu *skip)
 773{
 774	struct dmar_drhd_unit *drhd;
 775	struct intel_iommu *iommu;
 776	int ret = 1;
 777
 778	rcu_read_lock();
 779	for_each_active_iommu(iommu, drhd) {
 780		if (iommu != skip) {
 781			if (!ecap_sc_support(iommu->ecap)) {
 782				ret = 0;
 783				break;
 784			}
 785		}
 786	}
 787	rcu_read_unlock();
 788
 789	return ret;
 790}
 791
 792static int domain_update_iommu_superpage(struct intel_iommu *skip)
 793{
 794	struct dmar_drhd_unit *drhd;
 795	struct intel_iommu *iommu;
 796	int mask = 0xf;
 797
 798	if (!intel_iommu_superpage) {
 799		return 0;
 800	}
 801
 802	/* set iommu_superpage to the smallest common denominator */
 803	rcu_read_lock();
 804	for_each_active_iommu(iommu, drhd) {
 805		if (iommu != skip) {
 806			mask &= cap_super_page_val(iommu->cap);
 807			if (!mask)
 808				break;
 809		}
 810	}
 811	rcu_read_unlock();
 812
 813	return fls(mask);
 814}
 815
 816/* Some capabilities may be different across iommus */
 817static void domain_update_iommu_cap(struct dmar_domain *domain)
 818{
 819	domain_update_iommu_coherency(domain);
 820	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 821	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 822}
 823
 824static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 825						       u8 bus, u8 devfn, int alloc)
 826{
 827	struct root_entry *root = &iommu->root_entry[bus];
 828	struct context_entry *context;
 829	u64 *entry;
 830
 831	entry = &root->lo;
 832	if (ecs_enabled(iommu)) {
 833		if (devfn >= 0x80) {
 834			devfn -= 0x80;
 835			entry = &root->hi;
 836		}
 837		devfn *= 2;
 838	}
 839	if (*entry & 1)
 840		context = phys_to_virt(*entry & VTD_PAGE_MASK);
 841	else {
 842		unsigned long phy_addr;
 843		if (!alloc)
 844			return NULL;
 845
 846		context = alloc_pgtable_page(iommu->node);
 847		if (!context)
 848			return NULL;
 849
 850		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 851		phy_addr = virt_to_phys((void *)context);
 852		*entry = phy_addr | 1;
 853		__iommu_flush_cache(iommu, entry, sizeof(*entry));
 854	}
 855	return &context[devfn];
 856}
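
/*
 * Editor's illustrative note (not part of the original file): with ECS
 * enabled, extended context entries are twice the size of regular ones,
 * so only devfns 0-127 fit in the table referenced by root->lo; devfn
 * 0x85, for example, is redirected to the upper table (root->hi) and
 * indexed at slot (0x85 - 0x80) * 2 = 0x0a in units of the 128-bit
 * struct context_entry.
 */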
 857
 858static int iommu_dummy(struct device *dev)
 859{
 860	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 861}
 862
 863static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 864{
 865	struct dmar_drhd_unit *drhd = NULL;
 866	struct intel_iommu *iommu;
 867	struct device *tmp;
 868	struct pci_dev *ptmp, *pdev = NULL;
 869	u16 segment = 0;
 870	int i;
 871
 872	if (iommu_dummy(dev))
 873		return NULL;
 874
 875	if (dev_is_pci(dev)) {
 876		struct pci_dev *pf_pdev;
 877
 878		pdev = to_pci_dev(dev);
 879
 880#ifdef CONFIG_X86
 881		/* VMD child devices currently cannot be handled individually */
 882		if (is_vmd(pdev->bus))
 883			return NULL;
 884#endif
 885
 886		/* VFs aren't listed in scope tables; we need to look up
 887		 * the PF instead to find the IOMMU. */
 888		pf_pdev = pci_physfn(pdev);
 889		dev = &pf_pdev->dev;
 890		segment = pci_domain_nr(pdev->bus);
 891	} else if (has_acpi_companion(dev))
 892		dev = &ACPI_COMPANION(dev)->dev;
 893
 894	rcu_read_lock();
 895	for_each_active_iommu(iommu, drhd) {
 896		if (pdev && segment != drhd->segment)
 897			continue;
 898
 899		for_each_active_dev_scope(drhd->devices,
 900					  drhd->devices_cnt, i, tmp) {
 901			if (tmp == dev) {
 902				/* For a VF use its original BDF# not that of the PF
 903				 * which we used for the IOMMU lookup. Strictly speaking
 904				 * we could do this for all PCI devices; we only need to
 905				 * get the BDF# from the scope table for ACPI matches. */
 906				if (pdev && pdev->is_virtfn)
 907					goto got_pdev;
 908
 909				*bus = drhd->devices[i].bus;
 910				*devfn = drhd->devices[i].devfn;
 911				goto out;
 912			}
 913
 914			if (!pdev || !dev_is_pci(tmp))
 915				continue;
 916
 917			ptmp = to_pci_dev(tmp);
 918			if (ptmp->subordinate &&
 919			    ptmp->subordinate->number <= pdev->bus->number &&
 920			    ptmp->subordinate->busn_res.end >= pdev->bus->number)
 921				goto got_pdev;
 922		}
 923
 924		if (pdev && drhd->include_all) {
 925		got_pdev:
 926			*bus = pdev->bus->number;
 927			*devfn = pdev->devfn;
 928			goto out;
 929		}
 930	}
 931	iommu = NULL;
 932 out:
 933	rcu_read_unlock();
 934
 935	return iommu;
 936}
 937
 938static void domain_flush_cache(struct dmar_domain *domain,
 939			       void *addr, int size)
 940{
 941	if (!domain->iommu_coherency)
 942		clflush_cache_range(addr, size);
 943}
 944
 945static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 946{
 947	struct context_entry *context;
 948	int ret = 0;
 949	unsigned long flags;
 950
 951	spin_lock_irqsave(&iommu->lock, flags);
 952	context = iommu_context_addr(iommu, bus, devfn, 0);
 953	if (context)
 954		ret = context_present(context);
 955	spin_unlock_irqrestore(&iommu->lock, flags);
 956	return ret;
 957}
 958
 959static void free_context_table(struct intel_iommu *iommu)
 960{
 961	int i;
 962	unsigned long flags;
 963	struct context_entry *context;
 964
 965	spin_lock_irqsave(&iommu->lock, flags);
 966	if (!iommu->root_entry) {
 967		goto out;
 968	}
 969	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 970		context = iommu_context_addr(iommu, i, 0, 0);
 971		if (context)
 972			free_pgtable_page(context);
 973
 974		if (!ecs_enabled(iommu))
 975			continue;
 976
 977		context = iommu_context_addr(iommu, i, 0x80, 0);
 978		if (context)
 979			free_pgtable_page(context);
 980
 981	}
 982	free_pgtable_page(iommu->root_entry);
 983	iommu->root_entry = NULL;
 984out:
 985	spin_unlock_irqrestore(&iommu->lock, flags);
 986}
 987
 988static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 989				      unsigned long pfn, int *target_level)
 990{
 991	struct dma_pte *parent, *pte = NULL;
 992	int level = agaw_to_level(domain->agaw);
 993	int offset;
 994
 995	BUG_ON(!domain->pgd);
 996
 997	if (!domain_pfn_supported(domain, pfn))
 998		/* Address beyond IOMMU's addressing capabilities. */
 999		return NULL;
1000
1001	parent = domain->pgd;
1002
1003	while (1) {
1004		void *tmp_page;
1005
1006		offset = pfn_level_offset(pfn, level);
1007		pte = &parent[offset];
1008		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1009			break;
1010		if (level == *target_level)
1011			break;
1012
1013		if (!dma_pte_present(pte)) {
1014			uint64_t pteval;
1015
1016			tmp_page = alloc_pgtable_page(domain->nid);
1017
1018			if (!tmp_page)
1019				return NULL;
1020
1021			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1022			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1023			if (cmpxchg64(&pte->val, 0ULL, pteval))
1024				/* Someone else set it while we were thinking; use theirs. */
1025				free_pgtable_page(tmp_page);
1026			else
1027				domain_flush_cache(domain, pte, sizeof(*pte));
1028		}
1029		if (level == 1)
1030			break;
1031
1032		parent = phys_to_virt(dma_pte_addr(pte));
1033		level--;
1034	}
1035
1036	if (!*target_level)
1037		*target_level = level;
1038
1039	return pte;
1040}
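
/*
 * Editor's illustrative note (not part of the original file): *target_level
 * selects the behaviour of the walk above. 0 performs a lookup only,
 * stopping at the first superpage or non-present entry and reporting the
 * level reached back through *target_level; 1 walks (and allocates missing
 * page tables) down to a 4KiB leaf PTE; 2 stops at the 2MiB superpage
 * level.
 */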
1041
1042
1043/* return address's pte at specific level */
1044static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1045					 unsigned long pfn,
1046					 int level, int *large_page)
1047{
1048	struct dma_pte *parent, *pte = NULL;
1049	int total = agaw_to_level(domain->agaw);
1050	int offset;
1051
1052	parent = domain->pgd;
1053	while (level <= total) {
1054		offset = pfn_level_offset(pfn, total);
1055		pte = &parent[offset];
1056		if (level == total)
1057			return pte;
1058
1059		if (!dma_pte_present(pte)) {
1060			*large_page = total;
1061			break;
1062		}
1063
1064		if (dma_pte_superpage(pte)) {
1065			*large_page = total;
1066			return pte;
1067		}
1068
1069		parent = phys_to_virt(dma_pte_addr(pte));
1070		total--;
1071	}
1072	return NULL;
1073}
1074
1075/* clear last level pte; a tlb flush should follow */
1076static void dma_pte_clear_range(struct dmar_domain *domain,
1077				unsigned long start_pfn,
1078				unsigned long last_pfn)
1079{
1080	unsigned int large_page = 1;
1081	struct dma_pte *first_pte, *pte;
1082
1083	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1084	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1085	BUG_ON(start_pfn > last_pfn);
1086
1087	/* we don't need lock here; nobody else touches the iova range */
1088	do {
1089		large_page = 1;
1090		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1091		if (!pte) {
1092			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1093			continue;
1094		}
1095		do {
1096			dma_clear_pte(pte);
1097			start_pfn += lvl_to_nr_pages(large_page);
1098			pte++;
1099		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1100
1101		domain_flush_cache(domain, first_pte,
1102				   (void *)pte - (void *)first_pte);
1103
1104	} while (start_pfn && start_pfn <= last_pfn);
1105}
1106
1107static void dma_pte_free_level(struct dmar_domain *domain, int level,
1108			       int retain_level, struct dma_pte *pte,
1109			       unsigned long pfn, unsigned long start_pfn,
1110			       unsigned long last_pfn)
1111{
1112	pfn = max(start_pfn, pfn);
1113	pte = &pte[pfn_level_offset(pfn, level)];
1114
1115	do {
1116		unsigned long level_pfn;
1117		struct dma_pte *level_pte;
1118
1119		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1120			goto next;
1121
1122		level_pfn = pfn & level_mask(level);
1123		level_pte = phys_to_virt(dma_pte_addr(pte));
1124
1125		if (level > 2) {
1126			dma_pte_free_level(domain, level - 1, retain_level,
1127					   level_pte, level_pfn, start_pfn,
1128					   last_pfn);
1129		}
1130
1131		/*
1132		 * Free the page table if we're below the level we want to
1133		 * retain and the range covers the entire table.
1134		 */
1135		if (level < retain_level && !(start_pfn > level_pfn ||
1136		      last_pfn < level_pfn + level_size(level) - 1)) {
1137			dma_clear_pte(pte);
1138			domain_flush_cache(domain, pte, sizeof(*pte));
1139			free_pgtable_page(level_pte);
1140		}
1141next:
1142		pfn += level_size(level);
1143	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1144}
1145
1146/*
1147 * clear last level (leaf) ptes and free page table pages below the
1148 * level we wish to keep intact.
1149 */
1150static void dma_pte_free_pagetable(struct dmar_domain *domain,
1151				   unsigned long start_pfn,
1152				   unsigned long last_pfn,
1153				   int retain_level)
1154{
1155	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1156	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1157	BUG_ON(start_pfn > last_pfn);
1158
1159	dma_pte_clear_range(domain, start_pfn, last_pfn);
1160
1161	/* We don't need lock here; nobody else touches the iova range */
1162	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1163			   domain->pgd, 0, start_pfn, last_pfn);
1164
1165	/* free pgd */
1166	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1167		free_pgtable_page(domain->pgd);
1168		domain->pgd = NULL;
1169	}
1170}
1171
1172/* When a page at a given level is being unlinked from its parent, we don't
1173   need to *modify* it at all. All we need to do is make a list of all the
1174   pages which can be freed just as soon as we've flushed the IOTLB and we
1175   know the hardware page-walk will no longer touch them.
1176   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1177   be freed. */
1178static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1179					    int level, struct dma_pte *pte,
1180					    struct page *freelist)
1181{
1182	struct page *pg;
1183
1184	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1185	pg->freelist = freelist;
1186	freelist = pg;
1187
1188	if (level == 1)
1189		return freelist;
1190
1191	pte = page_address(pg);
1192	do {
1193		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1194			freelist = dma_pte_list_pagetables(domain, level - 1,
1195							   pte, freelist);
1196		pte++;
1197	} while (!first_pte_in_page(pte));
1198
1199	return freelist;
1200}
1201
1202static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1203					struct dma_pte *pte, unsigned long pfn,
1204					unsigned long start_pfn,
1205					unsigned long last_pfn,
1206					struct page *freelist)
1207{
1208	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1209
1210	pfn = max(start_pfn, pfn);
1211	pte = &pte[pfn_level_offset(pfn, level)];
1212
1213	do {
1214		unsigned long level_pfn;
1215
1216		if (!dma_pte_present(pte))
1217			goto next;
1218
1219		level_pfn = pfn & level_mask(level);
1220
1221		/* If range covers entire pagetable, free it */
1222		if (start_pfn <= level_pfn &&
1223		    last_pfn >= level_pfn + level_size(level) - 1) {
1224			/* These subordinate page tables are going away entirely. Don't
1225			   bother to clear them; we're just going to *free* them. */
1226			if (level > 1 && !dma_pte_superpage(pte))
1227				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1228
1229			dma_clear_pte(pte);
1230			if (!first_pte)
1231				first_pte = pte;
1232			last_pte = pte;
1233		} else if (level > 1) {
1234			/* Recurse down into a level that isn't *entirely* obsolete */
1235			freelist = dma_pte_clear_level(domain, level - 1,
1236						       phys_to_virt(dma_pte_addr(pte)),
1237						       level_pfn, start_pfn, last_pfn,
1238						       freelist);
1239		}
1240next:
1241		pfn += level_size(level);
1242	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1243
1244	if (first_pte)
1245		domain_flush_cache(domain, first_pte,
1246				   (void *)++last_pte - (void *)first_pte);
1247
1248	return freelist;
1249}
1250
1251/* We can't just free the pages because the IOMMU may still be walking
1252   the page tables, and may have cached the intermediate levels. The
1253   pages can only be freed after the IOTLB flush has been done. */
1254static struct page *domain_unmap(struct dmar_domain *domain,
1255				 unsigned long start_pfn,
1256				 unsigned long last_pfn)
1257{
1258	struct page *freelist = NULL;
1259
1260	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1261	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1262	BUG_ON(start_pfn > last_pfn);
1263
1264	/* we don't need lock here; nobody else touches the iova range */
1265	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1266				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1267
1268	/* free pgd */
1269	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1270		struct page *pgd_page = virt_to_page(domain->pgd);
1271		pgd_page->freelist = freelist;
1272		freelist = pgd_page;
1273
1274		domain->pgd = NULL;
1275	}
1276
1277	return freelist;
1278}
1279
1280static void dma_free_pagelist(struct page *freelist)
1281{
1282	struct page *pg;
1283
1284	while ((pg = freelist)) {
1285		freelist = pg->freelist;
1286		free_pgtable_page(page_address(pg));
1287	}
1288}
1289
1290static void iova_entry_free(unsigned long data)
1291{
1292	struct page *freelist = (struct page *)data;
1293
1294	dma_free_pagelist(freelist);
1295}
1296
1297/* iommu handling */
1298static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1299{
1300	struct root_entry *root;
1301	unsigned long flags;
1302
1303	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1304	if (!root) {
1305		pr_err("Allocating root entry for %s failed\n",
1306			iommu->name);
1307		return -ENOMEM;
1308	}
1309
1310	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1311
1312	spin_lock_irqsave(&iommu->lock, flags);
1313	iommu->root_entry = root;
1314	spin_unlock_irqrestore(&iommu->lock, flags);
1315
1316	return 0;
1317}
1318
1319static void iommu_set_root_entry(struct intel_iommu *iommu)
1320{
1321	u64 addr;
1322	u32 sts;
1323	unsigned long flag;
1324
1325	addr = virt_to_phys(iommu->root_entry);
1326	if (ecs_enabled(iommu))
1327		addr |= DMA_RTADDR_RTT;
1328
1329	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1330	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1331
1332	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1333
1334	/* Make sure hardware completes it */
1335	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1336		      readl, (sts & DMA_GSTS_RTPS), sts);
1337
1338	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339}
1340
1341static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1342{
1343	u32 val;
1344	unsigned long flag;
1345
1346	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1347		return;
1348
1349	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1350	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1351
1352	/* Make sure hardware completes it */
1353	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1354		      readl, (!(val & DMA_GSTS_WBFS)), val);
1355
1356	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1357}
1358
1359/* return value determines if we need a write buffer flush */
1360static void __iommu_flush_context(struct intel_iommu *iommu,
1361				  u16 did, u16 source_id, u8 function_mask,
1362				  u64 type)
1363{
1364	u64 val = 0;
1365	unsigned long flag;
1366
1367	switch (type) {
1368	case DMA_CCMD_GLOBAL_INVL:
1369		val = DMA_CCMD_GLOBAL_INVL;
1370		break;
1371	case DMA_CCMD_DOMAIN_INVL:
1372		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1373		break;
1374	case DMA_CCMD_DEVICE_INVL:
1375		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1376			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1377		break;
1378	default:
1379		BUG();
1380	}
1381	val |= DMA_CCMD_ICC;
1382
1383	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1384	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1385
1386	/* Make sure hardware completes it */
1387	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1388		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1389
1390	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1391}
1392
1393/* return value determines if we need a write buffer flush */
1394static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1395				u64 addr, unsigned int size_order, u64 type)
1396{
1397	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1398	u64 val = 0, val_iva = 0;
1399	unsigned long flag;
1400
1401	switch (type) {
1402	case DMA_TLB_GLOBAL_FLUSH:
1403		/* global flush doesn't need to set IVA_REG */
1404		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1405		break;
1406	case DMA_TLB_DSI_FLUSH:
1407		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1408		break;
1409	case DMA_TLB_PSI_FLUSH:
1410		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1411		/* IH bit is passed in as part of address */
1412		val_iva = size_order | addr;
1413		break;
1414	default:
1415		BUG();
1416	}
1417	/* Note: set drain read/write */
1418#if 0
1419	/*
1420	 * This is probably just to be extra safe. It looks like we can
1421	 * ignore it without any impact.
1422	 */
1423	if (cap_read_drain(iommu->cap))
1424		val |= DMA_TLB_READ_DRAIN;
1425#endif
1426	if (cap_write_drain(iommu->cap))
1427		val |= DMA_TLB_WRITE_DRAIN;
1428
1429	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1430	/* Note: Only uses first TLB reg currently */
1431	if (val_iva)
1432		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1433	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1434
1435	/* Make sure hardware completes it */
1436	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1437		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1438
1439	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1440
1441	/* check IOTLB invalidation granularity */
1442	if (DMA_TLB_IAIG(val) == 0)
1443		pr_err("Flush IOTLB failed\n");
1444	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1445		pr_debug("TLB flush request %Lx, actual %Lx\n",
1446			(unsigned long long)DMA_TLB_IIRG(type),
1447			(unsigned long long)DMA_TLB_IAIG(val));
1448}
1449
1450static struct device_domain_info *
1451iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1452			 u8 bus, u8 devfn)
1453{
1454	struct device_domain_info *info;
1455
1456	assert_spin_locked(&device_domain_lock);
1457
1458	if (!iommu->qi)
1459		return NULL;
1460
1461	list_for_each_entry(info, &domain->devices, link)
1462		if (info->iommu == iommu && info->bus == bus &&
1463		    info->devfn == devfn) {
1464			if (info->ats_supported && info->dev)
1465				return info;
1466			break;
1467		}
1468
1469	return NULL;
1470}
1471
1472static void domain_update_iotlb(struct dmar_domain *domain)
1473{
1474	struct device_domain_info *info;
1475	bool has_iotlb_device = false;
1476
1477	assert_spin_locked(&device_domain_lock);
1478
1479	list_for_each_entry(info, &domain->devices, link) {
1480		struct pci_dev *pdev;
1481
1482		if (!info->dev || !dev_is_pci(info->dev))
1483			continue;
1484
1485		pdev = to_pci_dev(info->dev);
1486		if (pdev->ats_enabled) {
1487			has_iotlb_device = true;
1488			break;
1489		}
1490	}
1491
1492	domain->has_iotlb_device = has_iotlb_device;
1493}
1494
1495static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1496{
1497	struct pci_dev *pdev;
1498
1499	assert_spin_locked(&device_domain_lock);
1500
1501	if (!info || !dev_is_pci(info->dev))
1502		return;
1503
1504	pdev = to_pci_dev(info->dev);
1505
1506#ifdef CONFIG_INTEL_IOMMU_SVM
1507	/* The PCIe spec, in its wisdom, declares that the behaviour of
1508	   the device if you enable PASID support after ATS support is
1509	   undefined. So always enable PASID support on devices which
1510	   have it, even if we can't yet know if we're ever going to
1511	   use it. */
1512	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1513		info->pasid_enabled = 1;
1514
1515	if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1516		info->pri_enabled = 1;
1517#endif
1518	if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1519		info->ats_enabled = 1;
1520		domain_update_iotlb(info->domain);
1521		info->ats_qdep = pci_ats_queue_depth(pdev);
1522	}
1523}
1524
1525static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1526{
1527	struct pci_dev *pdev;
1528
1529	assert_spin_locked(&device_domain_lock);
1530
1531	if (!dev_is_pci(info->dev))
1532		return;
1533
1534	pdev = to_pci_dev(info->dev);
1535
1536	if (info->ats_enabled) {
1537		pci_disable_ats(pdev);
1538		info->ats_enabled = 0;
1539		domain_update_iotlb(info->domain);
1540	}
1541#ifdef CONFIG_INTEL_IOMMU_SVM
1542	if (info->pri_enabled) {
1543		pci_disable_pri(pdev);
1544		info->pri_enabled = 0;
1545	}
1546	if (info->pasid_enabled) {
1547		pci_disable_pasid(pdev);
1548		info->pasid_enabled = 0;
1549	}
1550#endif
1551}
1552
1553static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1554				  u64 addr, unsigned mask)
1555{
1556	u16 sid, qdep;
1557	unsigned long flags;
1558	struct device_domain_info *info;
1559
1560	if (!domain->has_iotlb_device)
1561		return;
1562
1563	spin_lock_irqsave(&device_domain_lock, flags);
1564	list_for_each_entry(info, &domain->devices, link) {
1565		if (!info->ats_enabled)
1566			continue;
1567
1568		sid = info->bus << 8 | info->devfn;
1569		qdep = info->ats_qdep;
1570		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1571	}
1572	spin_unlock_irqrestore(&device_domain_lock, flags);
1573}
1574
1575static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1576				  struct dmar_domain *domain,
1577				  unsigned long pfn, unsigned int pages,
1578				  int ih, int map)
1579{
1580	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1581	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1582	u16 did = domain->iommu_did[iommu->seq_id];
1583
1584	BUG_ON(pages == 0);
1585
1586	if (ih)
1587		ih = 1 << 6;
1588	/*
1589	 * Fall back to a domain-selective flush if there is no PSI support or
1590	 * the size is too big.
1591	 * PSI requires the page size to be 2 ^ x, and the base address to be
1592	 * naturally aligned to the size
1593	 */
1594	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1595		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1596						DMA_TLB_DSI_FLUSH);
1597	else
1598		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1599						DMA_TLB_PSI_FLUSH);
1600
1601	/*
1602	 * In caching mode, changes of pages from non-present to present require
1603	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1604	 */
1605	if (!cap_caching_mode(iommu->cap) || !map)
1606		iommu_flush_dev_iotlb(domain, addr, mask);
1607}
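
/*
 * Editor's illustrative note (not part of the original file): the mask is
 * the order of the invalidation; e.g. pages = 10 gives
 * ilog2(__roundup_pow_of_two(10)) = ilog2(16) = 4, so a naturally aligned
 * 16-page (64KiB) region covering the requested range is invalidated.
 */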
1608
1609static void iommu_flush_iova(struct iova_domain *iovad)
1610{
1611	struct dmar_domain *domain;
1612	int idx;
1613
1614	domain = container_of(iovad, struct dmar_domain, iovad);
1615
1616	for_each_domain_iommu(idx, domain) {
1617		struct intel_iommu *iommu = g_iommus[idx];
1618		u16 did = domain->iommu_did[iommu->seq_id];
1619
1620		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1621
1622		if (!cap_caching_mode(iommu->cap))
1623			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1624					      0, MAX_AGAW_PFN_WIDTH);
1625	}
1626}
1627
1628static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1629{
1630	u32 pmen;
1631	unsigned long flags;
1632
1633	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1635	pmen &= ~DMA_PMEN_EPM;
1636	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1637
1638	/* wait for the protected region status bit to clear */
1639	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1640		readl, !(pmen & DMA_PMEN_PRS), pmen);
1641
1642	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1643}
1644
1645static void iommu_enable_translation(struct intel_iommu *iommu)
1646{
1647	u32 sts;
1648	unsigned long flags;
1649
1650	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1651	iommu->gcmd |= DMA_GCMD_TE;
1652	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1653
1654	/* Make sure hardware completes it */
1655	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1656		      readl, (sts & DMA_GSTS_TES), sts);
1657
1658	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1659}
1660
1661static void iommu_disable_translation(struct intel_iommu *iommu)
1662{
1663	u32 sts;
1664	unsigned long flag;
1665
1666	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1667	iommu->gcmd &= ~DMA_GCMD_TE;
1668	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1669
1670	/* Make sure hardware completes it */
1671	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1672		      readl, (!(sts & DMA_GSTS_TES)), sts);
1673
1674	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1675}
1676
1677
1678static int iommu_init_domains(struct intel_iommu *iommu)
1679{
1680	u32 ndomains, nlongs;
1681	size_t size;
1682
1683	ndomains = cap_ndoms(iommu->cap);
1684	pr_debug("%s: Number of Domains supported <%d>\n",
1685		 iommu->name, ndomains);
1686	nlongs = BITS_TO_LONGS(ndomains);
1687
1688	spin_lock_init(&iommu->lock);
1689
1690	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1691	if (!iommu->domain_ids) {
1692		pr_err("%s: Allocating domain id array failed\n",
1693		       iommu->name);
1694		return -ENOMEM;
1695	}
1696
1697	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1698	iommu->domains = kzalloc(size, GFP_KERNEL);
1699
1700	if (iommu->domains) {
1701		size = 256 * sizeof(struct dmar_domain *);
1702		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1703	}
1704
1705	if (!iommu->domains || !iommu->domains[0]) {
1706		pr_err("%s: Allocating domain array failed\n",
1707		       iommu->name);
1708		kfree(iommu->domain_ids);
1709		kfree(iommu->domains);
1710		iommu->domain_ids = NULL;
1711		iommu->domains    = NULL;
1712		return -ENOMEM;
1713	}
1714
1715
1716
1717	/*
1718	 * If Caching mode is set, then invalid translations are tagged
1719	 * with domain-id 0, hence we need to pre-allocate it. We also
1720	 * use domain-id 0 as a marker for non-allocated domain-id, so
1721	 * make sure it is not used for a real domain.
1722	 */
1723	set_bit(0, iommu->domain_ids);
1724
1725	return 0;
1726}
1727
1728static void disable_dmar_iommu(struct intel_iommu *iommu)
1729{
1730	struct device_domain_info *info, *tmp;
1731	unsigned long flags;
1732
1733	if (!iommu->domains || !iommu->domain_ids)
1734		return;
1735
1736again:
1737	spin_lock_irqsave(&device_domain_lock, flags);
1738	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1739		struct dmar_domain *domain;
1740
1741		if (info->iommu != iommu)
1742			continue;
1743
1744		if (!info->dev || !info->domain)
1745			continue;
1746
1747		domain = info->domain;
1748
1749		__dmar_remove_one_dev_info(info);
1750
1751		if (!domain_type_is_vm_or_si(domain)) {
1752			/*
1753			 * The domain_exit() function can't be called under
1754			 * device_domain_lock, as it takes this lock itself.
1755			 * So release the lock here and re-run the loop
1756			 * afterwards.
1757			 */
1758			spin_unlock_irqrestore(&device_domain_lock, flags);
1759			domain_exit(domain);
1760			goto again;
1761		}
1762	}
1763	spin_unlock_irqrestore(&device_domain_lock, flags);
1764
1765	if (iommu->gcmd & DMA_GCMD_TE)
1766		iommu_disable_translation(iommu);
1767}
1768
1769static void free_dmar_iommu(struct intel_iommu *iommu)
1770{
1771	if ((iommu->domains) && (iommu->domain_ids)) {
1772		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1773		int i;
1774
1775		for (i = 0; i < elems; i++)
1776			kfree(iommu->domains[i]);
1777		kfree(iommu->domains);
1778		kfree(iommu->domain_ids);
1779		iommu->domains = NULL;
1780		iommu->domain_ids = NULL;
1781	}
1782
1783	g_iommus[iommu->seq_id] = NULL;
1784
1785	/* free context mapping */
1786	free_context_table(iommu);
1787
1788#ifdef CONFIG_INTEL_IOMMU_SVM
1789	if (pasid_enabled(iommu)) {
1790		if (ecap_prs(iommu->ecap))
1791			intel_svm_finish_prq(iommu);
1792		intel_svm_free_pasid_tables(iommu);
1793	}
1794#endif
1795}
1796
1797static struct dmar_domain *alloc_domain(int flags)
1798{
1799	struct dmar_domain *domain;
1800
1801	domain = alloc_domain_mem();
1802	if (!domain)
1803		return NULL;
1804
1805	memset(domain, 0, sizeof(*domain));
1806	domain->nid = -1;
1807	domain->flags = flags;
1808	domain->has_iotlb_device = false;
1809	INIT_LIST_HEAD(&domain->devices);
1810
1811	return domain;
1812}
1813
1814/* Must be called with iommu->lock */
1815static int domain_attach_iommu(struct dmar_domain *domain,
1816			       struct intel_iommu *iommu)
1817{
1818	unsigned long ndomains;
1819	int num;
1820
1821	assert_spin_locked(&device_domain_lock);
1822	assert_spin_locked(&iommu->lock);
1823
1824	domain->iommu_refcnt[iommu->seq_id] += 1;
1825	domain->iommu_count += 1;
1826	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1827		ndomains = cap_ndoms(iommu->cap);
1828		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1829
1830		if (num >= ndomains) {
1831			pr_err("%s: No free domain ids\n", iommu->name);
1832			domain->iommu_refcnt[iommu->seq_id] -= 1;
1833			domain->iommu_count -= 1;
1834			return -ENOSPC;
1835		}
1836
1837		set_bit(num, iommu->domain_ids);
1838		set_iommu_domain(iommu, num, domain);
1839
1840		domain->iommu_did[iommu->seq_id] = num;
1841		domain->nid			 = iommu->node;
1842
1843		domain_update_iommu_cap(domain);
1844	}
1845
1846	return 0;
1847}
1848
1849static int domain_detach_iommu(struct dmar_domain *domain,
1850			       struct intel_iommu *iommu)
1851{
1852	int num, count = INT_MAX;
1853
1854	assert_spin_locked(&device_domain_lock);
1855	assert_spin_locked(&iommu->lock);
1856
1857	domain->iommu_refcnt[iommu->seq_id] -= 1;
1858	count = --domain->iommu_count;
1859	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1860		num = domain->iommu_did[iommu->seq_id];
1861		clear_bit(num, iommu->domain_ids);
1862		set_iommu_domain(iommu, num, NULL);
1863
1864		domain_update_iommu_cap(domain);
1865		domain->iommu_did[iommu->seq_id] = 0;
1866	}
1867
1868	return count;
1869}
1870
1871static struct iova_domain reserved_iova_list;
1872static struct lock_class_key reserved_rbtree_key;
1873
1874static int dmar_init_reserved_ranges(void)
1875{
1876	struct pci_dev *pdev = NULL;
1877	struct iova *iova;
1878	int i;
1879
1880	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1881
1882	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1883		&reserved_rbtree_key);
1884
1885	/* IOAPIC ranges shouldn't be accessed by DMA */
1886	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1887		IOVA_PFN(IOAPIC_RANGE_END));
1888	if (!iova) {
1889		pr_err("Reserve IOAPIC range failed\n");
1890		return -ENODEV;
1891	}
1892
1893	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1894	for_each_pci_dev(pdev) {
1895		struct resource *r;
1896
1897		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1898			r = &pdev->resource[i];
1899			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1900				continue;
1901			iova = reserve_iova(&reserved_iova_list,
1902					    IOVA_PFN(r->start),
1903					    IOVA_PFN(r->end));
1904			if (!iova) {
1905				pr_err("Reserve iova failed\n");
1906				return -ENODEV;
1907			}
1908		}
1909	}
1910	return 0;
1911}
1912
1913static void domain_reserve_special_ranges(struct dmar_domain *domain)
1914{
1915	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1916}
1917
1918static inline int guestwidth_to_adjustwidth(int gaw)
1919{
1920	int agaw;
1921	int r = (gaw - 12) % 9;
1922
1923	if (r == 0)
1924		agaw = gaw;
1925	else
1926		agaw = gaw + 9 - r;
1927	if (agaw > 64)
1928		agaw = 64;
1929	return agaw;
1930}
1931
1932static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1933		       int guest_width)
1934{
1935	int adjust_width, agaw;
1936	unsigned long sagaw;
1937	int err;
1938
1939	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1940
1941	err = init_iova_flush_queue(&domain->iovad,
1942				    iommu_flush_iova, iova_entry_free);
1943	if (err)
1944		return err;
1945
1946	domain_reserve_special_ranges(domain);
1947
1948	/* calculate AGAW */
1949	if (guest_width > cap_mgaw(iommu->cap))
1950		guest_width = cap_mgaw(iommu->cap);
1951	domain->gaw = guest_width;
1952	adjust_width = guestwidth_to_adjustwidth(guest_width);
1953	agaw = width_to_agaw(adjust_width);
1954	sagaw = cap_sagaw(iommu->cap);
1955	if (!test_bit(agaw, &sagaw)) {
1956		/* hardware doesn't support it, choose a bigger one */
1957		pr_debug("Hardware doesn't support agaw %d\n", agaw);
1958		agaw = find_next_bit(&sagaw, 5, agaw);
1959		if (agaw >= 5)
1960			return -ENODEV;
1961	}
1962	domain->agaw = agaw;
1963
1964	if (ecap_coherent(iommu->ecap))
1965		domain->iommu_coherency = 1;
1966	else
1967		domain->iommu_coherency = 0;
1968
1969	if (ecap_sc_support(iommu->ecap))
1970		domain->iommu_snooping = 1;
1971	else
1972		domain->iommu_snooping = 0;
1973
1974	if (intel_iommu_superpage)
1975		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1976	else
1977		domain->iommu_superpage = 0;
1978
1979	domain->nid = iommu->node;
1980
1981	/* always allocate the top pgd */
1982	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1983	if (!domain->pgd)
1984		return -ENOMEM;
1985	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1986	return 0;
1987}
1988
1989static void domain_exit(struct dmar_domain *domain)
1990{
1991	struct page *freelist = NULL;
1992
1993	/* Domain 0 is reserved, so don't process it */
1994	if (!domain)
1995		return;
1996
1997	/* Remove associated devices and clear attached or cached domains */
1998	rcu_read_lock();
1999	domain_remove_dev_info(domain);
2000	rcu_read_unlock();
2001
2002	/* destroy iovas */
2003	put_iova_domain(&domain->iovad);
2004
2005	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2006
2007	dma_free_pagelist(freelist);
2008
2009	free_domain_mem(domain);
2010}
2011
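/*
 * Program the context entry for (bus, devfn) on @iommu to point at
 * @domain's page tables (or pass-through for the static identity domain),
 * then flush the context and IOTLB caches as required.
 */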
2012static int domain_context_mapping_one(struct dmar_domain *domain,
2013				      struct intel_iommu *iommu,
2014				      u8 bus, u8 devfn)
2015{
2016	u16 did = domain->iommu_did[iommu->seq_id];
2017	int translation = CONTEXT_TT_MULTI_LEVEL;
2018	struct device_domain_info *info = NULL;
2019	struct context_entry *context;
2020	unsigned long flags;
2021	struct dma_pte *pgd;
2022	int ret, agaw;
2023
2024	WARN_ON(did == 0);
2025
2026	if (hw_pass_through && domain_type_is_si(domain))
2027		translation = CONTEXT_TT_PASS_THROUGH;
2028
2029	pr_debug("Set context mapping for %02x:%02x.%d\n",
2030		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2031
2032	BUG_ON(!domain->pgd);
2033
2034	spin_lock_irqsave(&device_domain_lock, flags);
2035	spin_lock(&iommu->lock);
2036
2037	ret = -ENOMEM;
2038	context = iommu_context_addr(iommu, bus, devfn, 1);
2039	if (!context)
2040		goto out_unlock;
2041
2042	ret = 0;
2043	if (context_present(context))
2044		goto out_unlock;
2045
2046	/*
2047	 * For kdump cases, old valid entries may be cached due to
2048	 * in-flight DMA and the copied pgtable, but there is no unmapping
2049	 * behaviour for them, thus we need an explicit cache flush for
2050	 * the newly-mapped device. For kdump, at this point, the device
2051	 * is supposed to have finished reset at its driver probe stage,
2052	 * so no in-flight DMA will exist, and we don't need to worry
2053	 * about it hereafter.
2054	 */
2055	if (context_copied(context)) {
2056		u16 did_old = context_domain_id(context);
2057
2058		if (did_old < cap_ndoms(iommu->cap)) {
2059			iommu->flush.flush_context(iommu, did_old,
2060						   (((u16)bus) << 8) | devfn,
2061						   DMA_CCMD_MASK_NOBIT,
2062						   DMA_CCMD_DEVICE_INVL);
2063			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2064						 DMA_TLB_DSI_FLUSH);
2065		}
2066	}
2067
2068	pgd = domain->pgd;
2069
2070	context_clear_entry(context);
2071	context_set_domain_id(context, did);
2072
2073	/*
2074	 * Skip top levels of page tables for an iommu which has less agaw
2075	 * than the default.  Unnecessary for PT mode.
2076	 */
2077	if (translation != CONTEXT_TT_PASS_THROUGH) {
2078		for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2079			ret = -ENOMEM;
2080			pgd = phys_to_virt(dma_pte_addr(pgd));
2081			if (!dma_pte_present(pgd))
2082				goto out_unlock;
2083		}
2084
2085		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2086		if (info && info->ats_supported)
2087			translation = CONTEXT_TT_DEV_IOTLB;
2088		else
2089			translation = CONTEXT_TT_MULTI_LEVEL;
2090
2091		context_set_address_root(context, virt_to_phys(pgd));
2092		context_set_address_width(context, iommu->agaw);
2093	} else {
2094		/*
2095		 * In pass through mode, AW must be programmed to
2096		 * indicate the largest AGAW value supported by
2097		 * hardware. And ASR is ignored by hardware.
2098		 */
2099		context_set_address_width(context, iommu->msagaw);
2100	}
2101
2102	context_set_translation_type(context, translation);
2103	context_set_fault_enable(context);
2104	context_set_present(context);
2105	domain_flush_cache(domain, context, sizeof(*context));
2106
2107	/*
2108	 * It's a non-present to present mapping. If hardware doesn't cache
2109	 * non-present entries we only need to flush the write-buffer. If it
2110	 * _does_ cache non-present entries, then it does so in the special
2111	 * domain #0, which we have to flush:
2112	 */
2113	if (cap_caching_mode(iommu->cap)) {
2114		iommu->flush.flush_context(iommu, 0,
2115					   (((u16)bus) << 8) | devfn,
2116					   DMA_CCMD_MASK_NOBIT,
2117					   DMA_CCMD_DEVICE_INVL);
2118		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2119	} else {
2120		iommu_flush_write_buffer(iommu);
2121	}
2122	iommu_enable_dev_iotlb(info);
2123
2124	ret = 0;
2125
2126out_unlock:
2127	spin_unlock(&iommu->lock);
2128	spin_unlock_irqrestore(&device_domain_lock, flags);
2129
2130	return ret;
2131}
2132
2133struct domain_context_mapping_data {
2134	struct dmar_domain *domain;
2135	struct intel_iommu *iommu;
2136};
2137
2138static int domain_context_mapping_cb(struct pci_dev *pdev,
2139				     u16 alias, void *opaque)
2140{
2141	struct domain_context_mapping_data *data = opaque;
2142
2143	return domain_context_mapping_one(data->domain, data->iommu,
2144					  PCI_BUS_NUM(alias), alias & 0xff);
2145}
2146
2147static int
2148domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2149{
2150	struct intel_iommu *iommu;
2151	u8 bus, devfn;
2152	struct domain_context_mapping_data data;
2153
2154	iommu = device_to_iommu(dev, &bus, &devfn);
2155	if (!iommu)
2156		return -ENODEV;
2157
2158	if (!dev_is_pci(dev))
2159		return domain_context_mapping_one(domain, iommu, bus, devfn);
2160
2161	data.domain = domain;
2162	data.iommu = iommu;
2163
2164	return pci_for_each_dma_alias(to_pci_dev(dev),
2165				      &domain_context_mapping_cb, &data);
2166}
2167
2168static int domain_context_mapped_cb(struct pci_dev *pdev,
2169				    u16 alias, void *opaque)
2170{
2171	struct intel_iommu *iommu = opaque;
2172
2173	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2174}
2175
2176static int domain_context_mapped(struct device *dev)
2177{
2178	struct intel_iommu *iommu;
2179	u8 bus, devfn;
2180
2181	iommu = device_to_iommu(dev, &bus, &devfn);
2182	if (!iommu)
2183		return -ENODEV;
2184
2185	if (!dev_is_pci(dev))
2186		return device_context_mapped(iommu, bus, devfn);
2187
2188	return !pci_for_each_dma_alias(to_pci_dev(dev),
2189				       domain_context_mapped_cb, iommu);
2190}
2191
2192/* Returns a number of VTD pages, but aligned to MM page size */
2193static inline unsigned long aligned_nrpages(unsigned long host_addr,
2194					    size_t size)
2195{
2196	host_addr &= ~PAGE_MASK;
2197	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2198}
2199
2200/* Return largest possible superpage level for a given mapping */
2201static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202					  unsigned long iov_pfn,
2203					  unsigned long phy_pfn,
2204					  unsigned long pages)
2205{
2206	int support, level = 1;
2207	unsigned long pfnmerge;
2208
2209	support = domain->iommu_superpage;
2210
2211	/* To use a large page, the virtual *and* physical addresses
2212	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213	   of them will mean we have to use smaller pages. So just
2214	   merge them and check both at once. */
2215	pfnmerge = iov_pfn | phy_pfn;
2216
2217	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218		pages >>= VTD_STRIDE_SHIFT;
2219		if (!pages)
2220			break;
2221		pfnmerge >>= VTD_STRIDE_SHIFT;
2222		level++;
2223		support--;
2224	}
2225	return level;
2226}
2227
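/*
 * Map @nr_pages VT-d pages starting at @iov_pfn, taking the physical
 * addresses either from @sg or from the contiguous range starting at
 * @phys_pfn. Superpages are used whenever size and alignment allow.
 */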
2228static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2229			    struct scatterlist *sg, unsigned long phys_pfn,
2230			    unsigned long nr_pages, int prot)
2231{
2232	struct dma_pte *first_pte = NULL, *pte = NULL;
2233	phys_addr_t uninitialized_var(pteval);
2234	unsigned long sg_res = 0;
2235	unsigned int largepage_lvl = 0;
2236	unsigned long lvl_pages = 0;
2237
2238	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2239
2240	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2241		return -EINVAL;
2242
2243	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2244
2245	if (!sg) {
2246		sg_res = nr_pages;
2247		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2248	}
2249
2250	while (nr_pages > 0) {
2251		uint64_t tmp;
2252
2253		if (!sg_res) {
2254			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2255
2256			sg_res = aligned_nrpages(sg->offset, sg->length);
2257			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2258			sg->dma_length = sg->length;
2259			pteval = (sg_phys(sg) - pgoff) | prot;
2260			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2261		}
2262
2263		if (!pte) {
2264			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2265
2266			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2267			if (!pte)
2268				return -ENOMEM;
2269			/* It is a large page */
2270			if (largepage_lvl > 1) {
2271				unsigned long nr_superpages, end_pfn;
2272
2273				pteval |= DMA_PTE_LARGE_PAGE;
2274				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2275
2276				nr_superpages = sg_res / lvl_pages;
2277				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2278
2279				/*
2280				 * Ensure that old small page tables are
2281				 * removed to make room for superpage(s).
2282				 * We're adding new large pages, so make sure
2283				 * we don't remove their parent tables.
2284				 */
2285				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2286						       largepage_lvl + 1);
2287			} else {
2288				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289			}
2290
2291		}
2292		/* We don't need a lock here; nobody else
2293		 * touches the iova range
2294		 */
2295		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2296		if (tmp) {
2297			static int dumps = 5;
2298			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2299				iov_pfn, tmp, (unsigned long long)pteval);
2300			if (dumps) {
2301				dumps--;
2302				debug_dma_dump_mappings(NULL);
2303			}
2304			WARN_ON(1);
2305		}
2306
2307		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2308
2309		BUG_ON(nr_pages < lvl_pages);
2310		BUG_ON(sg_res < lvl_pages);
2311
2312		nr_pages -= lvl_pages;
2313		iov_pfn += lvl_pages;
2314		phys_pfn += lvl_pages;
2315		pteval += lvl_pages * VTD_PAGE_SIZE;
2316		sg_res -= lvl_pages;
2317
2318		/* If the next PTE would be the first in a new page, then we
2319		   need to flush the cache on the entries we've just written.
2320		   And then we'll need to recalculate 'pte', so clear it and
2321		   let it get set again in the if (!pte) block above.
2322
2323		   If we're done (!nr_pages) we need to flush the cache too.
2324
2325		   Also if we've been setting superpages, we may need to
2326		   recalculate 'pte' and switch back to smaller pages for the
2327		   end of the mapping, if the trailing size is not enough to
2328		   use another superpage (i.e. sg_res < lvl_pages). */
2329		pte++;
2330		if (!nr_pages || first_pte_in_page(pte) ||
2331		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2332			domain_flush_cache(domain, first_pte,
2333					   (void *)pte - (void *)first_pte);
2334			pte = NULL;
2335		}
2336
2337		if (!sg_res && nr_pages)
2338			sg = sg_next(sg);
2339	}
2340	return 0;
2341}
2342
2343static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2344				    struct scatterlist *sg, unsigned long nr_pages,
2345				    int prot)
2346{
2347	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2348}
2349
2350static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2351				     unsigned long phys_pfn, unsigned long nr_pages,
2352				     int prot)
2353{
2354	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2355}
2356
2357static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2358{
2359	unsigned long flags;
2360	struct context_entry *context;
2361	u16 did_old;
2362
2363	if (!iommu)
2364		return;
2365
2366	spin_lock_irqsave(&iommu->lock, flags);
2367	context = iommu_context_addr(iommu, bus, devfn, 0);
2368	if (!context) {
2369		spin_unlock_irqrestore(&iommu->lock, flags);
2370		return;
2371	}
2372	did_old = context_domain_id(context);
2373	context_clear_entry(context);
2374	__iommu_flush_cache(iommu, context, sizeof(*context));
2375	spin_unlock_irqrestore(&iommu->lock, flags);
2376	iommu->flush.flush_context(iommu,
2377				   did_old,
2378				   (((u16)bus) << 8) | devfn,
2379				   DMA_CCMD_MASK_NOBIT,
2380				   DMA_CCMD_DEVICE_INVL);
2381	iommu->flush.flush_iotlb(iommu,
2382				 did_old,
2383				 0,
2384				 0,
2385				 DMA_TLB_DSI_FLUSH);
2386}
2387
2388static inline void unlink_domain_info(struct device_domain_info *info)
2389{
2390	assert_spin_locked(&device_domain_lock);
2391	list_del(&info->link);
2392	list_del(&info->global);
2393	if (info->dev)
2394		info->dev->archdata.iommu = NULL;
2395}
2396
2397static void domain_remove_dev_info(struct dmar_domain *domain)
2398{
2399	struct device_domain_info *info, *tmp;
2400	unsigned long flags;
2401
2402	spin_lock_irqsave(&device_domain_lock, flags);
2403	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2404		__dmar_remove_one_dev_info(info);
2405	spin_unlock_irqrestore(&device_domain_lock, flags);
2406}
2407
2408/*
2409 * find_domain
2410 * Note: we use struct device->archdata.iommu to store the info
2411 */
2412static struct dmar_domain *find_domain(struct device *dev)
2413{
2414	struct device_domain_info *info;
2415
2416	/* No lock here, assumes no domain exit in normal case */
2417	info = dev->archdata.iommu;
2418	if (likely(info))
2419		return info->domain;
2420	return NULL;
2421}
2422
2423static inline struct device_domain_info *
2424dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2425{
2426	struct device_domain_info *info;
2427
2428	list_for_each_entry(info, &device_domain_list, global)
2429		if (info->iommu->segment == segment && info->bus == bus &&
2430		    info->devfn == devfn)
2431			return info;
2432
2433	return NULL;
2434}
2435
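/*
 * Bind a device (or one of its DMA aliases) to @domain: allocate the
 * device_domain_info, probe ATS/PASID/PRI support, attach the domain to
 * @iommu and set up the context mapping. Returns the domain actually in
 * use, which may be one already found for the device or its alias.
 */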
2436static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2437						    int bus, int devfn,
2438						    struct device *dev,
2439						    struct dmar_domain *domain)
2440{
2441	struct dmar_domain *found = NULL;
2442	struct device_domain_info *info;
2443	unsigned long flags;
2444	int ret;
2445
2446	info = alloc_devinfo_mem();
2447	if (!info)
2448		return NULL;
2449
2450	info->bus = bus;
2451	info->devfn = devfn;
2452	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2453	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2454	info->ats_qdep = 0;
2455	info->dev = dev;
2456	info->domain = domain;
2457	info->iommu = iommu;
2458
2459	if (dev && dev_is_pci(dev)) {
2460		struct pci_dev *pdev = to_pci_dev(info->dev);
2461
2462		if (ecap_dev_iotlb_support(iommu->ecap) &&
2463		    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2464		    dmar_find_matched_atsr_unit(pdev))
2465			info->ats_supported = 1;
2466
2467		if (ecs_enabled(iommu)) {
2468			if (pasid_enabled(iommu)) {
2469				int features = pci_pasid_features(pdev);
2470				if (features >= 0)
2471					info->pasid_supported = features | 1;
2472			}
2473
2474			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2475			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2476				info->pri_supported = 1;
2477		}
2478	}
2479
2480	spin_lock_irqsave(&device_domain_lock, flags);
2481	if (dev)
2482		found = find_domain(dev);
2483
2484	if (!found) {
2485		struct device_domain_info *info2;
2486		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2487		if (info2) {
2488			found      = info2->domain;
2489			info2->dev = dev;
2490		}
2491	}
2492
2493	if (found) {
2494		spin_unlock_irqrestore(&device_domain_lock, flags);
2495		free_devinfo_mem(info);
2496		/* Caller must free the original domain */
2497		return found;
2498	}
2499
2500	spin_lock(&iommu->lock);
2501	ret = domain_attach_iommu(domain, iommu);
2502	spin_unlock(&iommu->lock);
2503
2504	if (ret) {
2505		spin_unlock_irqrestore(&device_domain_lock, flags);
2506		free_devinfo_mem(info);
2507		return NULL;
2508	}
2509
2510	list_add(&info->link, &domain->devices);
2511	list_add(&info->global, &device_domain_list);
2512	if (dev)
2513		dev->archdata.iommu = info;
2514	spin_unlock_irqrestore(&device_domain_lock, flags);
2515
2516	if (dev && domain_context_mapping(domain, dev)) {
2517		pr_err("Domain context map for %s failed\n", dev_name(dev));
2518		dmar_remove_one_dev_info(domain, dev);
2519		return NULL;
2520	}
2521
2522	return domain;
2523}
2524
2525static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2526{
2527	*(u16 *)opaque = alias;
2528	return 0;
2529}
2530
2531static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2532{
2533	struct device_domain_info *info = NULL;
2534	struct dmar_domain *domain = NULL;
2535	struct intel_iommu *iommu;
2536	u16 req_id, dma_alias;
2537	unsigned long flags;
2538	u8 bus, devfn;
2539
2540	iommu = device_to_iommu(dev, &bus, &devfn);
2541	if (!iommu)
2542		return NULL;
2543
2544	req_id = ((u16)bus << 8) | devfn;
2545
2546	if (dev_is_pci(dev)) {
2547		struct pci_dev *pdev = to_pci_dev(dev);
2548
2549		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2550
2551		spin_lock_irqsave(&device_domain_lock, flags);
2552		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2553						      PCI_BUS_NUM(dma_alias),
2554						      dma_alias & 0xff);
2555		if (info) {
2556			iommu = info->iommu;
2557			domain = info->domain;
2558		}
2559		spin_unlock_irqrestore(&device_domain_lock, flags);
2560
2561		/* DMA alias already has a domain, use it */
2562		if (info)
2563			goto out;
2564	}
2565
2566	/* Allocate and initialize new domain for the device */
2567	domain = alloc_domain(0);
2568	if (!domain)
2569		return NULL;
2570	if (domain_init(domain, iommu, gaw)) {
2571		domain_exit(domain);
2572		return NULL;
2573	}
2574
2575out:
2576
2577	return domain;
2578}
2579
2580static struct dmar_domain *set_domain_for_dev(struct device *dev,
2581					      struct dmar_domain *domain)
2582{
2583	struct intel_iommu *iommu;
2584	struct dmar_domain *tmp;
2585	u16 req_id, dma_alias;
2586	u8 bus, devfn;
2587
2588	iommu = device_to_iommu(dev, &bus, &devfn);
2589	if (!iommu)
2590		return NULL;
2591
2592	req_id = ((u16)bus << 8) | devfn;
2593
2594	if (dev_is_pci(dev)) {
2595		struct pci_dev *pdev = to_pci_dev(dev);
2596
2597		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2598
2599		/* register PCI DMA alias device */
2600		if (req_id != dma_alias) {
2601			tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2602					dma_alias & 0xff, NULL, domain);
2603
2604			if (!tmp || tmp != domain)
2605				return tmp;
2606		}
2607	}
2608
2609	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2610	if (!tmp || tmp != domain)
2611		return tmp;
2612
2613	return domain;
2614}
2615
2616static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2617{
2618	struct dmar_domain *domain, *tmp;
2619
2620	domain = find_domain(dev);
2621	if (domain)
2622		goto out;
2623
2624	domain = find_or_alloc_domain(dev, gaw);
2625	if (!domain)
2626		goto out;
2627
2628	tmp = set_domain_for_dev(dev, domain);
2629	if (!tmp || domain != tmp) {
2630		domain_exit(domain);
2631		domain = tmp;
2632	}
2633
2634out:
2635
2636	return domain;
2637}
2638
2639static int iommu_domain_identity_map(struct dmar_domain *domain,
2640				     unsigned long long start,
2641				     unsigned long long end)
2642{
2643	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2644	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2645
2646	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2647			  dma_to_mm_pfn(last_vpfn))) {
2648		pr_err("Reserving iova failed\n");
2649		return -ENOMEM;
2650	}
2651
2652	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2653	/*
2654	 * The RMRR range might overlap with the physical memory range,
2655	 * so clear it first
2656	 */
2657	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2658
2659	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2660				  last_vpfn - first_vpfn + 1,
2661				  DMA_PTE_READ|DMA_PTE_WRITE);
2662}
2663
2664static int domain_prepare_identity_map(struct device *dev,
2665				       struct dmar_domain *domain,
2666				       unsigned long long start,
2667				       unsigned long long end)
2668{
2669	/* For _hardware_ passthrough, don't bother. But for software
2670	   passthrough, we do it anyway -- it may indicate a memory
2671	   range which is reserved in E820, and so didn't get set
2672	   up to start with in si_domain */
2673	if (domain == si_domain && hw_pass_through) {
2674		pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2675			dev_name(dev), start, end);
2676		return 0;
2677	}
2678
2679	pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2680		dev_name(dev), start, end);
2681
2682	if (end < start) {
2683		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2684			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2685			dmi_get_system_info(DMI_BIOS_VENDOR),
2686			dmi_get_system_info(DMI_BIOS_VERSION),
2687		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2688		return -EIO;
2689	}
2690
2691	if (end >> agaw_to_width(domain->agaw)) {
2692		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2693		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2694		     agaw_to_width(domain->agaw),
2695		     dmi_get_system_info(DMI_BIOS_VENDOR),
2696		     dmi_get_system_info(DMI_BIOS_VERSION),
2697		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2698		return -EIO;
2699	}
2700
2701	return iommu_domain_identity_map(domain, start, end);
2702}
2703
2704static int iommu_prepare_identity_map(struct device *dev,
2705				      unsigned long long start,
2706				      unsigned long long end)
2707{
2708	struct dmar_domain *domain;
2709	int ret;
2710
2711	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2712	if (!domain)
2713		return -ENOMEM;
2714
2715	ret = domain_prepare_identity_map(dev, domain, start, end);
2716	if (ret)
2717		domain_exit(domain);
2718
2719	return ret;
2720}
2721
2722static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2723					 struct device *dev)
2724{
2725	if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2726		return 0;
2727	return iommu_prepare_identity_map(dev, rmrr->base_address,
2728					  rmrr->end_address);
2729}
2730
2731#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2732static inline void iommu_prepare_isa(void)
2733{
2734	struct pci_dev *pdev;
2735	int ret;
2736
2737	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2738	if (!pdev)
2739		return;
2740
2741	pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2742	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2743
2744	if (ret)
2745		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2746
2747	pci_dev_put(pdev);
2748}
2749#else
2750static inline void iommu_prepare_isa(void)
2751{
2752	return;
2753}
2754#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2755
2756static int md_domain_init(struct dmar_domain *domain, int guest_width);
2757
2758static int __init si_domain_init(int hw)
2759{
2760	int nid, ret = 0;
2761
2762	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2763	if (!si_domain)
2764		return -EFAULT;
2765
2766	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2767		domain_exit(si_domain);
2768		return -EFAULT;
2769	}
2770
2771	pr_debug("Identity mapping domain allocated\n");
2772
2773	if (hw)
2774		return 0;
2775
2776	for_each_online_node(nid) {
2777		unsigned long start_pfn, end_pfn;
2778		int i;
2779
2780		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2781			ret = iommu_domain_identity_map(si_domain,
2782					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2783			if (ret)
2784				return ret;
2785		}
2786	}
2787
2788	return 0;
2789}
2790
2791static int identity_mapping(struct device *dev)
2792{
2793	struct device_domain_info *info;
2794
2795	if (likely(!iommu_identity_mapping))
2796		return 0;
2797
2798	info = dev->archdata.iommu;
2799	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2800		return (info->domain == si_domain);
2801
2802	return 0;
2803}
2804
2805static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2806{
2807	struct dmar_domain *ndomain;
2808	struct intel_iommu *iommu;
2809	u8 bus, devfn;
2810
2811	iommu = device_to_iommu(dev, &bus, &devfn);
2812	if (!iommu)
2813		return -ENODEV;
2814
2815	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816	if (ndomain != domain)
2817		return -EBUSY;
2818
2819	return 0;
2820}
2821
2822static bool device_has_rmrr(struct device *dev)
2823{
2824	struct dmar_rmrr_unit *rmrr;
2825	struct device *tmp;
2826	int i;
2827
2828	rcu_read_lock();
2829	for_each_rmrr_units(rmrr) {
2830		/*
2831		 * Return TRUE if this RMRR contains the device that
2832		 * is passed in.
2833		 */
2834		for_each_active_dev_scope(rmrr->devices,
2835					  rmrr->devices_cnt, i, tmp)
2836			if (tmp == dev) {
2837				rcu_read_unlock();
2838				return true;
2839			}
2840	}
2841	rcu_read_unlock();
2842	return false;
2843}
2844
2845/*
2846 * There are a couple of cases where we need to restrict the functionality of
2847 * devices associated with RMRRs.  The first is when evaluating a device for
2848 * identity mapping because problems exist when devices are moved in and out
2849 * of domains and their respective RMRR information is lost.  This means that
2850 * a device with associated RMRRs will never be in a "passthrough" domain.
2851 * The second is use of the device through the IOMMU API.  This interface
2852 * expects to have full control of the IOVA space for the device.  We cannot
2853 * satisfy both the requirement that RMRR access is maintained and have an
2854 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2855 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2856 * We therefore prevent devices associated with an RMRR from participating in
2857 * the IOMMU API, which eliminates them from device assignment.
2858 *
2859 * In both cases we assume that PCI USB devices with RMRRs have them largely
2860 * for historical reasons and that the RMRR space is not actively used post
2861 * boot.  This exclusion may change if vendors begin to abuse it.
2862 *
2863 * The same exception is made for graphics devices, with the requirement that
2864 * any use of the RMRR regions will be torn down before assigning the device
2865 * to a guest.
2866 */
2867static bool device_is_rmrr_locked(struct device *dev)
2868{
2869	if (!device_has_rmrr(dev))
2870		return false;
2871
2872	if (dev_is_pci(dev)) {
2873		struct pci_dev *pdev = to_pci_dev(dev);
2874
2875		if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2876			return false;
2877	}
2878
2879	return true;
2880}
2881
2882static int iommu_should_identity_map(struct device *dev, int startup)
2883{
2884
2885	if (dev_is_pci(dev)) {
2886		struct pci_dev *pdev = to_pci_dev(dev);
2887
2888		if (device_is_rmrr_locked(dev))
2889			return 0;
2890
2891		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2892			return 1;
2893
2894		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2895			return 1;
2896
2897		if (!(iommu_identity_mapping & IDENTMAP_ALL))
2898			return 0;
2899
2900		/*
2901		 * We want to start off with all devices in the 1:1 domain, and
2902		 * take them out later if we find they can't access all of memory.
2903		 *
2904		 * However, we can't do this for PCI devices behind bridges,
2905		 * because all PCI devices behind the same bridge will end up
2906		 * with the same source-id on their transactions.
2907		 *
2908		 * Practically speaking, we can't change things around for these
2909		 * devices at run-time, because we can't be sure there'll be no
2910		 * DMA transactions in flight for any of their siblings.
2911		 *
2912		 * So PCI devices (unless they're on the root bus) as well as
2913		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2914		 * the 1:1 domain, just in _case_ one of their siblings turns out
2915		 * not to be able to map all of memory.
2916		 */
2917		if (!pci_is_pcie(pdev)) {
2918			if (!pci_is_root_bus(pdev->bus))
2919				return 0;
2920			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2921				return 0;
2922		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2923			return 0;
2924	} else {
2925		if (device_has_rmrr(dev))
2926			return 0;
2927	}
2928
2929	/*
2930	 * At boot time, we don't yet know if devices will be 64-bit capable.
2931	 * Assume that they will — if they turn out not to be, then we can
2932	 * take them out of the 1:1 domain later.
2933	 */
2934	if (!startup) {
2935		/*
2936		 * If the device's dma_mask is less than the system's memory
2937		 * size then this is not a candidate for identity mapping.
2938		 */
2939		u64 dma_mask = *dev->dma_mask;
2940
2941		if (dev->coherent_dma_mask &&
2942		    dev->coherent_dma_mask < dma_mask)
2943			dma_mask = dev->coherent_dma_mask;
2944
2945		return dma_mask >= dma_get_required_mask(dev);
2946	}
2947
2948	return 1;
2949}
2950
2951static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2952{
2953	int ret;
2954
2955	if (!iommu_should_identity_map(dev, 1))
2956		return 0;
2957
2958	ret = domain_add_dev_info(si_domain, dev);
2959	if (!ret)
2960		pr_info("%s identity mapping for device %s\n",
2961			hw ? "Hardware" : "Software", dev_name(dev));
2962	else if (ret == -ENODEV)
2963		/* device not associated with an iommu */
2964		ret = 0;
2965
2966	return ret;
2967}
2968
2969
2970static int __init iommu_prepare_static_identity_mapping(int hw)
2971{
2972	struct pci_dev *pdev = NULL;
2973	struct dmar_drhd_unit *drhd;
2974	struct intel_iommu *iommu;
2975	struct device *dev;
2976	int i;
2977	int ret = 0;
2978
2979	for_each_pci_dev(pdev) {
2980		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2981		if (ret)
2982			return ret;
2983	}
2984
2985	for_each_active_iommu(iommu, drhd)
2986		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2987			struct acpi_device_physical_node *pn;
2988			struct acpi_device *adev;
2989
2990			if (dev->bus != &acpi_bus_type)
2991				continue;
2992
2993			adev = to_acpi_device(dev);
2994			mutex_lock(&adev->physical_node_lock);
2995			list_for_each_entry(pn, &adev->physical_node_list, node) {
2996				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2997				if (ret)
2998					break;
2999			}
3000			mutex_unlock(&adev->physical_node_lock);
3001			if (ret)
3002				return ret;
3003		}
3004
3005	return 0;
3006}
3007
3008static void intel_iommu_init_qi(struct intel_iommu *iommu)
3009{
3010	/*
3011	 * Start from a sane iommu hardware state.
3012	 * If queued invalidation was already initialized by us
3013	 * (for example, while enabling interrupt-remapping) then
3014	 * things are already rolling from a sane state.
3015	 */
3016	if (!iommu->qi) {
3017		/*
3018		 * Clear any previous faults.
3019		 */
3020		dmar_fault(-1, iommu);
3021		/*
3022		 * Disable queued invalidation if supported and already enabled
3023		 * before OS handover.
3024		 */
3025		dmar_disable_qi(iommu);
3026	}
3027
3028	if (dmar_enable_qi(iommu)) {
3029		/*
3030		 * Queued invalidation is not enabled; use register-based invalidation
3031		 */
3032		iommu->flush.flush_context = __iommu_flush_context;
3033		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3034		pr_info("%s: Using Register based invalidation\n",
3035			iommu->name);
3036	} else {
3037		iommu->flush.flush_context = qi_flush_context;
3038		iommu->flush.flush_iotlb = qi_flush_iotlb;
3039		pr_info("%s: Using Queued invalidation\n", iommu->name);
3040	}
3041}
3042
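/*
 * Copy one bus worth of context entries from the previous kernel's
 * tables (kdump case) into freshly allocated pages, marking each copied
 * entry and reserving the domain ids it references.
 */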
3043static int copy_context_table(struct intel_iommu *iommu,
3044			      struct root_entry *old_re,
3045			      struct context_entry **tbl,
3046			      int bus, bool ext)
3047{
3048	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3049	struct context_entry *new_ce = NULL, ce;
3050	struct context_entry *old_ce = NULL;
3051	struct root_entry re;
3052	phys_addr_t old_ce_phys;
3053
3054	tbl_idx = ext ? bus * 2 : bus;
3055	memcpy(&re, old_re, sizeof(re));
3056
3057	for (devfn = 0; devfn < 256; devfn++) {
3058		/* First calculate the correct index */
3059		idx = (ext ? devfn * 2 : devfn) % 256;
3060
3061		if (idx == 0) {
3062			/* First save what we may have and clean up */
3063			if (new_ce) {
3064				tbl[tbl_idx] = new_ce;
3065				__iommu_flush_cache(iommu, new_ce,
3066						    VTD_PAGE_SIZE);
3067				pos = 1;
3068			}
3069
3070			if (old_ce)
3071				iounmap(old_ce);
3072
3073			ret = 0;
3074			if (devfn < 0x80)
3075				old_ce_phys = root_entry_lctp(&re);
3076			else
3077				old_ce_phys = root_entry_uctp(&re);
3078
3079			if (!old_ce_phys) {
3080				if (ext && devfn == 0) {
3081					/* No LCTP, try UCTP */
3082					devfn = 0x7f;
3083					continue;
3084				} else {
3085					goto out;
3086				}
3087			}
3088
3089			ret = -ENOMEM;
3090			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3091					MEMREMAP_WB);
3092			if (!old_ce)
3093				goto out;
3094
3095			new_ce = alloc_pgtable_page(iommu->node);
3096			if (!new_ce)
3097				goto out_unmap;
3098
3099			ret = 0;
3100		}
3101
3102		/* Now copy the context entry */
3103		memcpy(&ce, old_ce + idx, sizeof(ce));
3104
3105		if (!__context_present(&ce))
3106			continue;
3107
3108		did = context_domain_id(&ce);
3109		if (did >= 0 && did < cap_ndoms(iommu->cap))
3110			set_bit(did, iommu->domain_ids);
3111
3112		/*
3113		 * We need a marker for copied context entries. This
3114		 * marker needs to work for the old format as well as
3115		 * for extended context entries.
3116		 *
3117		 * Bit 67 of the context entry is used. In the old
3118		 * format this bit is available to software, in the
3119		 * extended format it is the PGE bit, but PGE is ignored
3120		 * by HW if PASIDs are disabled (and thus still
3121		 * available).
3122		 *
3123		 * So disable PASIDs first and then mark the entry
3124		 * copied. This means that we don't copy PASID
3125		 * translations from the old kernel, but this is fine as
3126		 * faults there are not fatal.
3127		 */
3128		context_clear_pasid_enable(&ce);
3129		context_set_copied(&ce);
3130
3131		new_ce[idx] = ce;
3132	}
3133
3134	tbl[tbl_idx + pos] = new_ce;
3135
3136	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3137
3138out_unmap:
3139	memunmap(old_ce);
3140
3141out:
3142	return ret;
3143}
3144
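/*
 * For the kdump case: map the previous kernel's root table, copy its
 * per-bus context tables and install them in this kernel's root entries
 * so that the old DMA mappings remain valid.
 */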
3145static int copy_translation_tables(struct intel_iommu *iommu)
3146{
3147	struct context_entry **ctxt_tbls;
3148	struct root_entry *old_rt;
3149	phys_addr_t old_rt_phys;
3150	int ctxt_table_entries;
3151	unsigned long flags;
3152	u64 rtaddr_reg;
3153	int bus, ret;
3154	bool new_ext, ext;
3155
3156	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3157	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3158	new_ext    = !!ecap_ecs(iommu->ecap);
3159
3160	/*
3161	 * The RTT bit can only be changed when translation is disabled,
3162	 * but disabling translation would open a window for data
3163	 * corruption. So bail out and don't copy anything if we would
3164	 * have to change the bit.
3165	 */
3166	if (new_ext != ext)
3167		return -EINVAL;
3168
3169	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3170	if (!old_rt_phys)
3171		return -EINVAL;
3172
3173	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3174	if (!old_rt)
3175		return -ENOMEM;
3176
3177	/* This is too big for the stack - allocate it from slab */
3178	ctxt_table_entries = ext ? 512 : 256;
3179	ret = -ENOMEM;
3180	ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3181	if (!ctxt_tbls)
3182		goto out_unmap;
3183
3184	for (bus = 0; bus < 256; bus++) {
3185		ret = copy_context_table(iommu, &old_rt[bus],
3186					 ctxt_tbls, bus, ext);
3187		if (ret) {
3188			pr_err("%s: Failed to copy context table for bus %d\n",
3189				iommu->name, bus);
3190			continue;
3191		}
3192	}
3193
3194	spin_lock_irqsave(&iommu->lock, flags);
3195
3196	/* Context tables are copied, now write them to the root_entry table */
3197	for (bus = 0; bus < 256; bus++) {
3198		int idx = ext ? bus * 2 : bus;
3199		u64 val;
3200
3201		if (ctxt_tbls[idx]) {
3202			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3203			iommu->root_entry[bus].lo = val;
3204		}
3205
3206		if (!ext || !ctxt_tbls[idx + 1])
3207			continue;
3208
3209		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3210		iommu->root_entry[bus].hi = val;
3211	}
3212
3213	spin_unlock_irqrestore(&iommu->lock, flags);
3214
3215	kfree(ctxt_tbls);
3216
3217	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3218
3219	ret = 0;
3220
3221out_unmap:
3222	memunmap(old_rt);
3223
3224	return ret;
3225}
3226
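/*
 * Initialize all DMAR units: allocate root entries, copy translation
 * tables from the previous kernel where applicable, set up identity and
 * RMRR mappings, and finally enable translation and fault reporting.
 */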
3227static int __init init_dmars(void)
3228{
3229	struct dmar_drhd_unit *drhd;
3230	struct dmar_rmrr_unit *rmrr;
3231	bool copied_tables = false;
3232	struct device *dev;
3233	struct intel_iommu *iommu;
3234	int i, ret;
3235
3236	/*
3237	 * for each drhd
3238	 *    allocate root
3239	 *    initialize and program root entry to not present
3240	 * endfor
3241	 */
3242	for_each_drhd_unit(drhd) {
3243		/*
3244		 * lock not needed as this is only incremented in the single-
3245		 * threaded kernel __init code path; all other accesses are
3246		 * read-only
3247		 */
3248		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3249			g_num_of_iommus++;
3250			continue;
3251		}
3252		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3253	}
3254
3255	/* Preallocate enough resources for IOMMU hot-addition */
3256	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3257		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3258
3259	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3260			GFP_KERNEL);
3261	if (!g_iommus) {
3262		pr_err("Allocating global iommu array failed\n");
3263		ret = -ENOMEM;
3264		goto error;
3265	}
3266
3267	for_each_active_iommu(iommu, drhd) {
3268		g_iommus[iommu->seq_id] = iommu;
3269
3270		intel_iommu_init_qi(iommu);
3271
3272		ret = iommu_init_domains(iommu);
3273		if (ret)
3274			goto free_iommu;
3275
3276		init_translation_status(iommu);
3277
3278		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3279			iommu_disable_translation(iommu);
3280			clear_translation_pre_enabled(iommu);
3281			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3282				iommu->name);
3283		}
3284
3285		/*
3286		 * TBD:
3287		 * we could share the same root & context tables
3288		 * among all IOMMUs. Need to split it later.
3289		 */
3290		ret = iommu_alloc_root_entry(iommu);
3291		if (ret)
3292			goto free_iommu;
3293
3294		if (translation_pre_enabled(iommu)) {
3295			pr_info("Translation already enabled - trying to copy translation structures\n");
3296
3297			ret = copy_translation_tables(iommu);
3298			if (ret) {
3299				/*
3300				 * We found the IOMMU with translation
3301				 * enabled - but failed to copy over the
3302				 * old root-entry table. Try to proceed
3303				 * by disabling translation now and
3304				 * allocating a clean root-entry table.
3305				 * This might cause DMAR faults, but
3306				 * probably the dump will still succeed.
3307				 */
3308				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3309				       iommu->name);
3310				iommu_disable_translation(iommu);
3311				clear_translation_pre_enabled(iommu);
3312			} else {
3313				pr_info("Copied translation tables from previous kernel for %s\n",
3314					iommu->name);
3315				copied_tables = true;
3316			}
3317		}
3318
3319		if (!ecap_pass_through(iommu->ecap))
3320			hw_pass_through = 0;
3321#ifdef CONFIG_INTEL_IOMMU_SVM
3322		if (pasid_enabled(iommu))
3323			intel_svm_alloc_pasid_tables(iommu);
3324#endif
3325	}
3326
3327	/*
3328	 * Now that qi is enabled on all iommus, set the root entry and flush
3329	 * caches. This is required on some Intel X58 chipsets, otherwise the
3330	 * flush_context function will loop forever and the boot hangs.
3331	 */
3332	for_each_active_iommu(iommu, drhd) {
3333		iommu_flush_write_buffer(iommu);
3334		iommu_set_root_entry(iommu);
3335		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3336		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3337	}
3338
3339	if (iommu_pass_through)
3340		iommu_identity_mapping |= IDENTMAP_ALL;
3341
3342#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3343	iommu_identity_mapping |= IDENTMAP_GFX;
3344#endif
3345
3346	check_tylersburg_isoch();
3347
3348	if (iommu_identity_mapping) {
3349		ret = si_domain_init(hw_pass_through);
3350		if (ret)
3351			goto free_iommu;
3352	}
3353
3354
3355	/*
3356	 * If we copied translations from a previous kernel in the kdump
3357	 * case, we cannot assign the devices to domains now, as that
3358	 * would eliminate the old mappings. So skip this part and defer
3359	 * the assignment to device driver initialization time.
3360	 */
3361	if (copied_tables)
3362		goto domains_done;
3363
3364	/*
3365	 * If pass-through is not set or not enabled, set up context entries
3366	 * for identity mappings for RMRR, GFX and ISA, possibly falling back
3367	 * to static identity mapping if iommu_identity_mapping is set.
3368	 */
3369	if (iommu_identity_mapping) {
3370		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3371		if (ret) {
3372			pr_crit("Failed to setup IOMMU pass-through\n");
3373			goto free_iommu;
3374		}
3375	}
3376	/*
3377	 * For each rmrr
3378	 *   for each dev attached to rmrr
3379	 *   do
3380	 *     locate drhd for dev, alloc domain for dev
3381	 *     allocate free domain
3382	 *     allocate page table entries for rmrr
3383	 *     if context not allocated for bus
3384	 *           allocate and init context
3385	 *           set present in root table for this bus
3386	 *     init context with domain, translation etc
3387	 *    endfor
3388	 * endfor
3389	 */
3390	pr_info("Setting RMRR:\n");
3391	for_each_rmrr_units(rmrr) {
3392		/* some BIOSes list non-existent devices in the DMAR table. */
3393		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3394					  i, dev) {
3395			ret = iommu_prepare_rmrr_dev(rmrr, dev);
3396			if (ret)
3397				pr_err("Mapping reserved region failed\n");
3398		}
3399	}
3400
3401	iommu_prepare_isa();
3402
3403domains_done:
3404
3405	/*
3406	 * for each drhd
3407	 *   enable fault log
3408	 *   global invalidate context cache
3409	 *   global invalidate iotlb
3410	 *   enable translation
3411	 */
3412	for_each_iommu(iommu, drhd) {
3413		if (drhd->ignored) {
3414			/*
3415			 * we always have to disable PMRs or DMA may fail on
3416			 * this device
3417			 */
3418			if (force_on)
3419				iommu_disable_protect_mem_regions(iommu);
3420			continue;
3421		}
3422
3423		iommu_flush_write_buffer(iommu);
3424
3425#ifdef CONFIG_INTEL_IOMMU_SVM
3426		if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3427			ret = intel_svm_enable_prq(iommu);
3428			if (ret)
3429				goto free_iommu;
3430		}
3431#endif
3432		ret = dmar_set_interrupt(iommu);
3433		if (ret)
3434			goto free_iommu;
3435
3436		if (!translation_pre_enabled(iommu))
3437			iommu_enable_translation(iommu);
3438
3439		iommu_disable_protect_mem_regions(iommu);
3440	}
3441
3442	return 0;
3443
3444free_iommu:
3445	for_each_active_iommu(iommu, drhd) {
3446		disable_dmar_iommu(iommu);
3447		free_dmar_iommu(iommu);
3448	}
3449
3450	kfree(g_iommus);
3451
3452error:
3453	return ret;
3454}
3455
3456/* This takes a number of _MM_ pages, not VTD pages */
3457static unsigned long intel_alloc_iova(struct device *dev,
3458				     struct dmar_domain *domain,
3459				     unsigned long nrpages, uint64_t dma_mask)
3460{
3461	unsigned long iova_pfn = 0;
3462
3463	/* Restrict dma_mask to the width that the iommu can handle */
3464	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3465	/* Ensure we reserve the whole size-aligned region */
3466	nrpages = __roundup_pow_of_two(nrpages);
3467
3468	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3469		/*
3470		 * First try to allocate an io virtual address in
3471		 * DMA_BIT_MASK(32) and if that fails then try allocating
3472		 * from the higher range
3473		 */
3474		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3475					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3476		if (iova_pfn)
3477			return iova_pfn;
3478	}
3479	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3480				   IOVA_PFN(dma_mask), true);
3481	if (unlikely(!iova_pfn)) {
3482		pr_err("Allocating %ld-page iova for %s failed\n",
3483		       nrpages, dev_name(dev));
3484		return 0;
3485	}
3486
3487	return iova_pfn;
3488}
3489
3490static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3491{
3492	struct dmar_domain *domain, *tmp;
3493	struct dmar_rmrr_unit *rmrr;
3494	struct device *i_dev;
3495	int i, ret;
3496
3497	domain = find_domain(dev);
3498	if (domain)
3499		goto out;
3500
3501	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3502	if (!domain)
3503		goto out;
3504
3505	/* We have a new domain - set up possible RMRRs for the device */
3506	rcu_read_lock();
3507	for_each_rmrr_units(rmrr) {
3508		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3509					  i, i_dev) {
3510			if (i_dev != dev)
3511				continue;
3512
3513			ret = domain_prepare_identity_map(dev, domain,
3514							  rmrr->base_address,
3515							  rmrr->end_address);
3516			if (ret)
3517				dev_err(dev, "Mapping reserved region failed\n");
3518		}
3519	}
3520	rcu_read_unlock();
3521
3522	tmp = set_domain_for_dev(dev, domain);
3523	if (!tmp || domain != tmp) {
3524		domain_exit(domain);
3525		domain = tmp;
3526	}
3527
3528out:
3529
3530	if (!domain)
3531		pr_err("Allocating domain for %s failed\n", dev_name(dev));
3532
3533
3534	return domain;
3535}
3536
3537/* Check if the dev needs to go through the non-identity map and unmap process. */
3538static int iommu_no_mapping(struct device *dev)
3539{
3540	int found;
3541
3542	if (iommu_dummy(dev))
3543		return 1;
3544
3545	if (!iommu_identity_mapping)
3546		return 0;
3547
3548	found = identity_mapping(dev);
3549	if (found) {
3550		if (iommu_should_identity_map(dev, 0))
3551			return 1;
3552		else {
3553			/*
3554			 * A 32 bit DMA device is removed from si_domain and falls
3555			 * back to non-identity mapping.
3556			 */
3557			dmar_remove_one_dev_info(si_domain, dev);
3558			pr_info("32bit %s uses non-identity mapping\n",
3559				dev_name(dev));
3560			return 0;
3561		}
3562	} else {
3563		/*
3564		 * In case a 64 bit DMA device is detached from a VM, the device
3565		 * is put into si_domain for identity mapping.
3566		 */
3567		if (iommu_should_identity_map(dev, 0)) {
3568			int ret;
3569			ret = domain_add_dev_info(si_domain, dev);
3570			if (!ret) {
3571				pr_info("64bit %s uses identity mapping\n",
3572					dev_name(dev));
3573				return 1;
3574			}
3575		}
3576	}
3577
3578	return 0;
3579}
3580
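/*
 * Map @size bytes at @paddr for DMA: allocate an IOVA range below
 * @dma_mask, map it in the device's domain and return the resulting bus
 * address (or the physical address itself for identity-mapped devices).
 */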
3581static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3582				     size_t size, int dir, u64 dma_mask)
3583{
3584	struct dmar_domain *domain;
3585	phys_addr_t start_paddr;
3586	unsigned long iova_pfn;
3587	int prot = 0;
3588	int ret;
3589	struct intel_iommu *iommu;
3590	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3591
3592	BUG_ON(dir == DMA_NONE);
3593
3594	if (iommu_no_mapping(dev))
3595		return paddr;
3596
3597	domain = get_valid_domain_for_dev(dev);
3598	if (!domain)
3599		return 0;
3600
3601	iommu = domain_get_iommu(domain);
3602	size = aligned_nrpages(paddr, size);
3603
3604	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3605	if (!iova_pfn)
3606		goto error;
3607
3608	/*
3609	 * Check if DMAR supports zero-length reads on write only
3610	 * mappings.
3611	 */
3612	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3613			!cap_zlr(iommu->cap))
3614		prot |= DMA_PTE_READ;
3615	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3616		prot |= DMA_PTE_WRITE;
3617	/*
3618	 * paddr through (paddr + size) might span a partial page; we should
3619	 * map the whole page.  Note: if two parts of one page are separately
3620	 * mapped, we might have two guest_addrs mapping to the same host
3621	 * paddr, but this is not a big problem
3622	 */
3623	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3624				 mm_to_dma_pfn(paddr_pfn), size, prot);
3625	if (ret)
3626		goto error;
3627
3628	/* it's a non-present to present mapping. Only flush if caching mode */
3629	if (cap_caching_mode(iommu->cap))
3630		iommu_flush_iotlb_psi(iommu, domain,
3631				      mm_to_dma_pfn(iova_pfn),
3632				      size, 0, 1);
3633	else
3634		iommu_flush_write_buffer(iommu);
3635
3636	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3637	start_paddr += paddr & ~PAGE_MASK;
3638	return start_paddr;
3639
3640error:
3641	if (iova_pfn)
3642		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3643	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3644		dev_name(dev), size, (unsigned long long)paddr, dir);
3645	return 0;
3646}
3647
3648static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3649				 unsigned long offset, size_t size,
3650				 enum dma_data_direction dir,
3651				 unsigned long attrs)
3652{
3653	return __intel_map_single(dev, page_to_phys(page) + offset, size,
3654				  dir, *dev->dma_mask);
3655}
3656
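/*
 * Tear down the mapping at @dev_addr: unmap the page-table range and
 * either flush the IOTLB immediately (strict mode) or queue the IOVA and
 * freed page list for deferred release.
 */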
3657static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3658{
3659	struct dmar_domain *domain;
3660	unsigned long start_pfn, last_pfn;
3661	unsigned long nrpages;
3662	unsigned long iova_pfn;
3663	struct intel_iommu *iommu;
3664	struct page *freelist;
3665
3666	if (iommu_no_mapping(dev))
3667		return;
3668
3669	domain = find_domain(dev);
3670	BUG_ON(!domain);
3671
3672	iommu = domain_get_iommu(domain);
3673
3674	iova_pfn = IOVA_PFN(dev_addr);
3675
3676	nrpages = aligned_nrpages(dev_addr, size);
3677	start_pfn = mm_to_dma_pfn(iova_pfn);
3678	last_pfn = start_pfn + nrpages - 1;
3679
3680	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3681		 dev_name(dev), start_pfn, last_pfn);
3682
3683	freelist = domain_unmap(domain, start_pfn, last_pfn);
3684
3685	if (intel_iommu_strict) {
3686		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3687				      nrpages, !freelist, 0);
3688		/* free iova */
3689		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3690		dma_free_pagelist(freelist);
3691	} else {
3692		queue_iova(&domain->iovad, iova_pfn, nrpages,
3693			   (unsigned long)freelist);
3694		/*
3695		 * queue up the release of the unmap to save the roughly 1/6th
3696		 * of the CPU time used up by the iotlb flush operation...
3697		 */
3698	}
3699}
3700
3701static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3702			     size_t size, enum dma_data_direction dir,
3703			     unsigned long attrs)
3704{
3705	intel_unmap(dev, dev_addr, size);
3706}
3707
3708static void *intel_alloc_coherent(struct device *dev, size_t size,
3709				  dma_addr_t *dma_handle, gfp_t flags,
3710				  unsigned long attrs)
3711{
3712	void *vaddr;
3713
3714	vaddr = dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3715	if (iommu_no_mapping(dev) || !vaddr)
3716		return vaddr;
3717
3718	*dma_handle = __intel_map_single(dev, virt_to_phys(vaddr),
3719			PAGE_ALIGN(size), DMA_BIDIRECTIONAL,
3720			dev->coherent_dma_mask);
3721	if (!*dma_handle)
3722		goto out_free_pages;
3723	return vaddr;
3724
3725out_free_pages:
3726	dma_direct_free(dev, size, vaddr, *dma_handle, attrs);
3727	return NULL;
3728}
3729
3730static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3731				dma_addr_t dma_handle, unsigned long attrs)
3732{
3733	if (!iommu_no_mapping(dev))
3734		intel_unmap(dev, dma_handle, PAGE_ALIGN(size));
3735	dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3736}
3737
3738static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3739			   int nelems, enum dma_data_direction dir,
3740			   unsigned long attrs)
3741{
3742	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3743	unsigned long nrpages = 0;
3744	struct scatterlist *sg;
3745	int i;
3746
3747	for_each_sg(sglist, sg, nelems, i) {
3748		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3749	}
3750
3751	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3752}
3753
3754static int intel_nontranslate_map_sg(struct device *hddev,
3755	struct scatterlist *sglist, int nelems, int dir)
3756{
3757	int i;
3758	struct scatterlist *sg;
3759
3760	for_each_sg(sglist, sg, nelems, i) {
3761		BUG_ON(!sg_page(sg));
3762		sg->dma_address = sg_phys(sg);
3763		sg->dma_length = sg->length;
3764	}
3765	return nelems;
3766}
3767
3768static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3769			enum dma_data_direction dir, unsigned long attrs)
3770{
3771	int i;
3772	struct dmar_domain *domain;
3773	size_t size = 0;
3774	int prot = 0;
3775	unsigned long iova_pfn;
3776	int ret;
3777	struct scatterlist *sg;
3778	unsigned long start_vpfn;
3779	struct intel_iommu *iommu;
3780
3781	BUG_ON(dir == DMA_NONE);
3782	if (iommu_no_mapping(dev))
3783		return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3784
3785	domain = get_valid_domain_for_dev(dev);
3786	if (!domain)
3787		return 0;
3788
3789	iommu = domain_get_iommu(domain);
3790
3791	for_each_sg(sglist, sg, nelems, i)
3792		size += aligned_nrpages(sg->offset, sg->length);
3793
3794	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3795				*dev->dma_mask);
3796	if (!iova_pfn) {
3797		sglist->dma_length = 0;
3798		return 0;
3799	}
3800
3801	/*
3802	 * Check if DMAR supports zero-length reads on write only
3803	 * mappings.
3804	 */
3805	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3806			!cap_zlr(iommu->cap))
3807		prot |= DMA_PTE_READ;
3808	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3809		prot |= DMA_PTE_WRITE;
3810
3811	start_vpfn = mm_to_dma_pfn(iova_pfn);
3812
3813	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3814	if (unlikely(ret)) {
3815		dma_pte_free_pagetable(domain, start_vpfn,
3816				       start_vpfn + size - 1,
3817				       agaw_to_level(domain->agaw) + 1);
3818		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3819		return 0;
3820	}
3821
3822	/* it's a non-present to present mapping. Only flush if caching mode */
3823	if (cap_caching_mode(iommu->cap))
3824		iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3825	else
3826		iommu_flush_write_buffer(iommu);
3827
3828	return nelems;
3829}
3830
3831static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3832{
3833	return !dma_addr;
3834}
3835
3836const struct dma_map_ops intel_dma_ops = {
3837	.alloc = intel_alloc_coherent,
3838	.free = intel_free_coherent,
3839	.map_sg = intel_map_sg,
3840	.unmap_sg = intel_unmap_sg,
3841	.map_page = intel_map_page,
3842	.unmap_page = intel_unmap_page,
3843	.mapping_error = intel_mapping_error,
3844#ifdef CONFIG_X86
3845	.dma_supported = dma_direct_supported,
3846#endif
3847};
3848
3849static inline int iommu_domain_cache_init(void)
3850{
3851	int ret = 0;
3852
3853	iommu_domain_cache = kmem_cache_create("iommu_domain",
3854					 sizeof(struct dmar_domain),
3855					 0,
3856					 SLAB_HWCACHE_ALIGN,
3857					 NULL);
3858
3859	if (!iommu_domain_cache) {
3860		pr_err("Couldn't create iommu_domain cache\n");
3861		ret = -ENOMEM;
3862	}
3863
3864	return ret;
3865}
3866
3867static inline int iommu_devinfo_cache_init(void)
3868{
3869	int ret = 0;
3870
3871	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3872					 sizeof(struct device_domain_info),
3873					 0,
3874					 SLAB_HWCACHE_ALIGN,
3875					 NULL);
3876	if (!iommu_devinfo_cache) {
3877		pr_err("Couldn't create devinfo cache\n");
3878		ret = -ENOMEM;
3879	}
3880
3881	return ret;
3882}
3883
3884static int __init iommu_init_mempool(void)
3885{
3886	int ret;
3887	ret = iova_cache_get();
3888	if (ret)
3889		return ret;
3890
3891	ret = iommu_domain_cache_init();
3892	if (ret)
3893		goto domain_error;
3894
3895	ret = iommu_devinfo_cache_init();
3896	if (!ret)
3897		return ret;
3898
3899	kmem_cache_destroy(iommu_domain_cache);
3900domain_error:
3901	iova_cache_put();
3902
3903	return -ENOMEM;
3904}
3905
3906static void __init iommu_exit_mempool(void)
3907{
3908	kmem_cache_destroy(iommu_devinfo_cache);
3909	kmem_cache_destroy(iommu_domain_cache);
3910	iova_cache_put();
3911}
3912
3913static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3914{
3915	struct dmar_drhd_unit *drhd;
3916	u32 vtbar;
3917	int rc;
3918
3919	/* We know that this device on this chipset has its own IOMMU.
3920	 * If we find it under a different IOMMU, then the BIOS is lying
3921	 * to us. Hope that the IOMMU for this device is actually
3922	 * disabled, and it needs no translation...
3923	 */
3924	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3925	if (rc) {
3926		/* "can't" happen */
3927		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3928		return;
3929	}
3930	vtbar &= 0xffff0000;
3931
3932	/* we know that this iommu should be at offset 0xa000 from vtbar */
3933	drhd = dmar_find_matched_drhd_unit(pdev);
3934	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3935			    TAINT_FIRMWARE_WORKAROUND,
3936			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3937		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3938}
3939DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
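/*
 * Worked example of the VTBAR check above; the register values are
 * hypothetical.  If the chipset register at config offset 0xb0 reads
 * 0xfed91234, masking keeps 0xfed90000, and a DRHD whose reg_base_addr is
 * 0xfed9a000 satisfies reg_base_addr - vtbar == 0xa000, so the BIOS tables
 * are consistent; any other distance triggers the firmware-workaround
 * taint and the device is marked with DUMMY_DEVICE_DOMAIN_INFO.
 */
static inline bool sketch_ioat_vtbar_consistent(u64 reg_base_addr, u32 vtbar)
{
	/* the quirk expects this IOMMU exactly 0xa000 above VTBAR */
	return reg_base_addr - (vtbar & 0xffff0000) == 0xa000;
}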
3940
3941static void __init init_no_remapping_devices(void)
3942{
3943	struct dmar_drhd_unit *drhd;
3944	struct device *dev;
3945	int i;
3946
3947	for_each_drhd_unit(drhd) {
3948		if (!drhd->include_all) {
3949			for_each_active_dev_scope(drhd->devices,
3950						  drhd->devices_cnt, i, dev)
3951				break;
3952			/* ignore DMAR unit if no devices exist */
3953			if (i == drhd->devices_cnt)
3954				drhd->ignored = 1;
3955		}
3956	}
3957
3958	for_each_active_drhd_unit(drhd) {
3959		if (drhd->include_all)
3960			continue;
3961
3962		for_each_active_dev_scope(drhd->devices,
3963					  drhd->devices_cnt, i, dev)
3964			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3965				break;
3966		if (i < drhd->devices_cnt)
3967			continue;
3968
3969		/* This IOMMU has *only* gfx devices. Either bypass it or
3970		   set the gfx_mapped flag, as appropriate */
3971		if (dmar_map_gfx) {
3972			intel_iommu_gfx_mapped = 1;
3973		} else {
3974			drhd->ignored = 1;
3975			for_each_active_dev_scope(drhd->devices,
3976						  drhd->devices_cnt, i, dev)
3977				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3978		}
3979	}
3980}
3981
3982#ifdef CONFIG_SUSPEND
3983static int init_iommu_hw(void)
3984{
3985	struct dmar_drhd_unit *drhd;
3986	struct intel_iommu *iommu = NULL;
3987
3988	for_each_active_iommu(iommu, drhd)
3989		if (iommu->qi)
3990			dmar_reenable_qi(iommu);
3991
3992	for_each_iommu(iommu, drhd) {
3993		if (drhd->ignored) {
3994			/*
3995			 * we always have to disable PMRs or DMA may fail on
3996			 * this device
3997			 */
3998			if (force_on)
3999				iommu_disable_protect_mem_regions(iommu);
4000			continue;
4001		}
4002
4003		iommu_flush_write_buffer(iommu);
4004
4005		iommu_set_root_entry(iommu);
4006
4007		iommu->flush.flush_context(iommu, 0, 0, 0,
4008					   DMA_CCMD_GLOBAL_INVL);
4009		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4010		iommu_enable_translation(iommu);
4011		iommu_disable_protect_mem_regions(iommu);
4012	}
4013
4014	return 0;
4015}
4016
4017static void iommu_flush_all(void)
4018{
4019	struct dmar_drhd_unit *drhd;
4020	struct intel_iommu *iommu;
4021
4022	for_each_active_iommu(iommu, drhd) {
4023		iommu->flush.flush_context(iommu, 0, 0, 0,
4024					   DMA_CCMD_GLOBAL_INVL);
4025		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4026					 DMA_TLB_GLOBAL_FLUSH);
4027	}
4028}
4029
4030static int iommu_suspend(void)
4031{
4032	struct dmar_drhd_unit *drhd;
4033	struct intel_iommu *iommu = NULL;
4034	unsigned long flag;
4035
4036	for_each_active_iommu(iommu, drhd) {
4037		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4038						 GFP_ATOMIC);
4039		if (!iommu->iommu_state)
4040			goto nomem;
4041	}
4042
4043	iommu_flush_all();
4044
4045	for_each_active_iommu(iommu, drhd) {
4046		iommu_disable_translation(iommu);
4047
4048		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4049
4050		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4051			readl(iommu->reg + DMAR_FECTL_REG);
4052		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4053			readl(iommu->reg + DMAR_FEDATA_REG);
4054		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4055			readl(iommu->reg + DMAR_FEADDR_REG);
4056		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4057			readl(iommu->reg + DMAR_FEUADDR_REG);
4058
4059		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4060	}
4061	return 0;
4062
4063nomem:
4064	for_each_active_iommu(iommu, drhd)
4065		kfree(iommu->iommu_state);
4066
4067	return -ENOMEM;
4068}
4069
4070static void iommu_resume(void)
4071{
4072	struct dmar_drhd_unit *drhd;
4073	struct intel_iommu *iommu = NULL;
4074	unsigned long flag;
4075
4076	if (init_iommu_hw()) {
4077		if (force_on)
4078			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4079		else
4080			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4081		return;
4082	}
4083
4084	for_each_active_iommu(iommu, drhd) {
4085
4086		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4087
4088		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4089			iommu->reg + DMAR_FECTL_REG);
4090		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4091			iommu->reg + DMAR_FEDATA_REG);
4092		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4093			iommu->reg + DMAR_FEADDR_REG);
4094		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4095			iommu->reg + DMAR_FEUADDR_REG);
4096
4097		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4098	}
4099
4100	for_each_active_iommu(iommu, drhd)
4101		kfree(iommu->iommu_state);
4102}
4103
4104static struct syscore_ops iommu_syscore_ops = {
4105	.resume		= iommu_resume,
4106	.suspend	= iommu_suspend,
4107};
4108
4109static void __init init_iommu_pm_ops(void)
4110{
4111	register_syscore_ops(&iommu_syscore_ops);
4112}
4113
4114#else
4115static inline void init_iommu_pm_ops(void) {}
4116#endif	/* CONFIG_SUSPEND */
4117
4118
4119int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4120{
4121	struct acpi_dmar_reserved_memory *rmrr;
4122	int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4123	struct dmar_rmrr_unit *rmrru;
4124	size_t length;
4125
4126	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4127	if (!rmrru)
4128		goto out;
4129
4130	rmrru->hdr = header;
4131	rmrr = (struct acpi_dmar_reserved_memory *)header;
4132	rmrru->base_address = rmrr->base_address;
4133	rmrru->end_address = rmrr->end_address;
4134
4135	length = rmrr->end_address - rmrr->base_address + 1;
4136	rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4137					      IOMMU_RESV_DIRECT);
4138	if (!rmrru->resv)
4139		goto free_rmrru;
4140
4141	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4142				((void *)rmrr) + rmrr->header.length,
4143				&rmrru->devices_cnt);
4144	if (rmrru->devices_cnt && rmrru->devices == NULL)
4145		goto free_all;
4146
4147	list_add(&rmrru->list, &dmar_rmrr_units);
4148
4149	return 0;
4150free_all:
4151	kfree(rmrru->resv);
4152free_rmrru:
4153	kfree(rmrru);
4154out:
4155	return -ENOMEM;
4156}
4157
4158static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4159{
4160	struct dmar_atsr_unit *atsru;
4161	struct acpi_dmar_atsr *tmp;
4162
4163	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4164		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4165		if (atsr->segment != tmp->segment)
4166			continue;
4167		if (atsr->header.length != tmp->header.length)
4168			continue;
4169		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4170			return atsru;
4171	}
4172
4173	return NULL;
4174}
4175
4176int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4177{
4178	struct acpi_dmar_atsr *atsr;
4179	struct dmar_atsr_unit *atsru;
4180
4181	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4182		return 0;
4183
4184	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4185	atsru = dmar_find_atsr(atsr);
4186	if (atsru)
4187		return 0;
4188
4189	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4190	if (!atsru)
4191		return -ENOMEM;
4192
4193	/*
4194	 * If memory is allocated from slab by ACPI _DSM method, we need to
4195	 * copy the memory content because the memory buffer will be freed
4196	 * on return.
4197	 */
4198	atsru->hdr = (void *)(atsru + 1);
4199	memcpy(atsru->hdr, hdr, hdr->length);
4200	atsru->include_all = atsr->flags & 0x1;
4201	if (!atsru->include_all) {
4202		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4203				(void *)atsr + atsr->header.length,
4204				&atsru->devices_cnt);
4205		if (atsru->devices_cnt && atsru->devices == NULL) {
4206			kfree(atsru);
4207			return -ENOMEM;
4208		}
4209	}
4210
4211	list_add_rcu(&atsru->list, &dmar_atsr_units);
4212
4213	return 0;
4214}
4215
4216static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4217{
4218	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4219	kfree(atsru);
4220}
4221
4222int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4223{
4224	struct acpi_dmar_atsr *atsr;
4225	struct dmar_atsr_unit *atsru;
4226
4227	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4228	atsru = dmar_find_atsr(atsr);
4229	if (atsru) {
4230		list_del_rcu(&atsru->list);
4231		synchronize_rcu();
4232		intel_iommu_free_atsr(atsru);
4233	}
4234
4235	return 0;
4236}
4237
4238int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4239{
4240	int i;
4241	struct device *dev;
4242	struct acpi_dmar_atsr *atsr;
4243	struct dmar_atsr_unit *atsru;
4244
4245	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4246	atsru = dmar_find_atsr(atsr);
4247	if (!atsru)
4248		return 0;
4249
4250	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4251		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4252					  i, dev)
4253			return -EBUSY;
4254	}
4255
4256	return 0;
4257}
4258
4259static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4260{
4261	int sp, ret = 0;
4262	struct intel_iommu *iommu = dmaru->iommu;
4263
4264	if (g_iommus[iommu->seq_id])
4265		return 0;
4266
4267	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4268		pr_warn("%s: Doesn't support hardware pass through.\n",
4269			iommu->name);
4270		return -ENXIO;
4271	}
4272	if (!ecap_sc_support(iommu->ecap) &&
4273	    domain_update_iommu_snooping(iommu)) {
4274		pr_warn("%s: Doesn't support snooping.\n",
4275			iommu->name);
4276		return -ENXIO;
4277	}
4278	sp = domain_update_iommu_superpage(iommu) - 1;
4279	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4280		pr_warn("%s: Doesn't support large page.\n",
4281			iommu->name);
4282		return -ENXIO;
4283	}
4284
4285	/*
4286	 * Disable translation if already enabled prior to OS handover.
4287	 */
4288	if (iommu->gcmd & DMA_GCMD_TE)
4289		iommu_disable_translation(iommu);
4290
4291	g_iommus[iommu->seq_id] = iommu;
4292	ret = iommu_init_domains(iommu);
4293	if (ret == 0)
4294		ret = iommu_alloc_root_entry(iommu);
4295	if (ret)
4296		goto out;
4297
4298#ifdef CONFIG_INTEL_IOMMU_SVM
4299	if (pasid_enabled(iommu))
4300		intel_svm_alloc_pasid_tables(iommu);
4301#endif
4302
4303	if (dmaru->ignored) {
4304		/*
4305		 * we always have to disable PMRs or DMA may fail on this device
4306		 */
4307		if (force_on)
4308			iommu_disable_protect_mem_regions(iommu);
4309		return 0;
4310	}
4311
4312	intel_iommu_init_qi(iommu);
4313	iommu_flush_write_buffer(iommu);
4314
4315#ifdef CONFIG_INTEL_IOMMU_SVM
4316	if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4317		ret = intel_svm_enable_prq(iommu);
4318		if (ret)
4319			goto disable_iommu;
4320	}
4321#endif
4322	ret = dmar_set_interrupt(iommu);
4323	if (ret)
4324		goto disable_iommu;
4325
4326	iommu_set_root_entry(iommu);
4327	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4328	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4329	iommu_enable_translation(iommu);
4330
4331	iommu_disable_protect_mem_regions(iommu);
4332	return 0;
4333
4334disable_iommu:
4335	disable_dmar_iommu(iommu);
4336out:
4337	free_dmar_iommu(iommu);
4338	return ret;
4339}
4340
4341int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4342{
4343	int ret = 0;
4344	struct intel_iommu *iommu = dmaru->iommu;
4345
4346	if (!intel_iommu_enabled)
4347		return 0;
4348	if (iommu == NULL)
4349		return -EINVAL;
4350
4351	if (insert) {
4352		ret = intel_iommu_add(dmaru);
4353	} else {
4354		disable_dmar_iommu(iommu);
4355		free_dmar_iommu(iommu);
4356	}
4357
4358	return ret;
4359}
4360
4361static void intel_iommu_free_dmars(void)
4362{
4363	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4364	struct dmar_atsr_unit *atsru, *atsr_n;
4365
4366	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4367		list_del(&rmrru->list);
4368		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4369		kfree(rmrru->resv);
4370		kfree(rmrru);
4371	}
4372
4373	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4374		list_del(&atsru->list);
4375		intel_iommu_free_atsr(atsru);
4376	}
4377}
4378
4379int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4380{
4381	int i, ret = 1;
4382	struct pci_bus *bus;
4383	struct pci_dev *bridge = NULL;
4384	struct device *tmp;
4385	struct acpi_dmar_atsr *atsr;
4386	struct dmar_atsr_unit *atsru;
4387
4388	dev = pci_physfn(dev);
4389	for (bus = dev->bus; bus; bus = bus->parent) {
4390		bridge = bus->self;
4391		/* If it's an integrated device, allow ATS */
4392		if (!bridge)
4393			return 1;
4394		/* Connected via non-PCIe: no ATS */
4395		if (!pci_is_pcie(bridge) ||
4396		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4397			return 0;
4398		/* If we found the root port, look it up in the ATSR */
4399		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4400			break;
4401	}
4402
4403	rcu_read_lock();
4404	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4405		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4406		if (atsr->segment != pci_domain_nr(dev->bus))
4407			continue;
4408
4409		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4410			if (tmp == &bridge->dev)
4411				goto out;
4412
4413		if (atsru->include_all)
4414			goto out;
4415	}
4416	ret = 0;
4417out:
4418	rcu_read_unlock();
4419
4420	return ret;
4421}
4422
4423int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4424{
4425	int ret = 0;
4426	struct dmar_rmrr_unit *rmrru;
4427	struct dmar_atsr_unit *atsru;
4428	struct acpi_dmar_atsr *atsr;
4429	struct acpi_dmar_reserved_memory *rmrr;
4430
4431	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4432		return 0;
4433
4434	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4435		rmrr = container_of(rmrru->hdr,
4436				    struct acpi_dmar_reserved_memory, header);
4437		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4438			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4439				((void *)rmrr) + rmrr->header.length,
4440				rmrr->segment, rmrru->devices,
4441				rmrru->devices_cnt);
4442			if (ret < 0)
4443				return ret;
4444		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4445			dmar_remove_dev_scope(info, rmrr->segment,
4446				rmrru->devices, rmrru->devices_cnt);
4447		}
4448	}
4449
4450	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4451		if (atsru->include_all)
4452			continue;
4453
4454		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4455		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4456			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4457					(void *)atsr + atsr->header.length,
4458					atsr->segment, atsru->devices,
4459					atsru->devices_cnt);
4460			if (ret > 0)
4461				break;
4462			else if (ret < 0)
4463				return ret;
4464		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4465			if (dmar_remove_dev_scope(info, atsr->segment,
4466					atsru->devices, atsru->devices_cnt))
4467				break;
4468		}
4469	}
4470
4471	return 0;
4472}
4473
4474/*
4475 * Here we only respond to a device being unbound from its driver.
4476 *
4477 * A newly added device is not attached to its DMAR domain here yet; that
4478 * happens when the device is first mapped to an iova.
4479 */
4480static int device_notifier(struct notifier_block *nb,
4481				  unsigned long action, void *data)
4482{
4483	struct device *dev = data;
4484	struct dmar_domain *domain;
4485
4486	if (iommu_dummy(dev))
4487		return 0;
4488
4489	if (action != BUS_NOTIFY_REMOVED_DEVICE)
4490		return 0;
4491
4492	domain = find_domain(dev);
4493	if (!domain)
4494		return 0;
4495
4496	dmar_remove_one_dev_info(domain, dev);
4497	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4498		domain_exit(domain);
4499
4500	return 0;
4501}
4502
4503static struct notifier_block device_nb = {
4504	.notifier_call = device_notifier,
4505};
4506
4507static int intel_iommu_memory_notifier(struct notifier_block *nb,
4508				       unsigned long val, void *v)
4509{
4510	struct memory_notify *mhp = v;
4511	unsigned long long start, end;
4512	unsigned long start_vpfn, last_vpfn;
4513
4514	switch (val) {
4515	case MEM_GOING_ONLINE:
4516		start = mhp->start_pfn << PAGE_SHIFT;
4517		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4518		if (iommu_domain_identity_map(si_domain, start, end)) {
4519			pr_warn("Failed to build identity map for [%llx-%llx]\n",
4520				start, end);
4521			return NOTIFY_BAD;
4522		}
4523		break;
4524
4525	case MEM_OFFLINE:
4526	case MEM_CANCEL_ONLINE:
4527		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4528		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4529		while (start_vpfn <= last_vpfn) {
4530			struct iova *iova;
4531			struct dmar_drhd_unit *drhd;
4532			struct intel_iommu *iommu;
4533			struct page *freelist;
4534
4535			iova = find_iova(&si_domain->iovad, start_vpfn);
4536			if (iova == NULL) {
4537				pr_debug("Failed to get IOVA for PFN %lx\n",
4538					 start_vpfn);
4539				break;
4540			}
4541
4542			iova = split_and_remove_iova(&si_domain->iovad, iova,
4543						     start_vpfn, last_vpfn);
4544			if (iova == NULL) {
4545				pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4546					start_vpfn, last_vpfn);
4547				return NOTIFY_BAD;
4548			}
4549
4550			freelist = domain_unmap(si_domain, iova->pfn_lo,
4551					       iova->pfn_hi);
4552
4553			rcu_read_lock();
4554			for_each_active_iommu(iommu, drhd)
4555				iommu_flush_iotlb_psi(iommu, si_domain,
4556					iova->pfn_lo, iova_size(iova),
4557					!freelist, 0);
4558			rcu_read_unlock();
4559			dma_free_pagelist(freelist);
4560
4561			start_vpfn = iova->pfn_hi + 1;
4562			free_iova_mem(iova);
4563		}
4564		break;
4565	}
4566
4567	return NOTIFY_OK;
4568}
4569
4570static struct notifier_block intel_iommu_memory_nb = {
4571	.notifier_call = intel_iommu_memory_notifier,
4572	.priority = 0
4573};
4574
4575static void free_all_cpu_cached_iovas(unsigned int cpu)
4576{
4577	int i;
4578
4579	for (i = 0; i < g_num_of_iommus; i++) {
4580		struct intel_iommu *iommu = g_iommus[i];
4581		struct dmar_domain *domain;
4582		int did;
4583
4584		if (!iommu)
4585			continue;
4586
4587		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4588			domain = get_iommu_domain(iommu, (u16)did);
4589
4590			if (!domain)
4591				continue;
4592			free_cpu_cached_iovas(cpu, &domain->iovad);
4593		}
4594	}
4595}
4596
4597static int intel_iommu_cpu_dead(unsigned int cpu)
4598{
4599	free_all_cpu_cached_iovas(cpu);
4600	return 0;
4601}
4602
4603static void intel_disable_iommus(void)
4604{
4605	struct intel_iommu *iommu = NULL;
4606	struct dmar_drhd_unit *drhd;
4607
4608	for_each_iommu(iommu, drhd)
4609		iommu_disable_translation(iommu);
4610}
4611
4612static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4613{
4614	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4615
4616	return container_of(iommu_dev, struct intel_iommu, iommu);
4617}
4618
4619static ssize_t intel_iommu_show_version(struct device *dev,
4620					struct device_attribute *attr,
4621					char *buf)
4622{
4623	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4624	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4625	return sprintf(buf, "%d:%d\n",
4626		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4627}
4628static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4629
4630static ssize_t intel_iommu_show_address(struct device *dev,
4631					struct device_attribute *attr,
4632					char *buf)
4633{
4634	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4635	return sprintf(buf, "%llx\n", iommu->reg_phys);
4636}
4637static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4638
4639static ssize_t intel_iommu_show_cap(struct device *dev,
4640				    struct device_attribute *attr,
4641				    char *buf)
4642{
4643	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4644	return sprintf(buf, "%llx\n", iommu->cap);
4645}
4646static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4647
4648static ssize_t intel_iommu_show_ecap(struct device *dev,
4649				    struct device_attribute *attr,
4650				    char *buf)
4651{
4652	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4653	return sprintf(buf, "%llx\n", iommu->ecap);
4654}
4655static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4656
4657static ssize_t intel_iommu_show_ndoms(struct device *dev,
4658				      struct device_attribute *attr,
4659				      char *buf)
4660{
4661	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4662	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4663}
4664static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4665
4666static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4667					   struct device_attribute *attr,
4668					   char *buf)
4669{
4670	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4671	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4672						  cap_ndoms(iommu->cap)));
4673}
4674static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4675
4676static struct attribute *intel_iommu_attrs[] = {
4677	&dev_attr_version.attr,
4678	&dev_attr_address.attr,
4679	&dev_attr_cap.attr,
4680	&dev_attr_ecap.attr,
4681	&dev_attr_domains_supported.attr,
4682	&dev_attr_domains_used.attr,
4683	NULL,
4684};
4685
4686static struct attribute_group intel_iommu_group = {
4687	.name = "intel-iommu",
4688	.attrs = intel_iommu_attrs,
4689};
4690
4691const struct attribute_group *intel_iommu_groups[] = {
4692	&intel_iommu_group,
4693	NULL,
4694};
4695
4696int __init intel_iommu_init(void)
4697{
4698	int ret = -ENODEV;
4699	struct dmar_drhd_unit *drhd;
4700	struct intel_iommu *iommu;
4701
4702	/* VT-d is required for a TXT/tboot launch, so enforce that */
4703	force_on = tboot_force_iommu();
4704
4705	if (iommu_init_mempool()) {
4706		if (force_on)
4707			panic("tboot: Failed to initialize iommu memory\n");
4708		return -ENOMEM;
4709	}
4710
4711	down_write(&dmar_global_lock);
4712	if (dmar_table_init()) {
4713		if (force_on)
4714			panic("tboot: Failed to initialize DMAR table\n");
4715		goto out_free_dmar;
4716	}
4717
4718	if (dmar_dev_scope_init() < 0) {
4719		if (force_on)
4720			panic("tboot: Failed to initialize DMAR device scope\n");
4721		goto out_free_dmar;
4722	}
4723
4724	up_write(&dmar_global_lock);
4725
4726	/*
4727	 * The bus notifier takes the dmar_global_lock, so lockdep will
4728	 * complain later when we register it under the lock.
4729	 */
4730	dmar_register_bus_notifier();
4731
4732	down_write(&dmar_global_lock);
4733
4734	if (no_iommu || dmar_disabled) {
4735		/*
4736		 * We exit the function here to ensure the IOMMU's remapping and
4737		 * mempool aren't set up, which means that the IOMMU's PMRs
4738		 * won't be disabled via the call to init_dmars(). So disable
4739		 * them explicitly here. The PMRs were set up by tboot prior to
4740		 * calling SENTER, but the kernel is expected to reset/tear
4741		 * down the PMRs.
4742		 */
4743		if (intel_iommu_tboot_noforce) {
4744			for_each_iommu(iommu, drhd)
4745				iommu_disable_protect_mem_regions(iommu);
4746		}
4747
4748		/*
4749		 * Make sure the IOMMUs are switched off, even when we
4750		 * boot into a kexec kernel and the previous kernel left
4751		 * them enabled
4752		 */
4753		intel_disable_iommus();
4754		goto out_free_dmar;
4755	}
4756
4757	if (list_empty(&dmar_rmrr_units))
4758		pr_info("No RMRR found\n");
4759
4760	if (list_empty(&dmar_atsr_units))
4761		pr_info("No ATSR found\n");
4762
4763	if (dmar_init_reserved_ranges()) {
4764		if (force_on)
4765			panic("tboot: Failed to reserve iommu ranges\n");
4766		goto out_free_reserved_range;
4767	}
4768
4769	init_no_remapping_devices();
4770
4771	ret = init_dmars();
4772	if (ret) {
4773		if (force_on)
4774			panic("tboot: Failed to initialize DMARs\n");
4775		pr_err("Initialization failed\n");
4776		goto out_free_reserved_range;
4777	}
4778	up_write(&dmar_global_lock);
4779	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4780
4781#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4782	swiotlb = 0;
4783#endif
4784	dma_ops = &intel_dma_ops;
4785
4786	init_iommu_pm_ops();
4787
4788	for_each_active_iommu(iommu, drhd) {
4789		iommu_device_sysfs_add(&iommu->iommu, NULL,
4790				       intel_iommu_groups,
4791				       "%s", iommu->name);
4792		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4793		iommu_device_register(&iommu->iommu);
4794	}
4795
4796	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4797	bus_register_notifier(&pci_bus_type, &device_nb);
4798	if (si_domain && !hw_pass_through)
4799		register_memory_notifier(&intel_iommu_memory_nb);
4800	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4801			  intel_iommu_cpu_dead);
4802	intel_iommu_enabled = 1;
4803
4804	return 0;
4805
4806out_free_reserved_range:
4807	put_iova_domain(&reserved_iova_list);
4808out_free_dmar:
4809	intel_iommu_free_dmars();
4810	up_write(&dmar_global_lock);
4811	iommu_exit_mempool();
4812	return ret;
4813}
4814
4815static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4816{
4817	struct intel_iommu *iommu = opaque;
4818
4819	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4820	return 0;
4821}
4822
4823/*
4824 * NB - intel-iommu lacks any sort of reference counting for the users of
4825 * dependent devices.  If multiple endpoints have intersecting dependent
4826 * devices, unbinding the driver from any one of them will possibly leave
4827 * the others unable to operate.
4828 */
4829static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4830{
4831	if (!iommu || !dev || !dev_is_pci(dev))
4832		return;
4833
4834	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4835}
4836
4837static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4838{
4839	struct intel_iommu *iommu;
4840	unsigned long flags;
4841
4842	assert_spin_locked(&device_domain_lock);
4843
4844	if (WARN_ON(!info))
4845		return;
4846
4847	iommu = info->iommu;
4848
4849	if (info->dev) {
4850		iommu_disable_dev_iotlb(info);
4851		domain_context_clear(iommu, info->dev);
4852	}
4853
4854	unlink_domain_info(info);
4855
4856	spin_lock_irqsave(&iommu->lock, flags);
4857	domain_detach_iommu(info->domain, iommu);
4858	spin_unlock_irqrestore(&iommu->lock, flags);
4859
4860	free_devinfo_mem(info);
4861}
4862
4863static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4864				     struct device *dev)
4865{
4866	struct device_domain_info *info;
4867	unsigned long flags;
4868
4869	spin_lock_irqsave(&device_domain_lock, flags);
4870	info = dev->archdata.iommu;
4871	__dmar_remove_one_dev_info(info);
4872	spin_unlock_irqrestore(&device_domain_lock, flags);
4873}
4874
4875static int md_domain_init(struct dmar_domain *domain, int guest_width)
4876{
4877	int adjust_width;
4878
4879	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4880	domain_reserve_special_ranges(domain);
4881
4882	/* calculate AGAW */
4883	domain->gaw = guest_width;
4884	adjust_width = guestwidth_to_adjustwidth(guest_width);
4885	domain->agaw = width_to_agaw(adjust_width);
4886
4887	domain->iommu_coherency = 0;
4888	domain->iommu_snooping = 0;
4889	domain->iommu_superpage = 0;
4890	domain->max_addr = 0;
4891
4892	/* always allocate the top pgd */
4893	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4894	if (!domain->pgd)
4895		return -ENOMEM;
4896	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4897	return 0;
4898}
4899
4900static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4901{
4902	struct dmar_domain *dmar_domain;
4903	struct iommu_domain *domain;
4904
4905	if (type != IOMMU_DOMAIN_UNMANAGED)
4906		return NULL;
4907
4908	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4909	if (!dmar_domain) {
4910		pr_err("Can't allocate dmar_domain\n");
4911		return NULL;
4912	}
4913	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4914		pr_err("Domain initialization failed\n");
4915		domain_exit(dmar_domain);
4916		return NULL;
4917	}
4918	domain_update_iommu_cap(dmar_domain);
4919
4920	domain = &dmar_domain->domain;
4921	domain->geometry.aperture_start = 0;
4922	domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4923	domain->geometry.force_aperture = true;
4924
4925	return domain;
4926}
4927
4928static void intel_iommu_domain_free(struct iommu_domain *domain)
4929{
4930	domain_exit(to_dmar_domain(domain));
4931}
4932
4933static int intel_iommu_attach_device(struct iommu_domain *domain,
4934				     struct device *dev)
4935{
4936	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4937	struct intel_iommu *iommu;
4938	int addr_width;
4939	u8 bus, devfn;
4940
4941	if (device_is_rmrr_locked(dev)) {
4942		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4943		return -EPERM;
4944	}
4945
4946	/* normally dev is not mapped */
4947	if (unlikely(domain_context_mapped(dev))) {
4948		struct dmar_domain *old_domain;
4949
4950		old_domain = find_domain(dev);
4951		if (old_domain) {
4952			rcu_read_lock();
4953			dmar_remove_one_dev_info(old_domain, dev);
4954			rcu_read_unlock();
4955
4956			if (!domain_type_is_vm_or_si(old_domain) &&
4957			     list_empty(&old_domain->devices))
4958				domain_exit(old_domain);
4959		}
4960	}
4961
4962	iommu = device_to_iommu(dev, &bus, &devfn);
4963	if (!iommu)
4964		return -ENODEV;
4965
4966	/* check if this iommu agaw is sufficient for max mapped address */
4967	addr_width = agaw_to_width(iommu->agaw);
4968	if (addr_width > cap_mgaw(iommu->cap))
4969		addr_width = cap_mgaw(iommu->cap);
4970
4971	if (dmar_domain->max_addr > (1LL << addr_width)) {
4972		pr_err("%s: iommu width (%d) is not "
4973		       "sufficient for the mapped address (%llx)\n",
4974		       __func__, addr_width, dmar_domain->max_addr);
4975		return -EFAULT;
4976	}
4977	dmar_domain->gaw = addr_width;
4978
4979	/*
4980	 * Knock out extra levels of page tables if necessary
4981	 */
4982	while (iommu->agaw < dmar_domain->agaw) {
4983		struct dma_pte *pte;
4984
4985		pte = dmar_domain->pgd;
4986		if (dma_pte_present(pte)) {
4987			dmar_domain->pgd = (struct dma_pte *)
4988				phys_to_virt(dma_pte_addr(pte));
4989			free_pgtable_page(pte);
4990		}
4991		dmar_domain->agaw--;
4992	}
4993
4994	return domain_add_dev_info(dmar_domain, dev);
4995}
4996
4997static void intel_iommu_detach_device(struct iommu_domain *domain,
4998				      struct device *dev)
4999{
5000	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5001}
5002
5003static int intel_iommu_map(struct iommu_domain *domain,
5004			   unsigned long iova, phys_addr_t hpa,
5005			   size_t size, int iommu_prot)
5006{
5007	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5008	u64 max_addr;
5009	int prot = 0;
5010	int ret;
5011
5012	if (iommu_prot & IOMMU_READ)
5013		prot |= DMA_PTE_READ;
5014	if (iommu_prot & IOMMU_WRITE)
5015		prot |= DMA_PTE_WRITE;
5016	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5017		prot |= DMA_PTE_SNP;
5018
5019	max_addr = iova + size;
5020	if (dmar_domain->max_addr < max_addr) {
5021		u64 end;
5022
5023		/* check if minimum agaw is sufficient for mapped address */
5024		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5025		if (end < max_addr) {
5026			pr_err("%s: iommu width (%d) is not "
5027			       "sufficient for the mapped address (%llx)\n",
5028			       __func__, dmar_domain->gaw, max_addr);
5029			return -EFAULT;
5030		}
5031		dmar_domain->max_addr = max_addr;
5032	}
5033	/* Round the size up to a whole number of VT-d pages if it, combined
5034	   with the low bits of hpa, would spill onto the next page */
5035	size = aligned_nrpages(hpa, size);
5036	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5037				 hpa >> VTD_PAGE_SHIFT, size, prot);
5038	return ret;
5039}
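/*
 * Minimal sketch of the rounding performed by aligned_nrpages() above,
 * assuming 4KiB VT-d pages.  The helper below is written for this example
 * and is not necessarily identical to the driver's own implementation.
 * For instance, hpa = 0x1ff0 with size = 0x20 touches two pages even
 * though the size is far below a single page.
 */
static inline unsigned long sketch_nrpages(phys_addr_t hpa, size_t size)
{
	unsigned long offset = (unsigned long)hpa & (VTD_PAGE_SIZE - 1);

	/* pages needed to cover [offset, offset + size) */
	return ALIGN(offset + size, VTD_PAGE_SIZE) >> VTD_PAGE_SHIFT;
}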
5040
5041static size_t intel_iommu_unmap(struct iommu_domain *domain,
5042				unsigned long iova, size_t size)
5043{
5044	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5045	struct page *freelist = NULL;
5046	unsigned long start_pfn, last_pfn;
5047	unsigned int npages;
5048	int iommu_id, level = 0;
5049
5050	/* Cope with horrid API which requires us to unmap more than the
5051	   size argument if it happens to be a large-page mapping. */
5052	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5053
5054	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5055		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5056
5057	start_pfn = iova >> VTD_PAGE_SHIFT;
5058	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5059
5060	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5061
5062	npages = last_pfn - start_pfn + 1;
5063
5064	for_each_domain_iommu(iommu_id, dmar_domain)
5065		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5066				      start_pfn, npages, !freelist, 0);
5067
5068	dma_free_pagelist(freelist);
5069
5070	if (dmar_domain->max_addr == iova + size)
5071		dmar_domain->max_addr = iova;
5072
5073	return size;
5074}
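/*
 * Worked example for the size fixup above; the numbers are illustrative.
 * A request to unmap 0x1000 bytes at an IOVA that pfn_to_dma_pte() reports
 * as covered by a 2MiB superpage PTE has its size bumped to 2MiB, so
 * start_pfn..last_pfn spans 512 VT-d pages and the whole superpage is torn
 * down and flushed, matching what the hardware actually mapped.
 */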
5075
5076static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5077					    dma_addr_t iova)
5078{
5079	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5080	struct dma_pte *pte;
5081	int level = 0;
5082	u64 phys = 0;
5083
5084	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5085	if (pte)
5086		phys = dma_pte_addr(pte);
5087
5088	return phys;
5089}
5090
5091static bool intel_iommu_capable(enum iommu_cap cap)
5092{
5093	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5094		return domain_update_iommu_snooping(NULL) == 1;
5095	if (cap == IOMMU_CAP_INTR_REMAP)
5096		return irq_remapping_enabled == 1;
5097
5098	return false;
5099}
5100
5101static int intel_iommu_add_device(struct device *dev)
5102{
5103	struct intel_iommu *iommu;
5104	struct iommu_group *group;
5105	u8 bus, devfn;
5106
5107	iommu = device_to_iommu(dev, &bus, &devfn);
5108	if (!iommu)
5109		return -ENODEV;
5110
5111	iommu_device_link(&iommu->iommu, dev);
5112
5113	group = iommu_group_get_for_dev(dev);
5114
5115	if (IS_ERR(group))
5116		return PTR_ERR(group);
5117
5118	iommu_group_put(group);
5119	return 0;
5120}
5121
5122static void intel_iommu_remove_device(struct device *dev)
5123{
5124	struct intel_iommu *iommu;
5125	u8 bus, devfn;
5126
5127	iommu = device_to_iommu(dev, &bus, &devfn);
5128	if (!iommu)
5129		return;
5130
5131	iommu_group_remove_device(dev);
5132
5133	iommu_device_unlink(&iommu->iommu, dev);
5134}
5135
5136static void intel_iommu_get_resv_regions(struct device *device,
5137					 struct list_head *head)
5138{
5139	struct iommu_resv_region *reg;
5140	struct dmar_rmrr_unit *rmrr;
5141	struct device *i_dev;
5142	int i;
5143
5144	rcu_read_lock();
5145	for_each_rmrr_units(rmrr) {
5146		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5147					  i, i_dev) {
5148			if (i_dev != device)
5149				continue;
5150
5151			list_add_tail(&rmrr->resv->list, head);
5152		}
5153	}
5154	rcu_read_unlock();
5155
5156	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5157				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5158				      0, IOMMU_RESV_MSI);
5159	if (!reg)
5160		return;
5161	list_add_tail(&reg->list, head);
5162}
5163
5164static void intel_iommu_put_resv_regions(struct device *dev,
5165					 struct list_head *head)
5166{
5167	struct iommu_resv_region *entry, *next;
5168
5169	list_for_each_entry_safe(entry, next, head, list) {
5170		if (entry->type == IOMMU_RESV_RESERVED)
5171			kfree(entry);
5172	}
5173}
5174
5175#ifdef CONFIG_INTEL_IOMMU_SVM
5176#define MAX_NR_PASID_BITS (20)
5177static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5178{
5179	/*
5180	 * Convert ecap_pss to the extended context entry pts encoding, also
5181	 * respect the soft pasid_max value set by the iommu.
5182	 * - number of PASID bits = ecap_pss + 1
5183	 * - number of PASID table entries = 2^(pts + 5)
5184	 * Therefore, pts = ecap_pss - 4
5185	 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5186	 */
5187	if (ecap_pss(iommu->ecap) < 5)
5188		return 0;
5189
5190	/* pasid_max is encoded as actual number of entries not the bits */
5191	return find_first_bit((unsigned long *)&iommu->pasid_max,
5192			MAX_NR_PASID_BITS) - 5;
5193}
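/*
 * Worked example for the encoding above, using the KBL numbers from the
 * comment: ecap_pss = 0x13 means 20 PASID bits; with pasid_max left at
 * 2^20 entries, find_first_bit() returns 20 and the function yields
 * 20 - 5 = 15, i.e. 2^(15 + 5) = 2^20 PASID-table entries.
 */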
5194
5195int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5196{
5197	struct device_domain_info *info;
5198	struct context_entry *context;
5199	struct dmar_domain *domain;
5200	unsigned long flags;
5201	u64 ctx_lo;
5202	int ret;
5203
5204	domain = get_valid_domain_for_dev(sdev->dev);
5205	if (!domain)
5206		return -EINVAL;
5207
5208	spin_lock_irqsave(&device_domain_lock, flags);
5209	spin_lock(&iommu->lock);
5210
5211	ret = -EINVAL;
5212	info = sdev->dev->archdata.iommu;
5213	if (!info || !info->pasid_supported)
5214		goto out;
5215
5216	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5217	if (WARN_ON(!context))
5218		goto out;
5219
5220	ctx_lo = context[0].lo;
5221
5222	sdev->did = domain->iommu_did[iommu->seq_id];
5223	sdev->sid = PCI_DEVID(info->bus, info->devfn);
5224
5225	if (!(ctx_lo & CONTEXT_PASIDE)) {
5226		if (iommu->pasid_state_table)
5227			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5228		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5229			intel_iommu_get_pts(iommu);
5230
5231		wmb();
5232		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5233		 * extended to permit requests-with-PASID if the PASIDE bit
5234		 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
5235		 * however, the PASIDE bit is ignored and requests-with-PASID
5236		 * are unconditionally blocked, which makes less sense.
5237		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5238		 * "guest mode" translation types depending on whether ATS
5239		 * is available or not. Annoyingly, we can't use the new
5240		 * modes *unless* PASIDE is set. */
5241		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5242			ctx_lo &= ~CONTEXT_TT_MASK;
5243			if (info->ats_supported)
5244				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5245			else
5246				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5247		}
5248		ctx_lo |= CONTEXT_PASIDE;
5249		if (iommu->pasid_state_table)
5250			ctx_lo |= CONTEXT_DINVE;
5251		if (info->pri_supported)
5252			ctx_lo |= CONTEXT_PRS;
5253		context[0].lo = ctx_lo;
5254		wmb();
5255		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5256					   DMA_CCMD_MASK_NOBIT,
5257					   DMA_CCMD_DEVICE_INVL);
5258	}
5259
5260	/* Enable PASID support in the device, if it wasn't already */
5261	if (!info->pasid_enabled)
5262		iommu_enable_dev_iotlb(info);
5263
5264	if (info->ats_enabled) {
5265		sdev->dev_iotlb = 1;
5266		sdev->qdep = info->ats_qdep;
5267		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5268			sdev->qdep = 0;
5269	}
5270	ret = 0;
5271
5272 out:
5273	spin_unlock(&iommu->lock);
5274	spin_unlock_irqrestore(&device_domain_lock, flags);
5275
5276	return ret;
5277}
5278
5279struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5280{
5281	struct intel_iommu *iommu;
5282	u8 bus, devfn;
5283
5284	if (iommu_dummy(dev)) {
5285		dev_warn(dev,
5286			 "No IOMMU translation for device; cannot enable SVM\n");
5287		return NULL;
5288	}
5289
5290	iommu = device_to_iommu(dev, &bus, &devfn);
5291	if (!iommu) {
5292		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5293		return NULL;
5294	}
5295
5296	if (!iommu->pasid_table) {
5297		dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5298		return NULL;
5299	}
5300
5301	return iommu;
5302}
5303#endif /* CONFIG_INTEL_IOMMU_SVM */
5304
5305const struct iommu_ops intel_iommu_ops = {
5306	.capable		= intel_iommu_capable,
5307	.domain_alloc		= intel_iommu_domain_alloc,
5308	.domain_free		= intel_iommu_domain_free,
5309	.attach_dev		= intel_iommu_attach_device,
5310	.detach_dev		= intel_iommu_detach_device,
5311	.map			= intel_iommu_map,
5312	.unmap			= intel_iommu_unmap,
5313	.map_sg			= default_iommu_map_sg,
5314	.iova_to_phys		= intel_iommu_iova_to_phys,
5315	.add_device		= intel_iommu_add_device,
5316	.remove_device		= intel_iommu_remove_device,
5317	.get_resv_regions	= intel_iommu_get_resv_regions,
5318	.put_resv_regions	= intel_iommu_put_resv_regions,
5319	.device_group		= pci_device_group,
5320	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
5321};
5322
5323static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5324{
5325	/* G4x/GM45 integrated gfx dmar support is totally busted. */
5326	pr_info("Disabling IOMMU for graphics on this chipset\n");
5327	dmar_map_gfx = 0;
5328}
5329
5330DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5331DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5332DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5333DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5334DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5335DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5336DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5337
5338static void quirk_iommu_rwbf(struct pci_dev *dev)
5339{
5340	/*
5341	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5342	 * but needs it. Same seems to hold for the desktop versions.
5343	 */
5344	pr_info("Forcing write-buffer flush capability\n");
5345	rwbf_quirk = 1;
5346}
5347
5348DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5349DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5350DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5351DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5352DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5353DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5354DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5355
5356#define GGC 0x52
5357#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5358#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5359#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5360#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5361#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5362#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5363#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5364#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5365
5366static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5367{
5368	unsigned short ggc;
5369
5370	if (pci_read_config_word(dev, GGC, &ggc))
5371		return;
5372
5373	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5374		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5375		dmar_map_gfx = 0;
5376	} else if (dmar_map_gfx) {
5377		/* we have to ensure the gfx device is idle before we flush */
5378		pr_info("Disabling batched IOTLB flush on Ironlake\n");
5379		intel_iommu_strict = 1;
5380	}
5381}
5382DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5383DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5384DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5385DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
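/*
 * Illustrative decode of the GGC word read above; the value is
 * hypothetical.  ggc = 0x0b30 has bits 11:8 equal to 0xb
 * (GGC_MEMORY_SIZE_4M_VT), so GGC_MEMORY_VT_ENABLED is set, shadow-GTT
 * memory exists and graphics stays behind the IOMMU with strict IOTLB
 * flushing; a value such as 0x0130 (GGC_MEMORY_SIZE_1M, bit 11 clear)
 * would instead disable the IOMMU for graphics.
 */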
5386
5387/* On Tylersburg chipsets, some BIOSes have been known to enable the
5388   ISOCH DMAR unit for the Azalia sound device, but not give it any
5389   TLB entries, which causes it to deadlock. Check for that.  We do
5390   this in a function called from init_dmars(), instead of in a PCI
5391   quirk, because we don't want to print the obnoxious "BIOS broken"
5392   message if VT-d is actually disabled.
5393*/
5394static void __init check_tylersburg_isoch(void)
5395{
5396	struct pci_dev *pdev;
5397	uint32_t vtisochctrl;
5398
5399	/* If there's no Azalia in the system anyway, forget it. */
5400	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5401	if (!pdev)
5402		return;
5403	pci_dev_put(pdev);
5404
5405	/* System Management Registers. Might be hidden, in which case
5406	   we can't do the sanity check. But that's OK, because the
5407	   known-broken BIOSes _don't_ actually hide it, so far. */
5408	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5409	if (!pdev)
5410		return;
5411
5412	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5413		pci_dev_put(pdev);
5414		return;
5415	}
5416
5417	pci_dev_put(pdev);
5418
5419	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5420	if (vtisochctrl & 1)
5421		return;
5422
5423	/* Drop all bits other than the number of TLB entries */
5424	vtisochctrl &= 0x1c;
5425
5426	/* If we have the recommended number of TLB entries (16), fine. */
5427	if (vtisochctrl == 0x10)
5428		return;
5429
5430	/* Zero TLB entries? You get to ride the short bus to school. */
5431	if (!vtisochctrl) {
5432		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5433		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5434		     dmi_get_system_info(DMI_BIOS_VENDOR),
5435		     dmi_get_system_info(DMI_BIOS_VERSION),
5436		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5437		iommu_identity_mapping |= IDENTMAP_AZALIA;
5438		return;
5439	}
5440
5441	pr_warn("The recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5442	       vtisochctrl);
5443}
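/*
 * Worked example of the checks above; the register values are
 * hypothetical.  vtisochctrl = 0x11 has bit 0 set, so Azalia DMA uses the
 * non-isoch DMAR unit and the quirk does nothing.  vtisochctrl = 0x10
 * (bit 0 clear, (0x10 & 0x1c) == 0x10) means the isoch unit has the
 * recommended 16 TLB entries.  vtisochctrl = 0x00 is the broken case that
 * warns and forces an identity map for the Azalia device.
 */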