   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/init.h>
  17#include <linux/bitmap.h>
  18#include <linux/debugfs.h>
  19#include <linux/export.h>
  20#include <linux/slab.h>
  21#include <linux/irq.h>
  22#include <linux/interrupt.h>
  23#include <linux/spinlock.h>
  24#include <linux/pci.h>
  25#include <linux/dmar.h>
  26#include <linux/dma-mapping.h>
  27#include <linux/mempool.h>
  28#include <linux/memory.h>
  29#include <linux/cpu.h>
  30#include <linux/timer.h>
  31#include <linux/io.h>
  32#include <linux/iova.h>
  33#include <linux/iommu.h>
  34#include <linux/intel-iommu.h>
  35#include <linux/syscore_ops.h>
  36#include <linux/tboot.h>
  37#include <linux/dmi.h>
  38#include <linux/pci-ats.h>
  39#include <linux/memblock.h>
  40#include <linux/dma-contiguous.h>
  41#include <linux/dma-direct.h>
  42#include <linux/crash_dump.h>
  43#include <linux/numa.h>
  44#include <linux/swiotlb.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48#include <trace/events/intel_iommu.h>
  49
  50#include "../irq_remapping.h"
  51#include "pasid.h"
  52
  53#define ROOT_SIZE		VTD_PAGE_SIZE
  54#define CONTEXT_SIZE		VTD_PAGE_SIZE
  55
  56#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  57#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  58#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  59#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  60
  61#define IOAPIC_RANGE_START	(0xfee00000)
  62#define IOAPIC_RANGE_END	(0xfeefffff)
  63#define IOVA_START_ADDR		(0x1000)
  64
  65#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  66
  67#define MAX_AGAW_WIDTH 64
  68#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  69
  70#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  71#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  72
  73/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  74   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  75#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
  76				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  77#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  78
  79/* IO virtual address start page frame number */
  80#define IOVA_START_PFN		(1)
  81
  82#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
  83
  84/* page table handling */
  85#define LEVEL_STRIDE		(9)
  86#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
  87
   88/*
   89 * This bitmap is used to advertise the page sizes our hardware supports
   90 * to the IOMMU core, which will then use this information to split
   91 * physically contiguous memory regions it is mapping into page sizes
   92 * that we support.
   93 *
   94 * Traditionally the IOMMU core just handed us the mappings directly,
   95 * after making sure the size is a power-of-two multiple of a 4KiB page
   96 * and that the mapping has natural alignment.
   97 *
   98 * To retain this behavior, we currently advertise that we support
   99 * all page sizes that are a power-of-two multiple of 4KiB.
  100 *
  101 * If at some point we'd like to utilize the IOMMU core's new behavior,
  102 * we could change this to advertise the real page sizes we support.
  103 */
 104#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
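     /*
      * Editor's note (illustrative, not from the original source): with
      * ~0xFFFUL every bit from bit 12 upwards is set, so the advertised
      * sizes are 4KiB, 8KiB, 16KiB, ... i.e. every power-of-two multiple
      * of 4KiB, matching the behaviour described in the comment above.
      */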
 105
 106static inline int agaw_to_level(int agaw)
 107{
 108	return agaw + 2;
 109}
 110
 111static inline int agaw_to_width(int agaw)
 112{
 113	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 114}
 115
 116static inline int width_to_agaw(int width)
 117{
 118	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 119}
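     /*
      * Worked example (editor's illustration): a 4-level table corresponds
      * to agaw 2, since agaw_to_level(2) == 4 and agaw_to_width(2) ==
      * 30 + 2 * 9 == 48 bits; going the other way, width_to_agaw(48) ==
      * DIV_ROUND_UP(48 - 30, 9) == 2.
      */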
 120
 121static inline unsigned int level_to_offset_bits(int level)
 122{
 123	return (level - 1) * LEVEL_STRIDE;
 124}
 125
 126static inline int pfn_level_offset(u64 pfn, int level)
 127{
 128	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 129}
 130
 131static inline u64 level_mask(int level)
 132{
 133	return -1ULL << level_to_offset_bits(level);
 134}
 135
 136static inline u64 level_size(int level)
 137{
 138	return 1ULL << level_to_offset_bits(level);
 139}
 140
 141static inline u64 align_to_level(u64 pfn, int level)
 142{
 143	return (pfn + level_size(level) - 1) & level_mask(level);
 144}
 145
 146static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 147{
 148	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 149}
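     /*
      * Worked example (editor's illustration): for level 2,
      * level_to_offset_bits(2) == 9, so pfn_level_offset(pfn, 2) ==
      * (pfn >> 9) & 0x1ff selects the 9-bit index into the level-2 table,
      * while level_size(2) == 512 and lvl_to_nr_pages(2) == 512 pages.
      */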
 150
 151/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 152   are never going to work. */
 153static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 154{
 155	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 156}
 157
 158static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 159{
 160	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 161}
 162static inline unsigned long page_to_dma_pfn(struct page *pg)
 163{
 164	return mm_to_dma_pfn(page_to_pfn(pg));
 165}
 166static inline unsigned long virt_to_dma_pfn(void *p)
 167{
 168	return page_to_dma_pfn(virt_to_page(p));
 169}
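     /*
      * Editor's note (illustrative): on x86 with 4KiB MM pages,
      * PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so dma_to_mm_pfn() and
      * mm_to_dma_pfn() are identity conversions; the shifts only matter
      * when the MM page size is larger than the 4KiB VT-d page size.
      */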
 170
 171/* global iommu list, set NULL for ignored DMAR units */
 172static struct intel_iommu **g_iommus;
 173
 174static void __init check_tylersburg_isoch(void);
 175static int rwbf_quirk;
 176
 177/*
  178 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
  179 * (used when the kernel is launched with TXT).
 180 */
 181static int force_on = 0;
 182int intel_iommu_tboot_noforce;
 183static int no_platform_optin;
 184
 185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 186
 187/*
 188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 189 * if marked present.
 190 */
 191static phys_addr_t root_entry_lctp(struct root_entry *re)
 192{
 193	if (!(re->lo & 1))
 194		return 0;
 195
 196	return re->lo & VTD_PAGE_MASK;
 197}
 198
 199/*
 200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 201 * if marked present.
 202 */
 203static phys_addr_t root_entry_uctp(struct root_entry *re)
 204{
 205	if (!(re->hi & 1))
 206		return 0;
 207
 208	return re->hi & VTD_PAGE_MASK;
 209}
 210
 211static inline void context_clear_pasid_enable(struct context_entry *context)
 212{
 213	context->lo &= ~(1ULL << 11);
 214}
 215
 216static inline bool context_pasid_enabled(struct context_entry *context)
 217{
 218	return !!(context->lo & (1ULL << 11));
 219}
 220
 221static inline void context_set_copied(struct context_entry *context)
 222{
 223	context->hi |= (1ull << 3);
 224}
 225
 226static inline bool context_copied(struct context_entry *context)
 227{
 228	return !!(context->hi & (1ULL << 3));
 229}
 230
 231static inline bool __context_present(struct context_entry *context)
 232{
 233	return (context->lo & 1);
 234}
 235
 236bool context_present(struct context_entry *context)
 237{
 238	return context_pasid_enabled(context) ?
 239	     __context_present(context) :
 240	     __context_present(context) && !context_copied(context);
 241}
 242
 243static inline void context_set_present(struct context_entry *context)
 244{
 245	context->lo |= 1;
 246}
 247
 248static inline void context_set_fault_enable(struct context_entry *context)
 249{
 250	context->lo &= (((u64)-1) << 2) | 1;
 251}
 252
 253static inline void context_set_translation_type(struct context_entry *context,
 254						unsigned long value)
 255{
 256	context->lo &= (((u64)-1) << 4) | 3;
 257	context->lo |= (value & 3) << 2;
 258}
 259
 260static inline void context_set_address_root(struct context_entry *context,
 261					    unsigned long value)
 262{
 263	context->lo &= ~VTD_PAGE_MASK;
 264	context->lo |= value & VTD_PAGE_MASK;
 265}
 266
 267static inline void context_set_address_width(struct context_entry *context,
 268					     unsigned long value)
 269{
 270	context->hi |= value & 7;
 271}
 272
 273static inline void context_set_domain_id(struct context_entry *context,
 274					 unsigned long value)
 275{
 276	context->hi |= (value & ((1 << 16) - 1)) << 8;
 277}
 278
 279static inline int context_domain_id(struct context_entry *c)
 280{
 281	return((c->hi >> 8) & 0xffff);
 282}
 283
 284static inline void context_clear_entry(struct context_entry *context)
 285{
 286	context->lo = 0;
 287	context->hi = 0;
 288}
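     /*
      * Editor's summary of the context-entry layout implied by the helpers
      * above: lo bit 0 is the present bit, lo bits 2:3 the translation
      * type, lo bits 12:63 the address root; hi bits 0:2 the address width
      * and hi bits 8:23 the 16-bit domain id. Bit 11 of lo and bit 3 of hi
      * are used by this driver as its "PASID enabled" and "copied" flags.
      */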
 289
 290/*
  291 * This domain is a static identity mapping domain.
  292 *	1. This domain creates a static 1:1 mapping to all usable memory.
  293 *	2. It maps to each iommu if successful.
  294 *	3. Each iommu maps to this domain if successful.
 295 */
 296static struct dmar_domain *si_domain;
 297static int hw_pass_through = 1;
 298
 299#define for_each_domain_iommu(idx, domain)			\
 300	for (idx = 0; idx < g_num_of_iommus; idx++)		\
 301		if (domain->iommu_refcnt[idx])
 302
 303struct dmar_rmrr_unit {
 304	struct list_head list;		/* list of rmrr units	*/
 305	struct acpi_dmar_header *hdr;	/* ACPI header		*/
 306	u64	base_address;		/* reserved base address*/
 307	u64	end_address;		/* reserved end address */
 308	struct dmar_dev_scope *devices;	/* target devices */
 309	int	devices_cnt;		/* target device count */
 310};
 311
 312struct dmar_atsr_unit {
 313	struct list_head list;		/* list of ATSR units */
 314	struct acpi_dmar_header *hdr;	/* ACPI header */
 315	struct dmar_dev_scope *devices;	/* target devices */
 316	int devices_cnt;		/* target device count */
 317	u8 include_all:1;		/* include all ports */
 318};
 319
 320static LIST_HEAD(dmar_atsr_units);
 321static LIST_HEAD(dmar_rmrr_units);
 322
 323#define for_each_rmrr_units(rmrr) \
 324	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 325
  326/* number of IOMMUs; used to size and index g_iommus */
 327static int g_num_of_iommus;
 328
 329static void domain_exit(struct dmar_domain *domain);
 330static void domain_remove_dev_info(struct dmar_domain *domain);
 331static void dmar_remove_one_dev_info(struct device *dev);
 332static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 333static int intel_iommu_attach_device(struct iommu_domain *domain,
 334				     struct device *dev);
 335static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 336					    dma_addr_t iova);
 337
 338#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 339int dmar_disabled = 0;
 340#else
 341int dmar_disabled = 1;
 342#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
 343
 344#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
 345int intel_iommu_sm = 1;
 346#else
 347int intel_iommu_sm;
 348#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
 349
 350int intel_iommu_enabled = 0;
 351EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 352
 353static int dmar_map_gfx = 1;
 354static int dmar_forcedac;
 355static int intel_iommu_strict;
 356static int intel_iommu_superpage = 1;
 357static int iommu_identity_mapping;
 358static int intel_no_bounce;
 359static int iommu_skip_te_disable;
 360
 361#define IDENTMAP_GFX		2
 362#define IDENTMAP_AZALIA		4
 363
 364int intel_iommu_gfx_mapped;
 365EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 366
 367#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
 368struct device_domain_info *get_domain_info(struct device *dev)
 369{
 370	struct device_domain_info *info;
 371
 372	if (!dev)
 373		return NULL;
 374
 375	info = dev_iommu_priv_get(dev);
 376	if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
 377		return NULL;
 378
 379	return info;
 380}
 381
 382DEFINE_SPINLOCK(device_domain_lock);
 383static LIST_HEAD(device_domain_list);
 384
 385#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&	\
 386				to_pci_dev(d)->untrusted)
 387
 388/*
 389 * Iterate over elements in device_domain_list and call the specified
 390 * callback @fn against each element.
 391 */
 392int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 393				     void *data), void *data)
 394{
 395	int ret = 0;
 396	unsigned long flags;
 397	struct device_domain_info *info;
 398
 399	spin_lock_irqsave(&device_domain_lock, flags);
 400	list_for_each_entry(info, &device_domain_list, global) {
 401		ret = fn(info, data);
 402		if (ret) {
 403			spin_unlock_irqrestore(&device_domain_lock, flags);
 404			return ret;
 405		}
 406	}
 407	spin_unlock_irqrestore(&device_domain_lock, flags);
 408
 409	return 0;
 410}
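     /*
      * Usage sketch (hypothetical callback, editor's illustration only);
      * a non-zero return value from the callback stops the walk and is
      * propagated back to the caller:
      *
      *	static int count_dev_domain_info(struct device_domain_info *info,
      *					 void *data)
      *	{
      *		(*(int *)data)++;
      *		return 0;
      *	}
      *
      *	int count = 0;
      *	for_each_device_domain(count_dev_domain_info, &count);
      */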
 411
 412const struct iommu_ops intel_iommu_ops;
 413
 414static bool translation_pre_enabled(struct intel_iommu *iommu)
 415{
 416	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 417}
 418
 419static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 420{
 421	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 422}
 423
 424static void init_translation_status(struct intel_iommu *iommu)
 425{
 426	u32 gsts;
 427
 428	gsts = readl(iommu->reg + DMAR_GSTS_REG);
 429	if (gsts & DMA_GSTS_TES)
 430		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 431}
 432
 433static int __init intel_iommu_setup(char *str)
 434{
 435	if (!str)
 436		return -EINVAL;
 437	while (*str) {
 438		if (!strncmp(str, "on", 2)) {
 439			dmar_disabled = 0;
 440			pr_info("IOMMU enabled\n");
 441		} else if (!strncmp(str, "off", 3)) {
 442			dmar_disabled = 1;
 443			no_platform_optin = 1;
 444			pr_info("IOMMU disabled\n");
 445		} else if (!strncmp(str, "igfx_off", 8)) {
 446			dmar_map_gfx = 0;
 447			pr_info("Disable GFX device mapping\n");
 448		} else if (!strncmp(str, "forcedac", 8)) {
 449			pr_info("Forcing DAC for PCI devices\n");
 450			dmar_forcedac = 1;
 451		} else if (!strncmp(str, "strict", 6)) {
 452			pr_info("Disable batched IOTLB flush\n");
 453			intel_iommu_strict = 1;
 454		} else if (!strncmp(str, "sp_off", 6)) {
 455			pr_info("Disable supported super page\n");
 456			intel_iommu_superpage = 0;
 457		} else if (!strncmp(str, "sm_on", 5)) {
 458			pr_info("Intel-IOMMU: scalable mode supported\n");
 459			intel_iommu_sm = 1;
 460		} else if (!strncmp(str, "tboot_noforce", 13)) {
 461			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 462			intel_iommu_tboot_noforce = 1;
 463		} else if (!strncmp(str, "nobounce", 8)) {
 464			pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
 465			intel_no_bounce = 1;
 466		}
 467
 468		str += strcspn(str, ",");
 469		while (*str == ',')
 470			str++;
 471	}
 472	return 0;
 473}
 474__setup("intel_iommu=", intel_iommu_setup);
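     /*
      * Example (editor's illustration): the options above combine as a
      * comma-separated list on the kernel command line, e.g.
      * "intel_iommu=on,sm_on,strict" enables the IOMMU, scalable mode and
      * strict (unbatched) IOTLB flushing.
      */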
 475
 476static struct kmem_cache *iommu_domain_cache;
 477static struct kmem_cache *iommu_devinfo_cache;
 478
 479static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 480{
 481	struct dmar_domain **domains;
 482	int idx = did >> 8;
 483
 484	domains = iommu->domains[idx];
 485	if (!domains)
 486		return NULL;
 487
 488	return domains[did & 0xff];
 489}
 490
 491static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 492			     struct dmar_domain *domain)
 493{
 494	struct dmar_domain **domains;
 495	int idx = did >> 8;
 496
 497	if (!iommu->domains[idx]) {
 498		size_t size = 256 * sizeof(struct dmar_domain *);
 499		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 500	}
 501
 502	domains = iommu->domains[idx];
 503	if (WARN_ON(!domains))
 504		return;
 505	else
 506		domains[did & 0xff] = domain;
 507}
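     /*
      * Editor's note (illustrative): domain ids are looked up through a
      * two-level array of 256-entry pages, so e.g. did 0x1234 lives at
      * iommu->domains[0x12][0x34]; set_iommu_domain() allocates the
      * second-level page on first use.
      */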
 508
 509void *alloc_pgtable_page(int node)
 510{
 511	struct page *page;
 512	void *vaddr = NULL;
 513
 514	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 515	if (page)
 516		vaddr = page_address(page);
 517	return vaddr;
 518}
 519
 520void free_pgtable_page(void *vaddr)
 521{
 522	free_page((unsigned long)vaddr);
 523}
 524
 525static inline void *alloc_domain_mem(void)
 526{
 527	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 528}
 529
 530static void free_domain_mem(void *vaddr)
 531{
 532	kmem_cache_free(iommu_domain_cache, vaddr);
 533}
 534
 535static inline void * alloc_devinfo_mem(void)
 536{
 537	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 538}
 539
 540static inline void free_devinfo_mem(void *vaddr)
 541{
 542	kmem_cache_free(iommu_devinfo_cache, vaddr);
 543}
 544
 545static inline int domain_type_is_si(struct dmar_domain *domain)
 546{
 547	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 548}
 549
 550static inline bool domain_use_first_level(struct dmar_domain *domain)
 551{
 552	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
 553}
 554
 555static inline int domain_pfn_supported(struct dmar_domain *domain,
 556				       unsigned long pfn)
 557{
 558	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 559
 560	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 561}
 562
 563static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 564{
 565	unsigned long sagaw;
 566	int agaw = -1;
 567
 568	sagaw = cap_sagaw(iommu->cap);
 569	for (agaw = width_to_agaw(max_gaw);
 570	     agaw >= 0; agaw--) {
 571		if (test_bit(agaw, &sagaw))
 572			break;
 573	}
 574
 575	return agaw;
 576}
 577
 578/*
 579 * Calculate max SAGAW for each iommu.
 580 */
 581int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 582{
 583	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 584}
 585
 586/*
  587 * Calculate the agaw for each iommu.
  588 * "SAGAW" may differ across iommus, so use a default agaw and fall back
  589 * to a smaller supported agaw for iommus that don't support the default.
 590 */
 591int iommu_calculate_agaw(struct intel_iommu *iommu)
 592{
 593	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 594}
 595
  596/* This function only returns a single iommu in a domain */
 597struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 598{
 599	int iommu_id;
 600
 601	/* si_domain and vm domain should not get here. */
 602	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
 603		return NULL;
 604
 605	for_each_domain_iommu(iommu_id, domain)
 606		break;
 607
 608	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 609		return NULL;
 610
 611	return g_iommus[iommu_id];
 612}
 613
 614static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 615{
 616	return sm_supported(iommu) ?
 617			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 618}
 619
 620static void domain_update_iommu_coherency(struct dmar_domain *domain)
 621{
 622	struct dmar_drhd_unit *drhd;
 623	struct intel_iommu *iommu;
 624	bool found = false;
 625	int i;
 626
 627	domain->iommu_coherency = 1;
 628
 629	for_each_domain_iommu(i, domain) {
 630		found = true;
 631		if (!iommu_paging_structure_coherency(g_iommus[i])) {
 632			domain->iommu_coherency = 0;
 633			break;
 634		}
 635	}
 636	if (found)
 637		return;
 638
 639	/* No hardware attached; use lowest common denominator */
 640	rcu_read_lock();
 641	for_each_active_iommu(iommu, drhd) {
 642		if (!iommu_paging_structure_coherency(iommu)) {
 643			domain->iommu_coherency = 0;
 644			break;
 645		}
 646	}
 647	rcu_read_unlock();
 648}
 649
 650static int domain_update_iommu_snooping(struct intel_iommu *skip)
 651{
 652	struct dmar_drhd_unit *drhd;
 653	struct intel_iommu *iommu;
 654	int ret = 1;
 655
 656	rcu_read_lock();
 657	for_each_active_iommu(iommu, drhd) {
 658		if (iommu != skip) {
 659			if (!ecap_sc_support(iommu->ecap)) {
 660				ret = 0;
 661				break;
 662			}
 663		}
 664	}
 665	rcu_read_unlock();
 666
 667	return ret;
 668}
 669
 670static int domain_update_iommu_superpage(struct dmar_domain *domain,
 671					 struct intel_iommu *skip)
 672{
 673	struct dmar_drhd_unit *drhd;
 674	struct intel_iommu *iommu;
 675	int mask = 0x3;
 676
 677	if (!intel_iommu_superpage) {
 678		return 0;
 679	}
 680
 681	/* set iommu_superpage to the smallest common denominator */
 682	rcu_read_lock();
 683	for_each_active_iommu(iommu, drhd) {
 684		if (iommu != skip) {
 685			if (domain && domain_use_first_level(domain)) {
 686				if (!cap_fl1gp_support(iommu->cap))
 687					mask = 0x1;
 688			} else {
 689				mask &= cap_super_page_val(iommu->cap);
 690			}
 691
 692			if (!mask)
 693				break;
 694		}
 695	}
 696	rcu_read_unlock();
 697
 698	return fls(mask);
 699}
 700
 701/* Some capabilities may be different across iommus */
 702static void domain_update_iommu_cap(struct dmar_domain *domain)
 703{
 704	domain_update_iommu_coherency(domain);
 705	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 706	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 707}
 708
 709struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 710					 u8 devfn, int alloc)
 711{
 712	struct root_entry *root = &iommu->root_entry[bus];
 713	struct context_entry *context;
 714	u64 *entry;
 715
 716	entry = &root->lo;
 717	if (sm_supported(iommu)) {
 718		if (devfn >= 0x80) {
 719			devfn -= 0x80;
 720			entry = &root->hi;
 721		}
 722		devfn *= 2;
 723	}
 724	if (*entry & 1)
 725		context = phys_to_virt(*entry & VTD_PAGE_MASK);
 726	else {
 727		unsigned long phy_addr;
 728		if (!alloc)
 729			return NULL;
 730
 731		context = alloc_pgtable_page(iommu->node);
 732		if (!context)
 733			return NULL;
 734
 735		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 736		phy_addr = virt_to_phys((void *)context);
 737		*entry = phy_addr | 1;
 738		__iommu_flush_cache(iommu, entry, sizeof(*entry));
 739	}
 740	return &context[devfn];
 741}
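     /*
      * Editor's note (illustrative): in scalable mode the root entry's lo
      * half covers devfn 0x00-0x7f and the hi half covers devfn 0x80-0xff,
      * and the remaining devfn is doubled because a scalable-mode context
      * entry is twice the size of a legacy one; e.g. devfn 0x83 ends up at
      * index 6 of the table pointed to by root->hi.
      */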
 742
 743static bool attach_deferred(struct device *dev)
 744{
 745	return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
 746}
 747
 748/**
 749 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 750 *				 sub-hierarchy of a candidate PCI-PCI bridge
 751 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 752 * @bridge: the candidate PCI-PCI bridge
 753 *
 754 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 755 */
 756static bool
 757is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 758{
 759	struct pci_dev *pdev, *pbridge;
 760
 761	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 762		return false;
 763
 764	pdev = to_pci_dev(dev);
 765	pbridge = to_pci_dev(bridge);
 766
 767	if (pbridge->subordinate &&
 768	    pbridge->subordinate->number <= pdev->bus->number &&
 769	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
 770		return true;
 771
 772	return false;
 773}
 774
 775static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 776{
 777	struct dmar_drhd_unit *drhd;
 778	u32 vtbar;
 779	int rc;
 780
 781	/* We know that this device on this chipset has its own IOMMU.
 782	 * If we find it under a different IOMMU, then the BIOS is lying
 783	 * to us. Hope that the IOMMU for this device is actually
 784	 * disabled, and it needs no translation...
 785	 */
 786	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 787	if (rc) {
 788		/* "can't" happen */
 789		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 790		return false;
 791	}
 792	vtbar &= 0xffff0000;
 793
  794	/* we know that this iommu should be at offset 0xa000 from vtbar */
 795	drhd = dmar_find_matched_drhd_unit(pdev);
 796	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 797		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 798		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 799		return true;
 800	}
 801
 802	return false;
 803}
 804
 805static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 806{
 807	if (!iommu || iommu->drhd->ignored)
 808		return true;
 809
 810	if (dev_is_pci(dev)) {
 811		struct pci_dev *pdev = to_pci_dev(dev);
 812
 813		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 814		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 815		    quirk_ioat_snb_local_iommu(pdev))
 816			return true;
 817	}
 818
 819	return false;
 820}
 821
 822struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 823{
 824	struct dmar_drhd_unit *drhd = NULL;
 825	struct pci_dev *pdev = NULL;
 826	struct intel_iommu *iommu;
 827	struct device *tmp;
 828	u16 segment = 0;
 829	int i;
 830
 831	if (!dev)
 832		return NULL;
 833
 834	if (dev_is_pci(dev)) {
 835		struct pci_dev *pf_pdev;
 836
 837		pdev = pci_real_dma_dev(to_pci_dev(dev));
 838
 839		/* VFs aren't listed in scope tables; we need to look up
 840		 * the PF instead to find the IOMMU. */
 841		pf_pdev = pci_physfn(pdev);
 842		dev = &pf_pdev->dev;
 843		segment = pci_domain_nr(pdev->bus);
 844	} else if (has_acpi_companion(dev))
 845		dev = &ACPI_COMPANION(dev)->dev;
 846
 847	rcu_read_lock();
 848	for_each_iommu(iommu, drhd) {
 849		if (pdev && segment != drhd->segment)
 850			continue;
 851
 852		for_each_active_dev_scope(drhd->devices,
 853					  drhd->devices_cnt, i, tmp) {
 854			if (tmp == dev) {
 855				/* For a VF use its original BDF# not that of the PF
 856				 * which we used for the IOMMU lookup. Strictly speaking
 857				 * we could do this for all PCI devices; we only need to
 858				 * get the BDF# from the scope table for ACPI matches. */
 859				if (pdev && pdev->is_virtfn)
 860					goto got_pdev;
 861
 862				if (bus && devfn) {
 863					*bus = drhd->devices[i].bus;
 864					*devfn = drhd->devices[i].devfn;
 865				}
 866				goto out;
 867			}
 868
 869			if (is_downstream_to_pci_bridge(dev, tmp))
 870				goto got_pdev;
 871		}
 872
 873		if (pdev && drhd->include_all) {
 874		got_pdev:
 875			if (bus && devfn) {
 876				*bus = pdev->bus->number;
 877				*devfn = pdev->devfn;
 878			}
 879			goto out;
 880		}
 881	}
 882	iommu = NULL;
 883 out:
 884	if (iommu_is_dummy(iommu, dev))
 885		iommu = NULL;
 886
 887	rcu_read_unlock();
 888
 889	return iommu;
 890}
 891
 892static void domain_flush_cache(struct dmar_domain *domain,
 893			       void *addr, int size)
 894{
 895	if (!domain->iommu_coherency)
 896		clflush_cache_range(addr, size);
 897}
 898
 899static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 900{
 901	struct context_entry *context;
 902	int ret = 0;
 903	unsigned long flags;
 904
 905	spin_lock_irqsave(&iommu->lock, flags);
 906	context = iommu_context_addr(iommu, bus, devfn, 0);
 907	if (context)
 908		ret = context_present(context);
 909	spin_unlock_irqrestore(&iommu->lock, flags);
 910	return ret;
 911}
 912
 913static void free_context_table(struct intel_iommu *iommu)
 914{
 915	int i;
 916	unsigned long flags;
 917	struct context_entry *context;
 918
 919	spin_lock_irqsave(&iommu->lock, flags);
 920	if (!iommu->root_entry) {
 921		goto out;
 922	}
 923	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 924		context = iommu_context_addr(iommu, i, 0, 0);
 925		if (context)
 926			free_pgtable_page(context);
 927
 928		if (!sm_supported(iommu))
 929			continue;
 930
 931		context = iommu_context_addr(iommu, i, 0x80, 0);
 932		if (context)
 933			free_pgtable_page(context);
 934
 935	}
 936	free_pgtable_page(iommu->root_entry);
 937	iommu->root_entry = NULL;
 938out:
 939	spin_unlock_irqrestore(&iommu->lock, flags);
 940}
 941
 942static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 943				      unsigned long pfn, int *target_level)
 944{
 945	struct dma_pte *parent, *pte;
 946	int level = agaw_to_level(domain->agaw);
 947	int offset;
 948
 949	BUG_ON(!domain->pgd);
 950
 951	if (!domain_pfn_supported(domain, pfn))
 952		/* Address beyond IOMMU's addressing capabilities. */
 953		return NULL;
 954
 955	parent = domain->pgd;
 956
 957	while (1) {
 958		void *tmp_page;
 959
 960		offset = pfn_level_offset(pfn, level);
 961		pte = &parent[offset];
 962		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 963			break;
 964		if (level == *target_level)
 965			break;
 966
 967		if (!dma_pte_present(pte)) {
 968			uint64_t pteval;
 969
 970			tmp_page = alloc_pgtable_page(domain->nid);
 971
 972			if (!tmp_page)
 973				return NULL;
 974
 975			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 976			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 977			if (domain_use_first_level(domain))
 978				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
 979			if (cmpxchg64(&pte->val, 0ULL, pteval))
 980				/* Someone else set it while we were thinking; use theirs. */
 981				free_pgtable_page(tmp_page);
 982			else
 983				domain_flush_cache(domain, pte, sizeof(*pte));
 984		}
 985		if (level == 1)
 986			break;
 987
 988		parent = phys_to_virt(dma_pte_addr(pte));
 989		level--;
 990	}
 991
 992	if (!*target_level)
 993		*target_level = level;
 994
 995	return pte;
 996}
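     /*
      * Editor's note on the *target_level contract above (illustrative):
      * callers pass the level at which they want a PTE (1 for 4KiB
      * mappings, 2 for 2MiB, ...), or 0 to mean "don't allocate, stop at
      * whatever leaf already exists"; in the latter case the level that
      * was actually reached is written back through *target_level.
      */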
 997
  998/* return the address's pte at a specific level */
 999static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1000					 unsigned long pfn,
1001					 int level, int *large_page)
1002{
1003	struct dma_pte *parent, *pte;
1004	int total = agaw_to_level(domain->agaw);
1005	int offset;
1006
1007	parent = domain->pgd;
1008	while (level <= total) {
1009		offset = pfn_level_offset(pfn, total);
1010		pte = &parent[offset];
1011		if (level == total)
1012			return pte;
1013
1014		if (!dma_pte_present(pte)) {
1015			*large_page = total;
1016			break;
1017		}
1018
1019		if (dma_pte_superpage(pte)) {
1020			*large_page = total;
1021			return pte;
1022		}
1023
1024		parent = phys_to_virt(dma_pte_addr(pte));
1025		total--;
1026	}
1027	return NULL;
1028}
1029
 1030/* clear last level ptes; a tlb flush should follow */
1031static void dma_pte_clear_range(struct dmar_domain *domain,
1032				unsigned long start_pfn,
1033				unsigned long last_pfn)
1034{
1035	unsigned int large_page;
1036	struct dma_pte *first_pte, *pte;
1037
1038	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1039	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1040	BUG_ON(start_pfn > last_pfn);
1041
1042	/* we don't need lock here; nobody else touches the iova range */
1043	do {
1044		large_page = 1;
1045		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1046		if (!pte) {
1047			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1048			continue;
1049		}
1050		do {
1051			dma_clear_pte(pte);
1052			start_pfn += lvl_to_nr_pages(large_page);
1053			pte++;
1054		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1055
1056		domain_flush_cache(domain, first_pte,
1057				   (void *)pte - (void *)first_pte);
1058
1059	} while (start_pfn && start_pfn <= last_pfn);
1060}
1061
1062static void dma_pte_free_level(struct dmar_domain *domain, int level,
1063			       int retain_level, struct dma_pte *pte,
1064			       unsigned long pfn, unsigned long start_pfn,
1065			       unsigned long last_pfn)
1066{
1067	pfn = max(start_pfn, pfn);
1068	pte = &pte[pfn_level_offset(pfn, level)];
1069
1070	do {
1071		unsigned long level_pfn;
1072		struct dma_pte *level_pte;
1073
1074		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1075			goto next;
1076
1077		level_pfn = pfn & level_mask(level);
1078		level_pte = phys_to_virt(dma_pte_addr(pte));
1079
1080		if (level > 2) {
1081			dma_pte_free_level(domain, level - 1, retain_level,
1082					   level_pte, level_pfn, start_pfn,
1083					   last_pfn);
1084		}
1085
1086		/*
1087		 * Free the page table if we're below the level we want to
1088		 * retain and the range covers the entire table.
1089		 */
1090		if (level < retain_level && !(start_pfn > level_pfn ||
1091		      last_pfn < level_pfn + level_size(level) - 1)) {
1092			dma_clear_pte(pte);
1093			domain_flush_cache(domain, pte, sizeof(*pte));
1094			free_pgtable_page(level_pte);
1095		}
1096next:
1097		pfn += level_size(level);
1098	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1099}
1100
1101/*
1102 * clear last level (leaf) ptes and free page table pages below the
1103 * level we wish to keep intact.
1104 */
1105static void dma_pte_free_pagetable(struct dmar_domain *domain,
1106				   unsigned long start_pfn,
1107				   unsigned long last_pfn,
1108				   int retain_level)
1109{
1110	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112	BUG_ON(start_pfn > last_pfn);
1113
1114	dma_pte_clear_range(domain, start_pfn, last_pfn);
1115
1116	/* We don't need lock here; nobody else touches the iova range */
1117	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1118			   domain->pgd, 0, start_pfn, last_pfn);
1119
1120	/* free pgd */
1121	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1122		free_pgtable_page(domain->pgd);
1123		domain->pgd = NULL;
1124	}
1125}
1126
1127/* When a page at a given level is being unlinked from its parent, we don't
1128   need to *modify* it at all. All we need to do is make a list of all the
1129   pages which can be freed just as soon as we've flushed the IOTLB and we
1130   know the hardware page-walk will no longer touch them.
1131   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1132   be freed. */
1133static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1134					    int level, struct dma_pte *pte,
1135					    struct page *freelist)
1136{
1137	struct page *pg;
1138
1139	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1140	pg->freelist = freelist;
1141	freelist = pg;
1142
1143	if (level == 1)
1144		return freelist;
1145
1146	pte = page_address(pg);
1147	do {
1148		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1149			freelist = dma_pte_list_pagetables(domain, level - 1,
1150							   pte, freelist);
1151		pte++;
1152	} while (!first_pte_in_page(pte));
1153
1154	return freelist;
1155}
1156
1157static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1158					struct dma_pte *pte, unsigned long pfn,
1159					unsigned long start_pfn,
1160					unsigned long last_pfn,
1161					struct page *freelist)
1162{
1163	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1164
1165	pfn = max(start_pfn, pfn);
1166	pte = &pte[pfn_level_offset(pfn, level)];
1167
1168	do {
1169		unsigned long level_pfn;
1170
1171		if (!dma_pte_present(pte))
1172			goto next;
1173
1174		level_pfn = pfn & level_mask(level);
1175
1176		/* If range covers entire pagetable, free it */
1177		if (start_pfn <= level_pfn &&
1178		    last_pfn >= level_pfn + level_size(level) - 1) {
 1179			/* These subordinate page tables are going away entirely. Don't
1180			   bother to clear them; we're just going to *free* them. */
1181			if (level > 1 && !dma_pte_superpage(pte))
1182				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1183
1184			dma_clear_pte(pte);
1185			if (!first_pte)
1186				first_pte = pte;
1187			last_pte = pte;
1188		} else if (level > 1) {
1189			/* Recurse down into a level that isn't *entirely* obsolete */
1190			freelist = dma_pte_clear_level(domain, level - 1,
1191						       phys_to_virt(dma_pte_addr(pte)),
1192						       level_pfn, start_pfn, last_pfn,
1193						       freelist);
1194		}
1195next:
1196		pfn += level_size(level);
1197	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1198
1199	if (first_pte)
1200		domain_flush_cache(domain, first_pte,
1201				   (void *)++last_pte - (void *)first_pte);
1202
1203	return freelist;
1204}
1205
1206/* We can't just free the pages because the IOMMU may still be walking
1207   the page tables, and may have cached the intermediate levels. The
1208   pages can only be freed after the IOTLB flush has been done. */
1209static struct page *domain_unmap(struct dmar_domain *domain,
1210				 unsigned long start_pfn,
1211				 unsigned long last_pfn)
1212{
1213	struct page *freelist;
1214
1215	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1216	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1217	BUG_ON(start_pfn > last_pfn);
1218
1219	/* we don't need lock here; nobody else touches the iova range */
1220	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1221				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1222
1223	/* free pgd */
1224	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1225		struct page *pgd_page = virt_to_page(domain->pgd);
1226		pgd_page->freelist = freelist;
1227		freelist = pgd_page;
1228
1229		domain->pgd = NULL;
1230	}
1231
1232	return freelist;
1233}
1234
1235static void dma_free_pagelist(struct page *freelist)
1236{
1237	struct page *pg;
1238
1239	while ((pg = freelist)) {
1240		freelist = pg->freelist;
1241		free_pgtable_page(page_address(pg));
1242	}
1243}
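     /*
      * Editor's note (illustrative): the usual lifecycle of the freelist
      * built above is domain_unmap() to detach the page-table pages, an
      * IOTLB flush so the hardware can no longer walk them, and only then
      * dma_free_pagelist() to return the pages, as the comment before
      * domain_unmap() describes.
      */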
1244
1245static void iova_entry_free(unsigned long data)
1246{
1247	struct page *freelist = (struct page *)data;
1248
1249	dma_free_pagelist(freelist);
1250}
1251
1252/* iommu handling */
1253static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1254{
1255	struct root_entry *root;
1256	unsigned long flags;
1257
1258	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1259	if (!root) {
1260		pr_err("Allocating root entry for %s failed\n",
1261			iommu->name);
1262		return -ENOMEM;
1263	}
1264
1265	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1266
1267	spin_lock_irqsave(&iommu->lock, flags);
1268	iommu->root_entry = root;
1269	spin_unlock_irqrestore(&iommu->lock, flags);
1270
1271	return 0;
1272}
1273
1274static void iommu_set_root_entry(struct intel_iommu *iommu)
1275{
1276	u64 addr;
1277	u32 sts;
1278	unsigned long flag;
1279
1280	addr = virt_to_phys(iommu->root_entry);
1281	if (sm_supported(iommu))
1282		addr |= DMA_RTADDR_SMT;
1283
1284	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1286
1287	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1288
 1289	/* Make sure hardware completes it */
1290	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1291		      readl, (sts & DMA_GSTS_RTPS), sts);
1292
1293	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1294}
1295
1296void iommu_flush_write_buffer(struct intel_iommu *iommu)
1297{
1298	u32 val;
1299	unsigned long flag;
1300
1301	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1302		return;
1303
1304	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1305	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1306
 1307	/* Make sure hardware completes it */
1308	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1309		      readl, (!(val & DMA_GSTS_WBFS)), val);
1310
1311	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1312}
1313
 1314/* the return value determines if we need a write buffer flush */
1315static void __iommu_flush_context(struct intel_iommu *iommu,
1316				  u16 did, u16 source_id, u8 function_mask,
1317				  u64 type)
1318{
1319	u64 val = 0;
1320	unsigned long flag;
1321
1322	switch (type) {
1323	case DMA_CCMD_GLOBAL_INVL:
1324		val = DMA_CCMD_GLOBAL_INVL;
1325		break;
1326	case DMA_CCMD_DOMAIN_INVL:
1327		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1328		break;
1329	case DMA_CCMD_DEVICE_INVL:
1330		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1331			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1332		break;
1333	default:
1334		BUG();
1335	}
1336	val |= DMA_CCMD_ICC;
1337
1338	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1340
 1341	/* Make sure hardware completes it */
1342	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1343		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1344
1345	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1346}
1347
 1348/* the return value determines if we need a write buffer flush */
1349static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1350				u64 addr, unsigned int size_order, u64 type)
1351{
1352	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1353	u64 val = 0, val_iva = 0;
1354	unsigned long flag;
1355
1356	switch (type) {
1357	case DMA_TLB_GLOBAL_FLUSH:
 1358		/* global flush doesn't need to set IVA_REG */
1359		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1360		break;
1361	case DMA_TLB_DSI_FLUSH:
1362		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1363		break;
1364	case DMA_TLB_PSI_FLUSH:
1365		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1366		/* IH bit is passed in as part of address */
1367		val_iva = size_order | addr;
1368		break;
1369	default:
1370		BUG();
1371	}
1372	/* Note: set drain read/write */
1373#if 0
1374	/*
 1375	 * This is probably only needed to be extra safe; it looks like we
 1376	 * can ignore it without any impact.
1377	 */
1378	if (cap_read_drain(iommu->cap))
1379		val |= DMA_TLB_READ_DRAIN;
1380#endif
1381	if (cap_write_drain(iommu->cap))
1382		val |= DMA_TLB_WRITE_DRAIN;
1383
1384	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385	/* Note: Only uses first TLB reg currently */
1386	if (val_iva)
1387		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1388	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1389
 1390	/* Make sure hardware completes it */
1391	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1392		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1393
1394	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1395
1396	/* check IOTLB invalidation granularity */
1397	if (DMA_TLB_IAIG(val) == 0)
1398		pr_err("Flush IOTLB failed\n");
1399	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1400		pr_debug("TLB flush request %Lx, actual %Lx\n",
1401			(unsigned long long)DMA_TLB_IIRG(type),
1402			(unsigned long long)DMA_TLB_IAIG(val));
1403}
1404
1405static struct device_domain_info *
 1406iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1407			 u8 bus, u8 devfn)
1408{
1409	struct device_domain_info *info;
1410
1411	assert_spin_locked(&device_domain_lock);
1412
1413	if (!iommu->qi)
1414		return NULL;
1415
1416	list_for_each_entry(info, &domain->devices, link)
1417		if (info->iommu == iommu && info->bus == bus &&
1418		    info->devfn == devfn) {
1419			if (info->ats_supported && info->dev)
1420				return info;
1421			break;
1422		}
1423
1424	return NULL;
1425}
1426
1427static void domain_update_iotlb(struct dmar_domain *domain)
1428{
1429	struct device_domain_info *info;
1430	bool has_iotlb_device = false;
1431
1432	assert_spin_locked(&device_domain_lock);
1433
1434	list_for_each_entry(info, &domain->devices, link) {
1435		struct pci_dev *pdev;
1436
1437		if (!info->dev || !dev_is_pci(info->dev))
1438			continue;
1439
1440		pdev = to_pci_dev(info->dev);
1441		if (pdev->ats_enabled) {
1442			has_iotlb_device = true;
1443			break;
1444		}
1445	}
1446
1447	domain->has_iotlb_device = has_iotlb_device;
1448}
1449
1450static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1451{
1452	struct pci_dev *pdev;
1453
1454	assert_spin_locked(&device_domain_lock);
1455
1456	if (!info || !dev_is_pci(info->dev))
1457		return;
1458
1459	pdev = to_pci_dev(info->dev);
 1460	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1461	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1462	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1463	 * reserved, which should be set to 0.
1464	 */
1465	if (!ecap_dit(info->iommu->ecap))
1466		info->pfsid = 0;
1467	else {
1468		struct pci_dev *pf_pdev;
1469
 1470		/* pdev will be returned if the device is not a VF */
1471		pf_pdev = pci_physfn(pdev);
1472		info->pfsid = pci_dev_id(pf_pdev);
1473	}
1474
1475#ifdef CONFIG_INTEL_IOMMU_SVM
1476	/* The PCIe spec, in its wisdom, declares that the behaviour of
1477	   the device if you enable PASID support after ATS support is
1478	   undefined. So always enable PASID support on devices which
1479	   have it, even if we can't yet know if we're ever going to
1480	   use it. */
1481	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1482		info->pasid_enabled = 1;
1483
1484	if (info->pri_supported &&
1485	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1486	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1487		info->pri_enabled = 1;
1488#endif
1489	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1490	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1491		info->ats_enabled = 1;
1492		domain_update_iotlb(info->domain);
1493		info->ats_qdep = pci_ats_queue_depth(pdev);
1494	}
1495}
1496
1497static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1498{
1499	struct pci_dev *pdev;
1500
1501	assert_spin_locked(&device_domain_lock);
1502
1503	if (!dev_is_pci(info->dev))
1504		return;
1505
1506	pdev = to_pci_dev(info->dev);
1507
1508	if (info->ats_enabled) {
1509		pci_disable_ats(pdev);
1510		info->ats_enabled = 0;
1511		domain_update_iotlb(info->domain);
1512	}
1513#ifdef CONFIG_INTEL_IOMMU_SVM
1514	if (info->pri_enabled) {
1515		pci_disable_pri(pdev);
1516		info->pri_enabled = 0;
1517	}
1518	if (info->pasid_enabled) {
1519		pci_disable_pasid(pdev);
1520		info->pasid_enabled = 0;
1521	}
1522#endif
1523}
1524
1525static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1526				  u64 addr, unsigned mask)
1527{
1528	u16 sid, qdep;
1529	unsigned long flags;
1530	struct device_domain_info *info;
1531
1532	if (!domain->has_iotlb_device)
1533		return;
1534
1535	spin_lock_irqsave(&device_domain_lock, flags);
1536	list_for_each_entry(info, &domain->devices, link) {
1537		if (!info->ats_enabled)
1538			continue;
1539
1540		sid = info->bus << 8 | info->devfn;
1541		qdep = info->ats_qdep;
1542		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1543				qdep, addr, mask);
1544	}
1545	spin_unlock_irqrestore(&device_domain_lock, flags);
1546}
1547
1548static void domain_flush_piotlb(struct intel_iommu *iommu,
1549				struct dmar_domain *domain,
1550				u64 addr, unsigned long npages, bool ih)
1551{
1552	u16 did = domain->iommu_did[iommu->seq_id];
1553
1554	if (domain->default_pasid)
1555		qi_flush_piotlb(iommu, did, domain->default_pasid,
1556				addr, npages, ih);
1557
1558	if (!list_empty(&domain->devices))
1559		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1560}
1561
1562static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1563				  struct dmar_domain *domain,
1564				  unsigned long pfn, unsigned int pages,
1565				  int ih, int map)
1566{
1567	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1568	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1569	u16 did = domain->iommu_did[iommu->seq_id];
1570
1571	BUG_ON(pages == 0);
1572
1573	if (ih)
1574		ih = 1 << 6;
1575
1576	if (domain_use_first_level(domain)) {
1577		domain_flush_piotlb(iommu, domain, addr, pages, ih);
1578	} else {
1579		/*
 1580		 * Fall back to domain-selective flush if there is no PSI support
 1581		 * or the size is too big. PSI requires the page size to be 2 ^ x
 1582		 * and the base address to be naturally aligned to the size.
1583		 */
1584		if (!cap_pgsel_inv(iommu->cap) ||
1585		    mask > cap_max_amask_val(iommu->cap))
1586			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1587							DMA_TLB_DSI_FLUSH);
1588		else
1589			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1590							DMA_TLB_PSI_FLUSH);
1591	}
1592
1593	/*
 1594	 * In caching mode, changing pages from non-present to present requires
 1595	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1596	 */
1597	if (!cap_caching_mode(iommu->cap) || !map)
1598		iommu_flush_dev_iotlb(domain, addr, mask);
1599}
1600
1601/* Notification for newly created mappings */
1602static inline void __mapping_notify_one(struct intel_iommu *iommu,
1603					struct dmar_domain *domain,
1604					unsigned long pfn, unsigned int pages)
1605{
1606	/*
 1607	 * It's a non-present to present mapping. Only flush if in caching mode
 1608	 * and using second-level translation.
1609	 */
1610	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1611		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1612	else
1613		iommu_flush_write_buffer(iommu);
1614}
1615
1616static void iommu_flush_iova(struct iova_domain *iovad)
1617{
1618	struct dmar_domain *domain;
1619	int idx;
1620
1621	domain = container_of(iovad, struct dmar_domain, iovad);
1622
1623	for_each_domain_iommu(idx, domain) {
1624		struct intel_iommu *iommu = g_iommus[idx];
1625		u16 did = domain->iommu_did[iommu->seq_id];
1626
1627		if (domain_use_first_level(domain))
1628			domain_flush_piotlb(iommu, domain, 0, -1, 0);
1629		else
1630			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1631						 DMA_TLB_DSI_FLUSH);
1632
1633		if (!cap_caching_mode(iommu->cap))
1634			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1635					      0, MAX_AGAW_PFN_WIDTH);
1636	}
1637}
1638
1639static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1640{
1641	u32 pmen;
1642	unsigned long flags;
1643
1644	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1645		return;
1646
1647	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1648	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1649	pmen &= ~DMA_PMEN_EPM;
1650	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1651
1652	/* wait for the protected region status bit to clear */
1653	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1654		readl, !(pmen & DMA_PMEN_PRS), pmen);
1655
1656	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1657}
1658
1659static void iommu_enable_translation(struct intel_iommu *iommu)
1660{
1661	u32 sts;
1662	unsigned long flags;
1663
1664	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1665	iommu->gcmd |= DMA_GCMD_TE;
1666	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1667
 1668	/* Make sure hardware completes it */
1669	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1670		      readl, (sts & DMA_GSTS_TES), sts);
1671
1672	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1673}
1674
1675static void iommu_disable_translation(struct intel_iommu *iommu)
1676{
1677	u32 sts;
1678	unsigned long flag;
1679
1680	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1681	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1682		return;
1683
1684	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685	iommu->gcmd &= ~DMA_GCMD_TE;
1686	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1687
 1688	/* Make sure hardware completes it */
1689	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690		      readl, (!(sts & DMA_GSTS_TES)), sts);
1691
1692	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1693}
1694
1695static int iommu_init_domains(struct intel_iommu *iommu)
1696{
1697	u32 ndomains, nlongs;
1698	size_t size;
1699
1700	ndomains = cap_ndoms(iommu->cap);
1701	pr_debug("%s: Number of Domains supported <%d>\n",
1702		 iommu->name, ndomains);
1703	nlongs = BITS_TO_LONGS(ndomains);
1704
1705	spin_lock_init(&iommu->lock);
1706
1707	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1708	if (!iommu->domain_ids) {
1709		pr_err("%s: Allocating domain id array failed\n",
1710		       iommu->name);
1711		return -ENOMEM;
1712	}
1713
1714	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1715	iommu->domains = kzalloc(size, GFP_KERNEL);
1716
1717	if (iommu->domains) {
1718		size = 256 * sizeof(struct dmar_domain *);
1719		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1720	}
1721
1722	if (!iommu->domains || !iommu->domains[0]) {
1723		pr_err("%s: Allocating domain array failed\n",
1724		       iommu->name);
1725		kfree(iommu->domain_ids);
1726		kfree(iommu->domains);
1727		iommu->domain_ids = NULL;
1728		iommu->domains    = NULL;
1729		return -ENOMEM;
1730	}
1731
1732	/*
1733	 * If Caching mode is set, then invalid translations are tagged
1734	 * with domain-id 0, hence we need to pre-allocate it. We also
1735	 * use domain-id 0 as a marker for non-allocated domain-id, so
1736	 * make sure it is not used for a real domain.
1737	 */
1738	set_bit(0, iommu->domain_ids);
1739
1740	/*
1741	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1742	 * entry for first-level or pass-through translation modes should
1743	 * be programmed with a domain id different from those used for
1744	 * second-level or nested translation. We reserve a domain id for
1745	 * this purpose.
1746	 */
1747	if (sm_supported(iommu))
1748		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1749
1750	return 0;
1751}
1752
1753static void disable_dmar_iommu(struct intel_iommu *iommu)
1754{
1755	struct device_domain_info *info, *tmp;
1756	unsigned long flags;
1757
1758	if (!iommu->domains || !iommu->domain_ids)
1759		return;
1760
1761	spin_lock_irqsave(&device_domain_lock, flags);
1762	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1763		if (info->iommu != iommu)
1764			continue;
1765
1766		if (!info->dev || !info->domain)
1767			continue;
1768
1769		__dmar_remove_one_dev_info(info);
1770	}
1771	spin_unlock_irqrestore(&device_domain_lock, flags);
1772
1773	if (iommu->gcmd & DMA_GCMD_TE)
1774		iommu_disable_translation(iommu);
1775}
1776
1777static void free_dmar_iommu(struct intel_iommu *iommu)
1778{
1779	if ((iommu->domains) && (iommu->domain_ids)) {
1780		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1781		int i;
1782
1783		for (i = 0; i < elems; i++)
1784			kfree(iommu->domains[i]);
1785		kfree(iommu->domains);
1786		kfree(iommu->domain_ids);
1787		iommu->domains = NULL;
1788		iommu->domain_ids = NULL;
1789	}
1790
1791	g_iommus[iommu->seq_id] = NULL;
1792
1793	/* free context mapping */
1794	free_context_table(iommu);
1795
1796#ifdef CONFIG_INTEL_IOMMU_SVM
1797	if (pasid_supported(iommu)) {
1798		if (ecap_prs(iommu->ecap))
1799			intel_svm_finish_prq(iommu);
1800	}
1801	if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1802		ioasid_unregister_allocator(&iommu->pasid_allocator);
1803
1804#endif
1805}
1806
1807/*
1808 * Check and return whether first level is used by default for
1809 * DMA translation.
1810 */
1811static bool first_level_by_default(void)
1812{
1813	struct dmar_drhd_unit *drhd;
1814	struct intel_iommu *iommu;
1815	static int first_level_support = -1;
1816
1817	if (likely(first_level_support != -1))
1818		return first_level_support;
1819
1820	first_level_support = 1;
1821
1822	rcu_read_lock();
1823	for_each_active_iommu(iommu, drhd) {
1824		if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1825			first_level_support = 0;
1826			break;
1827		}
1828	}
1829	rcu_read_unlock();
1830
1831	return first_level_support;
1832}
1833
1834static struct dmar_domain *alloc_domain(int flags)
1835{
1836	struct dmar_domain *domain;
1837
1838	domain = alloc_domain_mem();
1839	if (!domain)
1840		return NULL;
1841
1842	memset(domain, 0, sizeof(*domain));
1843	domain->nid = NUMA_NO_NODE;
1844	domain->flags = flags;
1845	if (first_level_by_default())
1846		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1847	domain->has_iotlb_device = false;
1848	INIT_LIST_HEAD(&domain->devices);
1849
1850	return domain;
1851}
1852
1853/* Must be called with iommu->lock */
1854static int domain_attach_iommu(struct dmar_domain *domain,
1855			       struct intel_iommu *iommu)
1856{
1857	unsigned long ndomains;
1858	int num;
1859
1860	assert_spin_locked(&device_domain_lock);
1861	assert_spin_locked(&iommu->lock);
1862
1863	domain->iommu_refcnt[iommu->seq_id] += 1;
1864	domain->iommu_count += 1;
1865	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1866		ndomains = cap_ndoms(iommu->cap);
1867		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1868
1869		if (num >= ndomains) {
1870			pr_err("%s: No free domain ids\n", iommu->name);
1871			domain->iommu_refcnt[iommu->seq_id] -= 1;
1872			domain->iommu_count -= 1;
1873			return -ENOSPC;
1874		}
1875
1876		set_bit(num, iommu->domain_ids);
1877		set_iommu_domain(iommu, num, domain);
1878
1879		domain->iommu_did[iommu->seq_id] = num;
1880		domain->nid			 = iommu->node;
1881
1882		domain_update_iommu_cap(domain);
1883	}
1884
1885	return 0;
1886}
1887
1888static int domain_detach_iommu(struct dmar_domain *domain,
1889			       struct intel_iommu *iommu)
1890{
1891	int num, count;
1892
1893	assert_spin_locked(&device_domain_lock);
1894	assert_spin_locked(&iommu->lock);
1895
1896	domain->iommu_refcnt[iommu->seq_id] -= 1;
1897	count = --domain->iommu_count;
1898	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1899		num = domain->iommu_did[iommu->seq_id];
1900		clear_bit(num, iommu->domain_ids);
1901		set_iommu_domain(iommu, num, NULL);
1902
1903		domain_update_iommu_cap(domain);
1904		domain->iommu_did[iommu->seq_id] = 0;
1905	}
1906
1907	return count;
1908}
1909
1910static struct iova_domain reserved_iova_list;
1911static struct lock_class_key reserved_rbtree_key;
1912
1913static int dmar_init_reserved_ranges(void)
1914{
1915	struct pci_dev *pdev = NULL;
1916	struct iova *iova;
1917	int i;
1918
1919	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1920
1921	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1922		&reserved_rbtree_key);
1923
1924	/* IOAPIC ranges shouldn't be accessed by DMA */
1925	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1926		IOVA_PFN(IOAPIC_RANGE_END));
1927	if (!iova) {
1928		pr_err("Reserve IOAPIC range failed\n");
1929		return -ENODEV;
1930	}
1931
1932	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1933	for_each_pci_dev(pdev) {
1934		struct resource *r;
1935
1936		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1937			r = &pdev->resource[i];
1938			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1939				continue;
1940			iova = reserve_iova(&reserved_iova_list,
1941					    IOVA_PFN(r->start),
1942					    IOVA_PFN(r->end));
1943			if (!iova) {
1944				pci_err(pdev, "Reserve iova for %pR failed\n", r);
1945				return -ENODEV;
1946			}
1947		}
1948	}
1949	return 0;
1950}
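
/*
 * Illustrative note (not from the original source, assuming 4KiB pages):
 * IOVA_PFN(IOAPIC_RANGE_START) = 0xfee00000 >> 12 = 0xfee00 and
 * IOVA_PFN(IOAPIC_RANGE_END) = 0xfeefffff >> 12 = 0xfeeff, so the
 * reservation above covers the 256 page frames of the IOAPIC MMIO
 * window, keeping them out of any DMA-API IOVA allocation.
 */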
1951
1952static inline int guestwidth_to_adjustwidth(int gaw)
1953{
1954	int agaw;
1955	int r = (gaw - 12) % 9;
1956
1957	if (r == 0)
1958		agaw = gaw;
1959	else
1960		agaw = gaw + 9 - r;
1961	if (agaw > 64)
1962		agaw = 64;
1963	return agaw;
1964}
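
/*
 * Worked example (illustrative only): the adjusted guest address width is
 * the guest width rounded up to the next page-table level boundary
 * (12 + 9 * n). For gaw = 48, r = (48 - 12) % 9 = 0, so agaw = 48; for
 * gaw = 50, r = 38 % 9 = 2, so agaw = 50 + 9 - 2 = 57, the smallest
 * 5-level-capable width that covers 50 bits.
 */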
1965
1966static void domain_exit(struct dmar_domain *domain)
1967{
1968
1969	/* Remove associated devices and clear attached or cached domains */
1970	domain_remove_dev_info(domain);
1971
1972	/* destroy iovas */
1973	if (domain->domain.type == IOMMU_DOMAIN_DMA)
1974		put_iova_domain(&domain->iovad);
1975
1976	if (domain->pgd) {
1977		struct page *freelist;
1978
1979		freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1980		dma_free_pagelist(freelist);
1981	}
1982
1983	free_domain_mem(domain);
1984}
1985
1986/*
1987 * Get the PASID directory size for scalable mode context entry.
1988 * The value X in the PDTS field of a scalable mode context entry
1989 * indicates a PASID directory with 2^(X + 7) entries.
1990 */
1991static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1992{
1993	int pds, max_pde;
1994
1995	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1996	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1997	if (pds < 7)
1998		return 0;
1999
2000	return pds - 7;
2001}
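
/*
 * Worked example (illustrative, assuming PASID_PDE_SHIFT is 6): for a
 * table with max_pasid = 1 << 20, max_pde = 1 << 14, so find_first_bit()
 * returns 14 and the function returns 14 - 7 = 7. A PDTS value of 7
 * encodes a PASID directory with 2^(7 + 7) = 16384 entries, each of which
 * covers 2^6 PASID table entries, i.e. the full 2^20 PASID space.
 */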
2002
2003/*
2004 * Set the RID_PASID field of a scalable mode context entry. The
2005 * IOMMU hardware will use the PASID value set in this field for
2006 * DMA translations of DMA requests without PASID.
2007 */
2008static inline void
2009context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2010{
2011	context->hi |= pasid & ((1 << 20) - 1);
2012}
2013
2014/*
2015 * Set the DTE (Device-TLB Enable) field of a scalable mode context
2016 * entry.
2017 */
2018static inline void context_set_sm_dte(struct context_entry *context)
2019{
2020	context->lo |= (1 << 2);
2021}
2022
2023/*
2024 * Set the PRE (Page Request Enable) field of a scalable mode context
2025 * entry.
2026 */
2027static inline void context_set_sm_pre(struct context_entry *context)
2028{
2029	context->lo |= (1 << 4);
2030}
2031
2032/* Convert value to context PASID directory size field coding. */
2033#define context_pdts(pds)	(((pds) & 0x7) << 9)
2034
2035static int domain_context_mapping_one(struct dmar_domain *domain,
2036				      struct intel_iommu *iommu,
2037				      struct pasid_table *table,
2038				      u8 bus, u8 devfn)
2039{
2040	u16 did = domain->iommu_did[iommu->seq_id];
2041	int translation = CONTEXT_TT_MULTI_LEVEL;
2042	struct device_domain_info *info = NULL;
2043	struct context_entry *context;
2044	unsigned long flags;
2045	int ret;
2046
2047	WARN_ON(did == 0);
2048
2049	if (hw_pass_through && domain_type_is_si(domain))
2050		translation = CONTEXT_TT_PASS_THROUGH;
2051
2052	pr_debug("Set context mapping for %02x:%02x.%d\n",
2053		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2054
2055	BUG_ON(!domain->pgd);
2056
2057	spin_lock_irqsave(&device_domain_lock, flags);
2058	spin_lock(&iommu->lock);
2059
2060	ret = -ENOMEM;
2061	context = iommu_context_addr(iommu, bus, devfn, 1);
2062	if (!context)
2063		goto out_unlock;
2064
2065	ret = 0;
2066	if (context_present(context))
2067		goto out_unlock;
2068
2069	/*
2070	 * For kdump cases, old valid entries may be cached due to the
2071	 * in-flight DMA and copied pgtable, but there is no unmapping
2072	 * behaviour for them, thus we need an explicit cache flush for
2073	 * the newly-mapped device. For kdump, at this point, the device
2074	 * is supposed to have finished reset at its driver probe stage, so
2075	 * no in-flight DMA will exist, and we don't need to worry about it
2076	 * hereafter.
2077	 */
2078	if (context_copied(context)) {
2079		u16 did_old = context_domain_id(context);
2080
2081		if (did_old < cap_ndoms(iommu->cap)) {
2082			iommu->flush.flush_context(iommu, did_old,
2083						   (((u16)bus) << 8) | devfn,
2084						   DMA_CCMD_MASK_NOBIT,
2085						   DMA_CCMD_DEVICE_INVL);
2086			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2087						 DMA_TLB_DSI_FLUSH);
2088		}
2089	}
2090
2091	context_clear_entry(context);
2092
2093	if (sm_supported(iommu)) {
2094		unsigned long pds;
2095
2096		WARN_ON(!table);
2097
2098		/* Setup the PASID DIR pointer: */
2099		pds = context_get_sm_pds(table);
2100		context->lo = (u64)virt_to_phys(table->table) |
2101				context_pdts(pds);
2102
2103		/* Setup the RID_PASID field: */
2104		context_set_sm_rid2pasid(context, PASID_RID2PASID);
2105
2106		/*
2107		 * Setup the Device-TLB enable bit and Page request
2108		 * Enable bit:
2109		 */
2110		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2111		if (info && info->ats_supported)
2112			context_set_sm_dte(context);
2113		if (info && info->pri_supported)
2114			context_set_sm_pre(context);
2115	} else {
2116		struct dma_pte *pgd = domain->pgd;
2117		int agaw;
2118
2119		context_set_domain_id(context, did);
2120
2121		if (translation != CONTEXT_TT_PASS_THROUGH) {
2122			/*
2123			 * Skip top levels of page tables for IOMMUs which have
2124			 * less agaw than the default. Unnecessary for PT mode.
2125			 */
2126			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2127				ret = -ENOMEM;
2128				pgd = phys_to_virt(dma_pte_addr(pgd));
2129				if (!dma_pte_present(pgd))
2130					goto out_unlock;
2131			}
2132
2133			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2134			if (info && info->ats_supported)
2135				translation = CONTEXT_TT_DEV_IOTLB;
2136			else
2137				translation = CONTEXT_TT_MULTI_LEVEL;
2138
2139			context_set_address_root(context, virt_to_phys(pgd));
2140			context_set_address_width(context, agaw);
2141		} else {
2142			/*
2143			 * In pass-through mode, AW must be programmed to
2144			 * indicate the largest AGAW value supported by
2145			 * hardware, and ASR is ignored by hardware.
2146			 */
2147			context_set_address_width(context, iommu->msagaw);
2148		}
2149
2150		context_set_translation_type(context, translation);
2151	}
2152
2153	context_set_fault_enable(context);
2154	context_set_present(context);
2155	if (!ecap_coherent(iommu->ecap))
2156		clflush_cache_range(context, sizeof(*context));
2157
2158	/*
2159	 * It's a non-present to present mapping. If hardware doesn't cache
2160	 * non-present entries we only need to flush the write-buffer. If it
2161	 * _does_ cache non-present entries, then it does so in the special
2162	 * domain #0, which we have to flush:
2163	 */
2164	if (cap_caching_mode(iommu->cap)) {
2165		iommu->flush.flush_context(iommu, 0,
2166					   (((u16)bus) << 8) | devfn,
2167					   DMA_CCMD_MASK_NOBIT,
2168					   DMA_CCMD_DEVICE_INVL);
2169		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2170	} else {
2171		iommu_flush_write_buffer(iommu);
2172	}
2173	iommu_enable_dev_iotlb(info);
2174
2175	ret = 0;
2176
2177out_unlock:
2178	spin_unlock(&iommu->lock);
2179	spin_unlock_irqrestore(&device_domain_lock, flags);
2180
2181	return ret;
2182}
2183
2184struct domain_context_mapping_data {
2185	struct dmar_domain *domain;
2186	struct intel_iommu *iommu;
2187	struct pasid_table *table;
2188};
2189
2190static int domain_context_mapping_cb(struct pci_dev *pdev,
2191				     u16 alias, void *opaque)
2192{
2193	struct domain_context_mapping_data *data = opaque;
2194
2195	return domain_context_mapping_one(data->domain, data->iommu,
2196					  data->table, PCI_BUS_NUM(alias),
2197					  alias & 0xff);
2198}
2199
2200static int
2201domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2202{
2203	struct domain_context_mapping_data data;
2204	struct pasid_table *table;
2205	struct intel_iommu *iommu;
2206	u8 bus, devfn;
2207
2208	iommu = device_to_iommu(dev, &bus, &devfn);
2209	if (!iommu)
2210		return -ENODEV;
2211
2212	table = intel_pasid_get_table(dev);
2213
2214	if (!dev_is_pci(dev))
2215		return domain_context_mapping_one(domain, iommu, table,
2216						  bus, devfn);
2217
2218	data.domain = domain;
2219	data.iommu = iommu;
2220	data.table = table;
2221
2222	return pci_for_each_dma_alias(to_pci_dev(dev),
2223				      &domain_context_mapping_cb, &data);
2224}
2225
2226static int domain_context_mapped_cb(struct pci_dev *pdev,
2227				    u16 alias, void *opaque)
2228{
2229	struct intel_iommu *iommu = opaque;
2230
2231	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2232}
2233
2234static int domain_context_mapped(struct device *dev)
2235{
2236	struct intel_iommu *iommu;
2237	u8 bus, devfn;
2238
2239	iommu = device_to_iommu(dev, &bus, &devfn);
2240	if (!iommu)
2241		return -ENODEV;
2242
2243	if (!dev_is_pci(dev))
2244		return device_context_mapped(iommu, bus, devfn);
2245
2246	return !pci_for_each_dma_alias(to_pci_dev(dev),
2247				       domain_context_mapped_cb, iommu);
2248}
2249
2250/* Returns a number of VTD pages, but aligned to MM page size */
2251static inline unsigned long aligned_nrpages(unsigned long host_addr,
2252					    size_t size)
2253{
2254	host_addr &= ~PAGE_MASK;
2255	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2256}
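
/*
 * Illustrative example (assuming both MM and VT-d pages are 4KiB): for
 * host_addr = 0x1234 and size = 0x2000, the in-page offset is 0x234 and
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000, so the function returns 3 VT-d
 * pages even though the raw size only spans two page lengths.
 */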
2257
2258/* Return largest possible superpage level for a given mapping */
2259static inline int hardware_largepage_caps(struct dmar_domain *domain,
2260					  unsigned long iov_pfn,
2261					  unsigned long phy_pfn,
2262					  unsigned long pages)
2263{
2264	int support, level = 1;
2265	unsigned long pfnmerge;
2266
2267	support = domain->iommu_superpage;
2268
2269	/* To use a large page, the virtual *and* physical addresses
2270	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2271	   of them will mean we have to use smaller pages. So just
2272	   merge them and check both at once. */
2273	pfnmerge = iov_pfn | phy_pfn;
2274
2275	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2276		pages >>= VTD_STRIDE_SHIFT;
2277		if (!pages)
2278			break;
2279		pfnmerge >>= VTD_STRIDE_SHIFT;
2280		level++;
2281		support--;
2282	}
2283	return level;
2284}
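
/*
 * Illustrative example (assuming VTD_STRIDE_SHIFT is 9): if both iov_pfn
 * and phy_pfn have their low 9 bits clear (i.e. the addresses are
 * 2MiB-aligned), pages = 1024 and domain->iommu_superpage = 1, the loop
 * runs once (pages becomes 2, level becomes 2, support drops to 0) and
 * the function returns level 2, allowing a 2MiB superpage PTE for this
 * chunk of the mapping.
 */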
2285
2286static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2287			    struct scatterlist *sg, unsigned long phys_pfn,
2288			    unsigned long nr_pages, int prot)
2289{
2290	struct dma_pte *first_pte = NULL, *pte = NULL;
2291	phys_addr_t pteval;
2292	unsigned long sg_res = 0;
2293	unsigned int largepage_lvl = 0;
2294	unsigned long lvl_pages = 0;
2295	u64 attr;
2296
2297	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2298
2299	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2300		return -EINVAL;
2301
2302	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2303	if (domain_use_first_level(domain))
2304		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2305
2306	if (!sg) {
2307		sg_res = nr_pages;
2308		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2309	}
2310
2311	while (nr_pages > 0) {
2312		uint64_t tmp;
2313
2314		if (!sg_res) {
2315			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2316
2317			sg_res = aligned_nrpages(sg->offset, sg->length);
2318			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2319			sg->dma_length = sg->length;
2320			pteval = (sg_phys(sg) - pgoff) | attr;
2321			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2322		}
2323
2324		if (!pte) {
2325			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2326
2327			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2328			if (!pte)
2329				return -ENOMEM;
2330			/* It is a large page */
2331			if (largepage_lvl > 1) {
2332				unsigned long nr_superpages, end_pfn;
2333
2334				pteval |= DMA_PTE_LARGE_PAGE;
2335				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2336
2337				nr_superpages = sg_res / lvl_pages;
2338				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2339
2340				/*
2341				 * Ensure that old small page tables are
2342				 * removed to make room for superpage(s).
2343				 * We're adding new large pages, so make sure
2344				 * we don't remove their parent tables.
2345				 */
2346				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2347						       largepage_lvl + 1);
2348			} else {
2349				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2350			}
2351
2352		}
2353		/* We don't need a lock here; nobody else
2354		 * touches the iova range
2355		 */
2356		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2357		if (tmp) {
2358			static int dumps = 5;
2359			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2360				iov_pfn, tmp, (unsigned long long)pteval);
2361			if (dumps) {
2362				dumps--;
2363				debug_dma_dump_mappings(NULL);
2364			}
2365			WARN_ON(1);
2366		}
2367
2368		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2369
2370		BUG_ON(nr_pages < lvl_pages);
2371		BUG_ON(sg_res < lvl_pages);
2372
2373		nr_pages -= lvl_pages;
2374		iov_pfn += lvl_pages;
2375		phys_pfn += lvl_pages;
2376		pteval += lvl_pages * VTD_PAGE_SIZE;
2377		sg_res -= lvl_pages;
2378
2379		/* If the next PTE would be the first in a new page, then we
2380		   need to flush the cache on the entries we've just written.
2381		   And then we'll need to recalculate 'pte', so clear it and
2382		   let it get set again in the if (!pte) block above.
2383
2384		   If we're done (!nr_pages) we need to flush the cache too.
2385
2386		   Also if we've been setting superpages, we may need to
2387		   recalculate 'pte' and switch back to smaller pages for the
2388		   end of the mapping, if the trailing size is not enough to
2389		   use another superpage (i.e. sg_res < lvl_pages). */
2390		pte++;
2391		if (!nr_pages || first_pte_in_page(pte) ||
2392		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2393			domain_flush_cache(domain, first_pte,
2394					   (void *)pte - (void *)first_pte);
2395			pte = NULL;
2396		}
2397
2398		if (!sg_res && nr_pages)
2399			sg = sg_next(sg);
2400	}
2401	return 0;
2402}
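
/*
 * Illustrative walk-through (not part of the original source): mapping a
 * 2MiB-aligned, 2MiB-long region with superpage support ends up with
 * largepage_lvl = 2 and lvl_pages = 512, so a single level-2 PTE is
 * written with DMA_PTE_LARGE_PAGE set and nr_pages drops from 512 to 0 in
 * one iteration. An unaligned region instead stays at level 1 and writes
 * 512 individual 4KiB PTEs, calling domain_flush_cache() whenever the PTE
 * pointer crosses into a new page-table page or the mapping ends.
 */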
2403
2404static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2405			  struct scatterlist *sg, unsigned long phys_pfn,
2406			  unsigned long nr_pages, int prot)
2407{
2408	int iommu_id, ret;
2409	struct intel_iommu *iommu;
2410
2411	/* Do the real mapping first */
2412	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2413	if (ret)
2414		return ret;
2415
2416	for_each_domain_iommu(iommu_id, domain) {
2417		iommu = g_iommus[iommu_id];
2418		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2419	}
2420
2421	return 0;
2422}
2423
2424static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2425				    struct scatterlist *sg, unsigned long nr_pages,
2426				    int prot)
2427{
2428	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2429}
2430
2431static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2432				     unsigned long phys_pfn, unsigned long nr_pages,
2433				     int prot)
2434{
2435	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2436}
2437
2438static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2439{
2440	unsigned long flags;
2441	struct context_entry *context;
2442	u16 did_old;
2443
2444	if (!iommu)
2445		return;
2446
2447	spin_lock_irqsave(&iommu->lock, flags);
2448	context = iommu_context_addr(iommu, bus, devfn, 0);
2449	if (!context) {
2450		spin_unlock_irqrestore(&iommu->lock, flags);
2451		return;
2452	}
2453	did_old = context_domain_id(context);
2454	context_clear_entry(context);
2455	__iommu_flush_cache(iommu, context, sizeof(*context));
2456	spin_unlock_irqrestore(&iommu->lock, flags);
2457	iommu->flush.flush_context(iommu,
2458				   did_old,
2459				   (((u16)bus) << 8) | devfn,
2460				   DMA_CCMD_MASK_NOBIT,
2461				   DMA_CCMD_DEVICE_INVL);
2462	iommu->flush.flush_iotlb(iommu,
2463				 did_old,
2464				 0,
2465				 0,
2466				 DMA_TLB_DSI_FLUSH);
2467}
2468
2469static inline void unlink_domain_info(struct device_domain_info *info)
2470{
2471	assert_spin_locked(&device_domain_lock);
2472	list_del(&info->link);
2473	list_del(&info->global);
2474	if (info->dev)
2475		dev_iommu_priv_set(info->dev, NULL);
2476}
2477
2478static void domain_remove_dev_info(struct dmar_domain *domain)
2479{
2480	struct device_domain_info *info, *tmp;
2481	unsigned long flags;
2482
2483	spin_lock_irqsave(&device_domain_lock, flags);
2484	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2485		__dmar_remove_one_dev_info(info);
2486	spin_unlock_irqrestore(&device_domain_lock, flags);
2487}
2488
2489struct dmar_domain *find_domain(struct device *dev)
2490{
2491	struct device_domain_info *info;
2492
2493	if (unlikely(attach_deferred(dev)))
2494		return NULL;
2495
2496	/* No lock here, assumes no domain exit in normal case */
2497	info = get_domain_info(dev);
2498	if (likely(info))
2499		return info->domain;
2500
2501	return NULL;
2502}
2503
2504static void do_deferred_attach(struct device *dev)
2505{
2506	struct iommu_domain *domain;
2507
2508	dev_iommu_priv_set(dev, NULL);
2509	domain = iommu_get_domain_for_dev(dev);
2510	if (domain)
2511		intel_iommu_attach_device(domain, dev);
2512}
2513
2514static inline struct device_domain_info *
2515dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2516{
2517	struct device_domain_info *info;
2518
2519	list_for_each_entry(info, &device_domain_list, global)
2520		if (info->segment == segment && info->bus == bus &&
2521		    info->devfn == devfn)
2522			return info;
2523
2524	return NULL;
2525}
2526
2527static int domain_setup_first_level(struct intel_iommu *iommu,
2528				    struct dmar_domain *domain,
2529				    struct device *dev,
2530				    int pasid)
2531{
2532	int flags = PASID_FLAG_SUPERVISOR_MODE;
2533	struct dma_pte *pgd = domain->pgd;
2534	int agaw, level;
2535
2536	/*
2537	 * Skip top levels of page tables for IOMMUs which have
2538	 * less agaw than the default. Unnecessary for PT mode.
2539	 */
2540	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2541		pgd = phys_to_virt(dma_pte_addr(pgd));
2542		if (!dma_pte_present(pgd))
2543			return -ENOMEM;
2544	}
2545
2546	level = agaw_to_level(agaw);
2547	if (level != 4 && level != 5)
2548		return -EINVAL;
2549
2550	flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2551
2552	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2553					     domain->iommu_did[iommu->seq_id],
2554					     flags);
2555}
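
/*
 * Illustrative note (assuming agaw_to_level() returns agaw + 2): a domain
 * agaw of 2 corresponds to a 4-level (48-bit) first-level page table and
 * an agaw of 3 to a 5-level (57-bit) one; only in the latter case is
 * PASID_FLAG_FL5LP added to the flags passed to
 * intel_pasid_setup_first_level().
 */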
2556
2557static bool dev_is_real_dma_subdevice(struct device *dev)
2558{
2559	return dev && dev_is_pci(dev) &&
2560	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2561}
2562
2563static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2564						    int bus, int devfn,
2565						    struct device *dev,
2566						    struct dmar_domain *domain)
2567{
2568	struct dmar_domain *found = NULL;
2569	struct device_domain_info *info;
2570	unsigned long flags;
2571	int ret;
2572
2573	info = alloc_devinfo_mem();
2574	if (!info)
2575		return NULL;
2576
2577	if (!dev_is_real_dma_subdevice(dev)) {
2578		info->bus = bus;
2579		info->devfn = devfn;
2580		info->segment = iommu->segment;
2581	} else {
2582		struct pci_dev *pdev = to_pci_dev(dev);
2583
2584		info->bus = pdev->bus->number;
2585		info->devfn = pdev->devfn;
2586		info->segment = pci_domain_nr(pdev->bus);
2587	}
2588
2589	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2590	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2591	info->ats_qdep = 0;
2592	info->dev = dev;
2593	info->domain = domain;
2594	info->iommu = iommu;
2595	info->pasid_table = NULL;
2596	info->auxd_enabled = 0;
2597	INIT_LIST_HEAD(&info->auxiliary_domains);
2598
2599	if (dev && dev_is_pci(dev)) {
2600		struct pci_dev *pdev = to_pci_dev(info->dev);
2601
2602		if (ecap_dev_iotlb_support(iommu->ecap) &&
2603		    pci_ats_supported(pdev) &&
2604		    dmar_find_matched_atsr_unit(pdev))
2605			info->ats_supported = 1;
2606
2607		if (sm_supported(iommu)) {
2608			if (pasid_supported(iommu)) {
2609				int features = pci_pasid_features(pdev);
2610				if (features >= 0)
2611					info->pasid_supported = features | 1;
2612			}
2613
2614			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2615			    pci_pri_supported(pdev))
2616				info->pri_supported = 1;
2617		}
2618	}
2619
2620	spin_lock_irqsave(&device_domain_lock, flags);
2621	if (dev)
2622		found = find_domain(dev);
2623
2624	if (!found) {
2625		struct device_domain_info *info2;
2626		info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2627						       info->devfn);
2628		if (info2) {
2629			found      = info2->domain;
2630			info2->dev = dev;
2631		}
2632	}
2633
2634	if (found) {
2635		spin_unlock_irqrestore(&device_domain_lock, flags);
2636		free_devinfo_mem(info);
2637		/* Caller must free the original domain */
2638		return found;
2639	}
2640
2641	spin_lock(&iommu->lock);
2642	ret = domain_attach_iommu(domain, iommu);
2643	spin_unlock(&iommu->lock);
2644
2645	if (ret) {
2646		spin_unlock_irqrestore(&device_domain_lock, flags);
2647		free_devinfo_mem(info);
2648		return NULL;
2649	}
2650
2651	list_add(&info->link, &domain->devices);
2652	list_add(&info->global, &device_domain_list);
2653	if (dev)
2654		dev_iommu_priv_set(dev, info);
2655	spin_unlock_irqrestore(&device_domain_lock, flags);
2656
2657	/* PASID table is mandatory for a PCI device in scalable mode. */
2658	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2659		ret = intel_pasid_alloc_table(dev);
2660		if (ret) {
2661			dev_err(dev, "PASID table allocation failed\n");
2662			dmar_remove_one_dev_info(dev);
2663			return NULL;
2664		}
2665
2666		/* Setup the PASID entry for requests without PASID: */
2667		spin_lock_irqsave(&iommu->lock, flags);
2668		if (hw_pass_through && domain_type_is_si(domain))
2669			ret = intel_pasid_setup_pass_through(iommu, domain,
2670					dev, PASID_RID2PASID);
2671		else if (domain_use_first_level(domain))
2672			ret = domain_setup_first_level(iommu, domain, dev,
2673					PASID_RID2PASID);
2674		else
2675			ret = intel_pasid_setup_second_level(iommu, domain,
2676					dev, PASID_RID2PASID);
2677		spin_unlock_irqrestore(&iommu->lock, flags);
2678		if (ret) {
2679			dev_err(dev, "Setup RID2PASID failed\n");
2680			dmar_remove_one_dev_info(dev);
2681			return NULL;
2682		}
2683	}
2684
2685	if (dev && domain_context_mapping(domain, dev)) {
2686		dev_err(dev, "Domain context map failed\n");
2687		dmar_remove_one_dev_info(dev);
2688		return NULL;
2689	}
2690
2691	return domain;
2692}
2693
2694static int iommu_domain_identity_map(struct dmar_domain *domain,
2695				     unsigned long first_vpfn,
2696				     unsigned long last_vpfn)
2697{
2698	/*
2699	 * The RMRR range might overlap with the physical memory range,
2700	 * so clear it first
2701	 */
2702	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2703
2704	return __domain_mapping(domain, first_vpfn, NULL,
2705				first_vpfn, last_vpfn - first_vpfn + 1,
2706				DMA_PTE_READ|DMA_PTE_WRITE);
2707}
2708
2709static int md_domain_init(struct dmar_domain *domain, int guest_width);
2710
2711static int __init si_domain_init(int hw)
2712{
2713	struct dmar_rmrr_unit *rmrr;
2714	struct device *dev;
2715	int i, nid, ret;
2716
2717	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2718	if (!si_domain)
2719		return -EFAULT;
2720
2721	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2722		domain_exit(si_domain);
2723		return -EFAULT;
2724	}
2725
2726	if (hw)
2727		return 0;
2728
2729	for_each_online_node(nid) {
2730		unsigned long start_pfn, end_pfn;
2731		int i;
2732
2733		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2734			ret = iommu_domain_identity_map(si_domain,
2735					mm_to_dma_pfn(start_pfn),
2736					mm_to_dma_pfn(end_pfn));
2737			if (ret)
2738				return ret;
2739		}
2740	}
2741
2742	/*
2743	 * Identity map the RMRRs so that devices with RMRRs can also use
2744	 * the si_domain.
2745	 */
2746	for_each_rmrr_units(rmrr) {
2747		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2748					  i, dev) {
2749			unsigned long long start = rmrr->base_address;
2750			unsigned long long end = rmrr->end_address;
2751
2752			if (WARN_ON(end < start ||
2753				    end >> agaw_to_width(si_domain->agaw)))
2754				continue;
2755
2756			ret = iommu_domain_identity_map(si_domain,
2757					mm_to_dma_pfn(start >> PAGE_SHIFT),
2758					mm_to_dma_pfn(end >> PAGE_SHIFT));
2759			if (ret)
2760				return ret;
2761		}
2762	}
2763
2764	return 0;
2765}
2766
2767static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2768{
2769	struct dmar_domain *ndomain;
2770	struct intel_iommu *iommu;
2771	u8 bus, devfn;
2772
2773	iommu = device_to_iommu(dev, &bus, &devfn);
2774	if (!iommu)
2775		return -ENODEV;
2776
2777	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2778	if (ndomain != domain)
2779		return -EBUSY;
2780
2781	return 0;
2782}
2783
2784static bool device_has_rmrr(struct device *dev)
2785{
2786	struct dmar_rmrr_unit *rmrr;
2787	struct device *tmp;
2788	int i;
2789
2790	rcu_read_lock();
2791	for_each_rmrr_units(rmrr) {
2792		/*
2793		 * Return TRUE if this RMRR contains the device that
2794		 * is passed in.
2795		 */
2796		for_each_active_dev_scope(rmrr->devices,
2797					  rmrr->devices_cnt, i, tmp)
2798			if (tmp == dev ||
2799			    is_downstream_to_pci_bridge(dev, tmp)) {
2800				rcu_read_unlock();
2801				return true;
2802			}
2803	}
2804	rcu_read_unlock();
2805	return false;
2806}
2807
2808/**
2809 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2810 * is relaxable (i.e. it is allowed to be not enforced under some conditions)
2811 * @dev: device handle
2812 *
2813 * We assume that PCI USB devices with RMRRs have them largely
2814 * for historical reasons and that the RMRR space is not actively used post
2815 * boot.  This exclusion may change if vendors begin to abuse it.
2816 *
2817 * The same exception is made for graphics devices, with the requirement that
2818 * any use of the RMRR regions will be torn down before assigning the device
2819 * to a guest.
2820 *
2821 * Return: true if the RMRR is relaxable, false otherwise
2822 */
2823static bool device_rmrr_is_relaxable(struct device *dev)
2824{
2825	struct pci_dev *pdev;
2826
2827	if (!dev_is_pci(dev))
2828		return false;
2829
2830	pdev = to_pci_dev(dev);
2831	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2832		return true;
2833	else
2834		return false;
2835}
2836
2837/*
2838 * There are a couple of cases where we need to restrict the functionality of
2839 * devices associated with RMRRs.  The first is when evaluating a device for
2840 * identity mapping because problems exist when devices are moved in and out
2841 * of domains and their respective RMRR information is lost.  This means that
2842 * a device with associated RMRRs will never be in a "passthrough" domain.
2843 * The second is use of the device through the IOMMU API.  This interface
2844 * expects to have full control of the IOVA space for the device.  We cannot
2845 * satisfy both the requirement that RMRR access is maintained and have an
2846 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2847 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2848 * We therefore prevent devices associated with an RMRR from participating in
2849 * the IOMMU API, which eliminates them from device assignment.
2850 *
2851 * In both cases, devices which have relaxable RMRRs are not concerned by this
2852 * restriction. See device_rmrr_is_relaxable comment.
2853 */
2854static bool device_is_rmrr_locked(struct device *dev)
2855{
2856	if (!device_has_rmrr(dev))
2857		return false;
2858
2859	if (device_rmrr_is_relaxable(dev))
2860		return false;
2861
2862	return true;
2863}
2864
2865/*
2866 * Return the required default domain type for a specific device.
2867 *
2868 * @dev: the device in query
2870 *
2871 * Returns:
2872 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2873 *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2874 *  - 0: both identity and dynamic domains work for this device
2875 */
2876static int device_def_domain_type(struct device *dev)
2877{
2878	if (dev_is_pci(dev)) {
2879		struct pci_dev *pdev = to_pci_dev(dev);
2880
2881		/*
2882		 * Prevent any device marked as untrusted from getting
2883		 * placed into the static identity mapping domain.
2884		 */
2885		if (pdev->untrusted)
2886			return IOMMU_DOMAIN_DMA;
2887
2888		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2889			return IOMMU_DOMAIN_IDENTITY;
2890
2891		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2892			return IOMMU_DOMAIN_IDENTITY;
2893	}
2894
2895	return 0;
2896}
2897
2898static void intel_iommu_init_qi(struct intel_iommu *iommu)
2899{
2900	/*
2901	 * Start from a sane iommu hardware state.
2902	 * If queued invalidation was already initialized by us
2903	 * (for example, while enabling interrupt remapping) then
2904	 * things are already rolling from a sane state.
2905	 */
2906	if (!iommu->qi) {
2907		/*
2908		 * Clear any previous faults.
2909		 */
2910		dmar_fault(-1, iommu);
2911		/*
2912		 * Disable queued invalidation if supported and already enabled
2913		 * before OS handover.
2914		 */
2915		dmar_disable_qi(iommu);
2916	}
2917
2918	if (dmar_enable_qi(iommu)) {
2919		/*
2920		 * Queued invalidation is not enabled, use register-based invalidation
2921		 */
2922		iommu->flush.flush_context = __iommu_flush_context;
2923		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2924		pr_info("%s: Using Register based invalidation\n",
2925			iommu->name);
2926	} else {
2927		iommu->flush.flush_context = qi_flush_context;
2928		iommu->flush.flush_iotlb = qi_flush_iotlb;
2929		pr_info("%s: Using Queued invalidation\n", iommu->name);
2930	}
2931}
2932
2933static int copy_context_table(struct intel_iommu *iommu,
2934			      struct root_entry *old_re,
2935			      struct context_entry **tbl,
2936			      int bus, bool ext)
2937{
2938	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2939	struct context_entry *new_ce = NULL, ce;
2940	struct context_entry *old_ce = NULL;
2941	struct root_entry re;
2942	phys_addr_t old_ce_phys;
2943
2944	tbl_idx = ext ? bus * 2 : bus;
2945	memcpy(&re, old_re, sizeof(re));
2946
2947	for (devfn = 0; devfn < 256; devfn++) {
2948		/* First calculate the correct index */
2949		idx = (ext ? devfn * 2 : devfn) % 256;
2950
2951		if (idx == 0) {
2952			/* First save what we may have and clean up */
2953			if (new_ce) {
2954				tbl[tbl_idx] = new_ce;
2955				__iommu_flush_cache(iommu, new_ce,
2956						    VTD_PAGE_SIZE);
2957				pos = 1;
2958			}
2959
2960			if (old_ce)
2961				memunmap(old_ce);
2962
2963			ret = 0;
2964			if (devfn < 0x80)
2965				old_ce_phys = root_entry_lctp(&re);
2966			else
2967				old_ce_phys = root_entry_uctp(&re);
2968
2969			if (!old_ce_phys) {
2970				if (ext && devfn == 0) {
2971					/* No LCTP, try UCTP */
2972					devfn = 0x7f;
2973					continue;
2974				} else {
2975					goto out;
2976				}
2977			}
2978
2979			ret = -ENOMEM;
2980			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2981					MEMREMAP_WB);
2982			if (!old_ce)
2983				goto out;
2984
2985			new_ce = alloc_pgtable_page(iommu->node);
2986			if (!new_ce)
2987				goto out_unmap;
2988
2989			ret = 0;
2990		}
2991
2992		/* Now copy the context entry */
2993		memcpy(&ce, old_ce + idx, sizeof(ce));
2994
2995		if (!__context_present(&ce))
2996			continue;
2997
2998		did = context_domain_id(&ce);
2999		if (did >= 0 && did < cap_ndoms(iommu->cap))
3000			set_bit(did, iommu->domain_ids);
3001
3002		/*
3003		 * We need a marker for copied context entries. This
3004		 * marker needs to work for the old format as well as
3005		 * for extended context entries.
3006		 *
3007		 * Bit 67 of the context entry is used. In the old
3008		 * format this bit is available to software, in the
3009		 * extended format it is the PGE bit, but PGE is ignored
3010		 * by HW if PASIDs are disabled (and thus still
3011		 * available).
3012		 *
3013		 * So disable PASIDs first and then mark the entry
3014		 * copied. This means that we don't copy PASID
3015		 * translations from the old kernel, but this is fine as
3016		 * faults there are not fatal.
3017		 */
3018		context_clear_pasid_enable(&ce);
3019		context_set_copied(&ce);
3020
3021		new_ce[idx] = ce;
3022	}
3023
3024	tbl[tbl_idx + pos] = new_ce;
3025
3026	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3027
3028out_unmap:
3029	memunmap(old_ce);
3030
3031out:
3032	return ret;
3033}
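
/*
 * Illustrative note (not from the original source): in the extended root
 * entry format each context entry occupies two regular-sized slots, so
 * each bus owns two context tables, tbl_idx = bus * 2 and
 * idx = (devfn * 2) % 256. Devfns 0x00-0x7f therefore land in the table
 * referenced by the lower context-table pointer and devfns 0x80-0xff in
 * the one referenced by the upper pointer, which is why the copy restarts
 * at idx == 0 and switches from root_entry_lctp() to root_entry_uctp()
 * half way through.
 */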
3034
3035static int copy_translation_tables(struct intel_iommu *iommu)
3036{
3037	struct context_entry **ctxt_tbls;
3038	struct root_entry *old_rt;
3039	phys_addr_t old_rt_phys;
3040	int ctxt_table_entries;
3041	unsigned long flags;
3042	u64 rtaddr_reg;
3043	int bus, ret;
3044	bool new_ext, ext;
3045
3046	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3047	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3048	new_ext    = !!ecap_ecs(iommu->ecap);
3049
3050	/*
3051	 * The RTT bit can only be changed when translation is disabled,
3052	 * but disabling translation means opening a window for data
3053	 * corruption. So bail out and don't copy anything if we would
3054	 * have to change the bit.
3055	 */
3056	if (new_ext != ext)
3057		return -EINVAL;
3058
3059	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3060	if (!old_rt_phys)
3061		return -EINVAL;
3062
3063	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3064	if (!old_rt)
3065		return -ENOMEM;
3066
3067	/* This is too big for the stack - allocate it from slab */
3068	ctxt_table_entries = ext ? 512 : 256;
3069	ret = -ENOMEM;
3070	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3071	if (!ctxt_tbls)
3072		goto out_unmap;
3073
3074	for (bus = 0; bus < 256; bus++) {
3075		ret = copy_context_table(iommu, &old_rt[bus],
3076					 ctxt_tbls, bus, ext);
3077		if (ret) {
3078			pr_err("%s: Failed to copy context table for bus %d\n",
3079				iommu->name, bus);
3080			continue;
3081		}
3082	}
3083
3084	spin_lock_irqsave(&iommu->lock, flags);
3085
3086	/* Context tables are copied, now write them to the root_entry table */
3087	for (bus = 0; bus < 256; bus++) {
3088		int idx = ext ? bus * 2 : bus;
3089		u64 val;
3090
3091		if (ctxt_tbls[idx]) {
3092			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3093			iommu->root_entry[bus].lo = val;
3094		}
3095
3096		if (!ext || !ctxt_tbls[idx + 1])
3097			continue;
3098
3099		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3100		iommu->root_entry[bus].hi = val;
3101	}
3102
3103	spin_unlock_irqrestore(&iommu->lock, flags);
3104
3105	kfree(ctxt_tbls);
3106
3107	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3108
3109	ret = 0;
3110
3111out_unmap:
3112	memunmap(old_rt);
3113
3114	return ret;
3115}
3116
3117#ifdef CONFIG_INTEL_IOMMU_SVM
3118static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3119{
3120	struct intel_iommu *iommu = data;
3121	ioasid_t ioasid;
3122
3123	if (!iommu)
3124		return INVALID_IOASID;
3125	/*
3126	 * The VT-d virtual command interface always uses the full 20-bit
3127	 * PASID range. The host can partition the guest PASID range based on
3128	 * policies, but this is out of the guest's control.
3129	 */
3130	if (min < PASID_MIN || max > intel_pasid_max_id)
3131		return INVALID_IOASID;
3132
3133	if (vcmd_alloc_pasid(iommu, &ioasid))
3134		return INVALID_IOASID;
3135
3136	return ioasid;
3137}
3138
3139static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3140{
3141	struct intel_iommu *iommu = data;
3142
3143	if (!iommu)
3144		return;
3145	/*
3146	 * The sanity check of the ioasid owner is done at the upper layer, e.g. VFIO.
3147	 * We can only free the PASID when all the devices are unbound.
3148	 */
3149	if (ioasid_find(NULL, ioasid, NULL)) {
3150		pr_alert("Cannot free active IOASID %d\n", ioasid);
3151		return;
3152	}
3153	vcmd_free_pasid(iommu, ioasid);
3154}
3155
3156static void register_pasid_allocator(struct intel_iommu *iommu)
3157{
3158	/*
3159	 * If we are running in the host, there is no need for a custom
3160	 * allocator because PASIDs are allocated from the host's system-wide pool.
3161	 */
3162	if (!cap_caching_mode(iommu->cap))
3163		return;
3164
3165	if (!sm_supported(iommu)) {
3166		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3167		return;
3168	}
3169
3170	/*
3171	 * Register a custom PASID allocator if we are running in a guest,
3172	 * where guest PASIDs must be obtained via the virtual command interface.
3173	 * There can be multiple vIOMMUs in each guest but only one allocator
3174	 * is active. All vIOMMU allocators will eventually call the same
3175	 * host allocator.
3176	 */
3177	if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3178		return;
3179
3180	pr_info("Register custom PASID allocator\n");
3181	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3182	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3183	iommu->pasid_allocator.pdata = (void *)iommu;
3184	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3185		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3186		/*
3187		 * Disable scalable mode on this IOMMU if there
3188		 * is no custom allocator. Mixing SM-capable vIOMMUs
3189		 * and non-SM vIOMMUs is not supported.
3190		 */
3191		intel_iommu_sm = 0;
3192	}
3193}
3194#endif
3195
3196static int __init init_dmars(void)
3197{
3198	struct dmar_drhd_unit *drhd;
3199	struct intel_iommu *iommu;
3200	int ret;
3201
3202	/*
3203	 * for each drhd
3204	 *    allocate root
3205	 *    initialize and program root entry to not present
3206	 * endfor
3207	 */
3208	for_each_drhd_unit(drhd) {
3209		/*
3210		 * No lock needed: this is only incremented in the single-
3211		 * threaded kernel __init code path; all other accesses are
3212		 * read-only.
3213		 */
3214		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3215			g_num_of_iommus++;
3216			continue;
3217		}
3218		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3219	}
3220
3221	/* Preallocate enough resources for IOMMU hot-addition */
3222	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3223		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3224
3225	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3226			GFP_KERNEL);
3227	if (!g_iommus) {
3228		pr_err("Allocating global iommu array failed\n");
3229		ret = -ENOMEM;
3230		goto error;
3231	}
3232
3233	for_each_iommu(iommu, drhd) {
3234		if (drhd->ignored) {
3235			iommu_disable_translation(iommu);
3236			continue;
3237		}
3238
3239		/*
3240		 * Find the max PASID size of all IOMMUs in the system.
3241		 * We need to ensure the system PASID table is no bigger
3242		 * than the smallest supported size.
3243		 */
3244		if (pasid_supported(iommu)) {
3245			u32 temp = 2 << ecap_pss(iommu->ecap);
3246
3247			intel_pasid_max_id = min_t(u32, temp,
3248						   intel_pasid_max_id);
3249		}
3250
3251		g_iommus[iommu->seq_id] = iommu;
3252
3253		intel_iommu_init_qi(iommu);
3254
3255		ret = iommu_init_domains(iommu);
3256		if (ret)
3257			goto free_iommu;
3258
3259		init_translation_status(iommu);
3260
3261		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3262			iommu_disable_translation(iommu);
3263			clear_translation_pre_enabled(iommu);
3264			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3265				iommu->name);
3266		}
3267
3268		/*
3269		 * TBD:
3270		 * we could share the same root & context tables
3271		 * among all IOMMUs. Need to split it later.
3272		 */
3273		ret = iommu_alloc_root_entry(iommu);
3274		if (ret)
3275			goto free_iommu;
3276
3277		if (translation_pre_enabled(iommu)) {
3278			pr_info("Translation already enabled - trying to copy translation structures\n");
3279
3280			ret = copy_translation_tables(iommu);
3281			if (ret) {
3282				/*
3283				 * We found the IOMMU with translation
3284				 * enabled - but failed to copy over the
3285				 * old root-entry table. Try to proceed
3286				 * by disabling translation now and
3287				 * allocating a clean root-entry table.
3288				 * This might cause DMAR faults, but
3289				 * probably the dump will still succeed.
3290				 */
3291				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3292				       iommu->name);
3293				iommu_disable_translation(iommu);
3294				clear_translation_pre_enabled(iommu);
3295			} else {
3296				pr_info("Copied translation tables from previous kernel for %s\n",
3297					iommu->name);
3298			}
3299		}
3300
3301		if (!ecap_pass_through(iommu->ecap))
3302			hw_pass_through = 0;
3303		intel_svm_check(iommu);
3304	}
3305
3306	/*
3307	 * Now that qi is enabled on all iommus, set the root entry and flush
3308	 * caches. This is required on some Intel X58 chipsets, otherwise the
3309	 * flush_context function will loop forever and the boot hangs.
3310	 */
3311	for_each_active_iommu(iommu, drhd) {
3312		iommu_flush_write_buffer(iommu);
3313#ifdef CONFIG_INTEL_IOMMU_SVM
3314		register_pasid_allocator(iommu);
3315#endif
3316		iommu_set_root_entry(iommu);
3317		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3318		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3319	}
3320
3321#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3322	dmar_map_gfx = 0;
3323#endif
3324
3325	if (!dmar_map_gfx)
3326		iommu_identity_mapping |= IDENTMAP_GFX;
3327
3328	check_tylersburg_isoch();
3329
3330	ret = si_domain_init(hw_pass_through);
3331	if (ret)
3332		goto free_iommu;
3333
3334	/*
3335	 * for each drhd
3336	 *   enable fault log
3337	 *   global invalidate context cache
3338	 *   global invalidate iotlb
3339	 *   enable translation
3340	 */
3341	for_each_iommu(iommu, drhd) {
3342		if (drhd->ignored) {
3343			/*
3344			 * we always have to disable PMRs or DMA may fail on
3345			 * this device
3346			 */
3347			if (force_on)
3348				iommu_disable_protect_mem_regions(iommu);
3349			continue;
3350		}
3351
3352		iommu_flush_write_buffer(iommu);
3353
3354#ifdef CONFIG_INTEL_IOMMU_SVM
3355		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3356			/*
3357			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3358			 * could cause a lock race condition.
3359			 */
3360			up_write(&dmar_global_lock);
3361			ret = intel_svm_enable_prq(iommu);
3362			down_write(&dmar_global_lock);
3363			if (ret)
3364				goto free_iommu;
3365		}
3366#endif
3367		ret = dmar_set_interrupt(iommu);
3368		if (ret)
3369			goto free_iommu;
3370	}
3371
3372	return 0;
3373
3374free_iommu:
3375	for_each_active_iommu(iommu, drhd) {
3376		disable_dmar_iommu(iommu);
3377		free_dmar_iommu(iommu);
3378	}
3379
3380	kfree(g_iommus);
3381
3382error:
3383	return ret;
3384}
3385
3386/* This takes a number of _MM_ pages, not VTD pages */
3387static unsigned long intel_alloc_iova(struct device *dev,
3388				     struct dmar_domain *domain,
3389				     unsigned long nrpages, uint64_t dma_mask)
3390{
3391	unsigned long iova_pfn;
3392
3393	/*
3394	 * Restrict dma_mask to the width that the iommu can handle.
3395	 * First-level translation restricts the input-address to a
3396	 * canonical address (i.e., address bits 63:N have the same
3397	 * value as address bit [N-1], where N is 48-bits with 4-level
3398	 * paging and 57-bits with 5-level paging). Hence, skip bit
3399	 * [N-1].
3400	 */
3401	if (domain_use_first_level(domain))
3402		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3403				 dma_mask);
3404	else
3405		dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3406				 dma_mask);
3407
3408	/* Ensure we reserve the whole size-aligned region */
3409	nrpages = __roundup_pow_of_two(nrpages);
3410
3411	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3412		/*
3413		 * First try to allocate an io virtual address in
3414		 * DMA_BIT_MASK(32) and if that fails then try allocating
3415		 * from the higher range.
3416		 */
3417		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3418					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3419		if (iova_pfn)
3420			return iova_pfn;
3421	}
3422	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3423				   IOVA_PFN(dma_mask), true);
3424	if (unlikely(!iova_pfn)) {
3425		dev_err_once(dev, "Allocating %ld-page iova failed\n",
3426			     nrpages);
3427		return 0;
3428	}
3429
3430	return iova_pfn;
3431}
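
/*
 * Illustrative example (not from the original source): with a 48-bit
 * domain (gaw = 48) using first-level translation, the mask is clamped to
 * DOMAIN_MAX_ADDR(47), i.e. just under 2^47, so bit 47 of every allocated
 * IOVA stays clear and the IOVA is already in canonical form for 48-bit
 * (4-level) addressing.
 */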
3432
3433static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3434				     size_t size, int dir, u64 dma_mask)
3435{
3436	struct dmar_domain *domain;
3437	phys_addr_t start_paddr;
3438	unsigned long iova_pfn;
3439	int prot = 0;
3440	int ret;
3441	struct intel_iommu *iommu;
3442	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3443
3444	BUG_ON(dir == DMA_NONE);
3445
3446	if (unlikely(attach_deferred(dev)))
3447		do_deferred_attach(dev);
3448
3449	domain = find_domain(dev);
3450	if (!domain)
3451		return DMA_MAPPING_ERROR;
3452
3453	iommu = domain_get_iommu(domain);
3454	size = aligned_nrpages(paddr, size);
3455
3456	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3457	if (!iova_pfn)
3458		goto error;
3459
3460	/*
3461	 * Check if DMAR supports zero-length reads on write-only
3462	 * mappings.
3463	 */
3464	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3465			!cap_zlr(iommu->cap))
3466		prot |= DMA_PTE_READ;
3467	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3468		prot |= DMA_PTE_WRITE;
3469	/*
3470	 * paddr to (paddr + size) might cover partial pages, so we should map
3471	 * the whole page.  Note: if two parts of one page are separately
3472	 * mapped, we might have two guest addresses mapping to the same host
3473	 * paddr, but this is not a big problem
3474	 */
3475	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3476				 mm_to_dma_pfn(paddr_pfn), size, prot);
3477	if (ret)
3478		goto error;
3479
3480	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3481	start_paddr += paddr & ~PAGE_MASK;
3482
3483	trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3484
3485	return start_paddr;
3486
3487error:
3488	if (iova_pfn)
3489		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3490	dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3491		size, (unsigned long long)paddr, dir);
3492	return DMA_MAPPING_ERROR;
3493}
3494
3495static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3496				 unsigned long offset, size_t size,
3497				 enum dma_data_direction dir,
3498				 unsigned long attrs)
3499{
3500	return __intel_map_single(dev, page_to_phys(page) + offset,
3501				  size, dir, *dev->dma_mask);
3502}
3503
3504static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3505				     size_t size, enum dma_data_direction dir,
3506				     unsigned long attrs)
3507{
3508	return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3509}
3510
3511static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3512{
3513	struct dmar_domain *domain;
3514	unsigned long start_pfn, last_pfn;
3515	unsigned long nrpages;
3516	unsigned long iova_pfn;
3517	struct intel_iommu *iommu;
3518	struct page *freelist;
3519	struct pci_dev *pdev = NULL;
3520
3521	domain = find_domain(dev);
3522	BUG_ON(!domain);
3523
3524	iommu = domain_get_iommu(domain);
3525
3526	iova_pfn = IOVA_PFN(dev_addr);
3527
3528	nrpages = aligned_nrpages(dev_addr, size);
3529	start_pfn = mm_to_dma_pfn(iova_pfn);
3530	last_pfn = start_pfn + nrpages - 1;
3531
3532	if (dev_is_pci(dev))
3533		pdev = to_pci_dev(dev);
3534
3535	freelist = domain_unmap(domain, start_pfn, last_pfn);
3536	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3537			!has_iova_flush_queue(&domain->iovad)) {
3538		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3539				      nrpages, !freelist, 0);
3540		/* free iova */
3541		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3542		dma_free_pagelist(freelist);
3543	} else {
3544		queue_iova(&domain->iovad, iova_pfn, nrpages,
3545			   (unsigned long)freelist);
3546		/*
3547		 * Queue up the release of the unmap to save the roughly 1/6th of
3548		 * the CPU time used up by the iotlb flush operation...
3549		 */
3550	}
3551
3552	trace_unmap_single(dev, dev_addr, size);
3553}
3554
3555static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3556			     size_t size, enum dma_data_direction dir,
3557			     unsigned long attrs)
3558{
3559	intel_unmap(dev, dev_addr, size);
3560}
3561
3562static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3563		size_t size, enum dma_data_direction dir, unsigned long attrs)
3564{
3565	intel_unmap(dev, dev_addr, size);
3566}
3567
3568static void *intel_alloc_coherent(struct device *dev, size_t size,
3569				  dma_addr_t *dma_handle, gfp_t flags,
3570				  unsigned long attrs)
3571{
3572	struct page *page = NULL;
3573	int order;
3574
3575	if (unlikely(attach_deferred(dev)))
3576		do_deferred_attach(dev);
3577
3578	size = PAGE_ALIGN(size);
3579	order = get_order(size);
3580
3581	if (gfpflags_allow_blocking(flags)) {
3582		unsigned int count = size >> PAGE_SHIFT;
3583
3584		page = dma_alloc_from_contiguous(dev, count, order,
3585						 flags & __GFP_NOWARN);
3586	}
3587
3588	if (!page)
3589		page = alloc_pages(flags, order);
3590	if (!page)
3591		return NULL;
3592	memset(page_address(page), 0, size);
3593
3594	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3595					 DMA_BIDIRECTIONAL,
3596					 dev->coherent_dma_mask);
3597	if (*dma_handle != DMA_MAPPING_ERROR)
3598		return page_address(page);
3599	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3600		__free_pages(page, order);
3601
3602	return NULL;
3603}
3604
3605static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3606				dma_addr_t dma_handle, unsigned long attrs)
3607{
3608	int order;
3609	struct page *page = virt_to_page(vaddr);
3610
3611	size = PAGE_ALIGN(size);
3612	order = get_order(size);
3613
3614	intel_unmap(dev, dma_handle, size);
3615	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3616		__free_pages(page, order);
3617}
3618
3619static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3620			   int nelems, enum dma_data_direction dir,
3621			   unsigned long attrs)
3622{
3623	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3624	unsigned long nrpages = 0;
3625	struct scatterlist *sg;
3626	int i;
3627
3628	for_each_sg(sglist, sg, nelems, i) {
3629		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3630	}
3631
3632	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3633
3634	trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3635}
3636
3637static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3638			enum dma_data_direction dir, unsigned long attrs)
3639{
3640	int i;
3641	struct dmar_domain *domain;
3642	size_t size = 0;
3643	int prot = 0;
3644	unsigned long iova_pfn;
3645	int ret;
3646	struct scatterlist *sg;
3647	unsigned long start_vpfn;
3648	struct intel_iommu *iommu;
3649
3650	BUG_ON(dir == DMA_NONE);
3651
3652	if (unlikely(attach_deferred(dev)))
3653		do_deferred_attach(dev);
3654
3655	domain = find_domain(dev);
3656	if (!domain)
3657		return 0;
3658
3659	iommu = domain_get_iommu(domain);
3660
3661	for_each_sg(sglist, sg, nelems, i)
3662		size += aligned_nrpages(sg->offset, sg->length);
3663
3664	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3665				*dev->dma_mask);
3666	if (!iova_pfn) {
3667		sglist->dma_length = 0;
3668		return 0;
3669	}
3670
3671	/*
3672	 * Check if DMAR supports zero-length reads on write-only
3673	 * mappings.
3674	 */
3675	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3676			!cap_zlr(iommu->cap))
3677		prot |= DMA_PTE_READ;
3678	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3679		prot |= DMA_PTE_WRITE;
3680
3681	start_vpfn = mm_to_dma_pfn(iova_pfn);
3682
3683	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3684	if (unlikely(ret)) {
3685		dma_pte_free_pagetable(domain, start_vpfn,
3686				       start_vpfn + size - 1,
3687				       agaw_to_level(domain->agaw) + 1);
3688		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3689		return 0;
3690	}
3691
3692	for_each_sg(sglist, sg, nelems, i)
3693		trace_map_sg(dev, i + 1, nelems, sg);
3694
3695	return nelems;
3696}
3697
3698static u64 intel_get_required_mask(struct device *dev)
3699{
3700	return DMA_BIT_MASK(32);
3701}
3702
3703static const struct dma_map_ops intel_dma_ops = {
3704	.alloc = intel_alloc_coherent,
3705	.free = intel_free_coherent,
3706	.map_sg = intel_map_sg,
3707	.unmap_sg = intel_unmap_sg,
3708	.map_page = intel_map_page,
3709	.unmap_page = intel_unmap_page,
3710	.map_resource = intel_map_resource,
3711	.unmap_resource = intel_unmap_resource,
3712	.dma_supported = dma_direct_supported,
3713	.mmap = dma_common_mmap,
3714	.get_sgtable = dma_common_get_sgtable,
3715	.get_required_mask = intel_get_required_mask,
3716};
3717
3718static void
3719bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3720		   enum dma_data_direction dir, enum dma_sync_target target)
3721{
3722	struct dmar_domain *domain;
3723	phys_addr_t tlb_addr;
3724
3725	domain = find_domain(dev);
3726	if (WARN_ON(!domain))
3727		return;
3728
3729	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3730	if (is_swiotlb_buffer(tlb_addr))
3731		swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3732}
3733
3734static dma_addr_t
3735bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3736		  enum dma_data_direction dir, unsigned long attrs,
3737		  u64 dma_mask)
3738{
3739	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3740	struct dmar_domain *domain;
3741	struct intel_iommu *iommu;
3742	unsigned long iova_pfn;
3743	unsigned long nrpages;
3744	phys_addr_t tlb_addr;
3745	int prot = 0;
3746	int ret;
3747
3748	if (unlikely(attach_deferred(dev)))
3749		do_deferred_attach(dev);
3750
3751	domain = find_domain(dev);
3752
3753	if (WARN_ON(dir == DMA_NONE || !domain))
3754		return DMA_MAPPING_ERROR;
3755
3756	iommu = domain_get_iommu(domain);
3757	if (WARN_ON(!iommu))
3758		return DMA_MAPPING_ERROR;
3759
3760	nrpages = aligned_nrpages(0, size);
3761	iova_pfn = intel_alloc_iova(dev, domain,
3762				    dma_to_mm_pfn(nrpages), dma_mask);
3763	if (!iova_pfn)
3764		return DMA_MAPPING_ERROR;
3765
3766	/*
3767	 * Check if DMAR supports zero-length reads on write-only
3768	 * mappings.
3769	 */
3770	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3771			!cap_zlr(iommu->cap))
3772		prot |= DMA_PTE_READ;
3773	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3774		prot |= DMA_PTE_WRITE;
3775
3776	/*
3777	 * If both the physical buffer start address and size are
3778	 * page aligned, we don't need to use a bounce page.
3779	 */
3780	if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3781		tlb_addr = swiotlb_tbl_map_single(dev,
3782				__phys_to_dma(dev, io_tlb_start),
3783				paddr, size, aligned_size, dir, attrs);
3784		if (tlb_addr == DMA_MAPPING_ERROR) {
3785			goto swiotlb_error;
3786		} else {
3787			/* Cleanup the padding area. */
3788			void *padding_start = phys_to_virt(tlb_addr);
3789			size_t padding_size = aligned_size;
3790
3791			if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3792			    (dir == DMA_TO_DEVICE ||
3793			     dir == DMA_BIDIRECTIONAL)) {
3794				padding_start += size;
3795				padding_size -= size;
3796			}
3797
3798			memset(padding_start, 0, padding_size);
3799		}
3800	} else {
3801		tlb_addr = paddr;
3802	}
3803
3804	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3805				 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3806	if (ret)
3807		goto mapping_error;
3808
3809	trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3810
3811	return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3812
3813mapping_error:
3814	if (is_swiotlb_buffer(tlb_addr))
3815		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3816					 aligned_size, dir, attrs);
3817swiotlb_error:
3818	free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3819	dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3820		size, (unsigned long long)paddr, dir);
3821
3822	return DMA_MAPPING_ERROR;
3823}
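
/*
 * Illustrative example (not from the original source, assuming
 * VTD_PAGE_SIZE is 4KiB): bouncing a 0x100-byte buffer that starts at an
 * unaligned physical address uses aligned_size = 0x1000, copies the data
 * into a swiotlb slot, zeroes the padding area (the 0xf00 bytes beyond
 * the copied data for DMA_TO_DEVICE / DMA_BIDIRECTIONAL) and maps that
 * whole page through the IOMMU; an already page-aligned, page-sized
 * buffer skips the bounce and maps the original physical page directly.
 */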
3824
3825static void
3826bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3827		    enum dma_data_direction dir, unsigned long attrs)
3828{
3829	size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3830	struct dmar_domain *domain;
3831	phys_addr_t tlb_addr;
3832
3833	domain = find_domain(dev);
3834	if (WARN_ON(!domain))
3835		return;
3836
3837	tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3838	if (WARN_ON(!tlb_addr))
3839		return;
3840
3841	intel_unmap(dev, dev_addr, size);
3842	if (is_swiotlb_buffer(tlb_addr))
3843		swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3844					 aligned_size, dir, attrs);
3845
3846	trace_bounce_unmap_single(dev, dev_addr, size);
3847}
3848
3849static dma_addr_t
3850bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3851		size_t size, enum dma_data_direction dir, unsigned long attrs)
3852{
3853	return bounce_map_single(dev, page_to_phys(page) + offset,
3854				 size, dir, attrs, *dev->dma_mask);
3855}
3856
3857static dma_addr_t
3858bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3859		    enum dma_data_direction dir, unsigned long attrs)
3860{
3861	return bounce_map_single(dev, phys_addr, size,
3862				 dir, attrs, *dev->dma_mask);
3863}
3864
3865static void
3866bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3867		  enum dma_data_direction dir, unsigned long attrs)
3868{
3869	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3870}
3871
3872static void
3873bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3874		      enum dma_data_direction dir, unsigned long attrs)
3875{
3876	bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3877}
3878
3879static void
3880bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3881		enum dma_data_direction dir, unsigned long attrs)
3882{
3883	struct scatterlist *sg;
3884	int i;
3885
3886	for_each_sg(sglist, sg, nelems, i)
3887		bounce_unmap_page(dev, sg->dma_address,
3888				  sg_dma_len(sg), dir, attrs);
3889}
3890
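    /*
     * Map a scatterlist by bouncing each entry through bounce_map_page().
     * If mapping fails part way through, the entries already mapped are
     * unwound via bounce_unmap_sg() (with DMA_ATTR_SKIP_CPU_SYNC, since
     * the device has not touched them yet) and 0 is returned, as the DMA
     * API expects on failure.
     */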
3891static int
3892bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3893	      enum dma_data_direction dir, unsigned long attrs)
3894{
3895	int i;
3896	struct scatterlist *sg;
3897
3898	for_each_sg(sglist, sg, nelems, i) {
3899		sg->dma_address = bounce_map_page(dev, sg_page(sg),
3900						  sg->offset, sg->length,
3901						  dir, attrs);
3902		if (sg->dma_address == DMA_MAPPING_ERROR)
3903			goto out_unmap;
3904		sg_dma_len(sg) = sg->length;
3905	}
3906
3907	for_each_sg(sglist, sg, nelems, i)
3908		trace_bounce_map_sg(dev, i + 1, nelems, sg);
3909
3910	return nelems;
3911
3912out_unmap:
3913	bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3914	return 0;
3915}
3916
3917static void
3918bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3919			   size_t size, enum dma_data_direction dir)
3920{
3921	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3922}
3923
3924static void
3925bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3926			      size_t size, enum dma_data_direction dir)
3927{
3928	bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3929}
3930
3931static void
3932bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3933		       int nelems, enum dma_data_direction dir)
3934{
3935	struct scatterlist *sg;
3936	int i;
3937
3938	for_each_sg(sglist, sg, nelems, i)
3939		bounce_sync_single(dev, sg_dma_address(sg),
3940				   sg_dma_len(sg), dir, SYNC_FOR_CPU);
3941}
3942
3943static void
3944bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3945			  int nelems, enum dma_data_direction dir)
3946{
3947	struct scatterlist *sg;
3948	int i;
3949
3950	for_each_sg(sglist, sg, nelems, i)
3951		bounce_sync_single(dev, sg_dma_address(sg),
3952				   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3953}
3954
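    /*
     * DMA ops for devices that need bounce buffering. They are installed
     * per device in intel_iommu_probe_finalize() when device_needs_bounce()
     * is true. Coherent allocations still go through the regular
     * intel_alloc_coherent()/intel_free_coherent() paths, presumably
     * because those allocations are already page aligned.
     */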
3955static const struct dma_map_ops bounce_dma_ops = {
3956	.alloc			= intel_alloc_coherent,
3957	.free			= intel_free_coherent,
3958	.map_sg			= bounce_map_sg,
3959	.unmap_sg		= bounce_unmap_sg,
3960	.map_page		= bounce_map_page,
3961	.unmap_page		= bounce_unmap_page,
3962	.sync_single_for_cpu	= bounce_sync_single_for_cpu,
3963	.sync_single_for_device	= bounce_sync_single_for_device,
3964	.sync_sg_for_cpu	= bounce_sync_sg_for_cpu,
3965	.sync_sg_for_device	= bounce_sync_sg_for_device,
3966	.map_resource		= bounce_map_resource,
3967	.unmap_resource		= bounce_unmap_resource,
3968	.dma_supported		= dma_direct_supported,
3969};
3970
3971static inline int iommu_domain_cache_init(void)
3972{
3973	int ret = 0;
3974
3975	iommu_domain_cache = kmem_cache_create("iommu_domain",
3976					 sizeof(struct dmar_domain),
3977					 0,
3978					 SLAB_HWCACHE_ALIGN,
3980					 NULL);
3981	if (!iommu_domain_cache) {
3982		pr_err("Couldn't create iommu_domain cache\n");
3983		ret = -ENOMEM;
3984	}
3985
3986	return ret;
3987}
3988
3989static inline int iommu_devinfo_cache_init(void)
3990{
3991	int ret = 0;
3992
3993	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3994					 sizeof(struct device_domain_info),
3995					 0,
3996					 SLAB_HWCACHE_ALIGN,
3997					 NULL);
3998	if (!iommu_devinfo_cache) {
3999		pr_err("Couldn't create devinfo cache\n");
4000		ret = -ENOMEM;
4001	}
4002
4003	return ret;
4004}
4005
4006static int __init iommu_init_mempool(void)
4007{
4008	int ret;
4009	ret = iova_cache_get();
4010	if (ret)
4011		return ret;
4012
4013	ret = iommu_domain_cache_init();
4014	if (ret)
4015		goto domain_error;
4016
4017	ret = iommu_devinfo_cache_init();
4018	if (!ret)
4019		return ret;
4020
4021	kmem_cache_destroy(iommu_domain_cache);
4022domain_error:
4023	iova_cache_put();
4024
4025	return -ENOMEM;
4026}
4027
4028static void __init iommu_exit_mempool(void)
4029{
4030	kmem_cache_destroy(iommu_devinfo_cache);
4031	kmem_cache_destroy(iommu_domain_cache);
4032	iova_cache_put();
4033}
4034
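    /*
     * Mark DRHD units that can be skipped: units whose device scope lists
     * no active devices at all, and units that cover nothing but graphics
     * devices when gfx mapping is disabled. Units marked as ignored are
     * not set up for DMA remapping later on.
     */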
4035static void __init init_no_remapping_devices(void)
4036{
4037	struct dmar_drhd_unit *drhd;
4038	struct device *dev;
4039	int i;
4040
4041	for_each_drhd_unit(drhd) {
4042		if (!drhd->include_all) {
4043			for_each_active_dev_scope(drhd->devices,
4044						  drhd->devices_cnt, i, dev)
4045				break;
4046			/* ignore DMAR unit if no devices exist */
4047			if (i == drhd->devices_cnt)
4048				drhd->ignored = 1;
4049		}
4050	}
4051
4052	for_each_active_drhd_unit(drhd) {
4053		if (drhd->include_all)
4054			continue;
4055
4056		for_each_active_dev_scope(drhd->devices,
4057					  drhd->devices_cnt, i, dev)
4058			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4059				break;
4060		if (i < drhd->devices_cnt)
4061			continue;
4062
4063		/* This IOMMU has *only* gfx devices. Mark it as gfx-dedicated,
4064		   and ignore it entirely if gfx mapping is disabled. */
4065		drhd->gfx_dedicated = 1;
4066		if (!dmar_map_gfx)
4067			drhd->ignored = 1;
4068	}
4069}
4070
4071#ifdef CONFIG_SUSPEND
4072static int init_iommu_hw(void)
4073{
4074	struct dmar_drhd_unit *drhd;
4075	struct intel_iommu *iommu = NULL;
4076
4077	for_each_active_iommu(iommu, drhd)
4078		if (iommu->qi)
4079			dmar_reenable_qi(iommu);
4080
4081	for_each_iommu(iommu, drhd) {
4082		if (drhd->ignored) {
4083			/*
4084			 * we always have to disable PMRs or DMA may fail on
4085			 * this device
4086			 */
4087			if (force_on)
4088				iommu_disable_protect_mem_regions(iommu);
4089			continue;
4090		}
4091
4092		iommu_flush_write_buffer(iommu);
4093
4094		iommu_set_root_entry(iommu);
4095
4096		iommu->flush.flush_context(iommu, 0, 0, 0,
4097					   DMA_CCMD_GLOBAL_INVL);
4098		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4099		iommu_enable_translation(iommu);
4100		iommu_disable_protect_mem_regions(iommu);
4101	}
4102
4103	return 0;
4104}
4105
4106static void iommu_flush_all(void)
4107{
4108	struct dmar_drhd_unit *drhd;
4109	struct intel_iommu *iommu;
4110
4111	for_each_active_iommu(iommu, drhd) {
4112		iommu->flush.flush_context(iommu, 0, 0, 0,
4113					   DMA_CCMD_GLOBAL_INVL);
4114		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4115					 DMA_TLB_GLOBAL_FLUSH);
4116	}
4117}
4118
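    /*
     * Suspend path: flush all caches, disable translation and save the
     * fault event registers (FECTL/FEDATA/FEADDR/FEUADDR) of every active
     * IOMMU; iommu_resume() re-initializes the hardware via init_iommu_hw()
     * and then writes those registers back.
     */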
4119static int iommu_suspend(void)
4120{
4121	struct dmar_drhd_unit *drhd;
4122	struct intel_iommu *iommu = NULL;
4123	unsigned long flag;
4124
4125	for_each_active_iommu(iommu, drhd) {
4126		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4127						 GFP_ATOMIC);
4128		if (!iommu->iommu_state)
4129			goto nomem;
4130	}
4131
4132	iommu_flush_all();
4133
4134	for_each_active_iommu(iommu, drhd) {
4135		iommu_disable_translation(iommu);
4136
4137		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4138
4139		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4140			readl(iommu->reg + DMAR_FECTL_REG);
4141		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4142			readl(iommu->reg + DMAR_FEDATA_REG);
4143		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4144			readl(iommu->reg + DMAR_FEADDR_REG);
4145		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4146			readl(iommu->reg + DMAR_FEUADDR_REG);
4147
4148		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4149	}
4150	return 0;
4151
4152nomem:
4153	for_each_active_iommu(iommu, drhd)
4154		kfree(iommu->iommu_state);
4155
4156	return -ENOMEM;
4157}
4158
4159static void iommu_resume(void)
4160{
4161	struct dmar_drhd_unit *drhd;
4162	struct intel_iommu *iommu = NULL;
4163	unsigned long flag;
4164
4165	if (init_iommu_hw()) {
4166		if (force_on)
4167			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4168		else
4169			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4170		return;
4171	}
4172
4173	for_each_active_iommu(iommu, drhd) {
4175		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4176
4177		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4178			iommu->reg + DMAR_FECTL_REG);
4179		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4180			iommu->reg + DMAR_FEDATA_REG);
4181		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4182			iommu->reg + DMAR_FEADDR_REG);
4183		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4184			iommu->reg + DMAR_FEUADDR_REG);
4185
4186		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4187	}
4188
4189	for_each_active_iommu(iommu, drhd)
4190		kfree(iommu->iommu_state);
4191}
4192
4193static struct syscore_ops iommu_syscore_ops = {
4194	.resume		= iommu_resume,
4195	.suspend	= iommu_suspend,
4196};
4197
4198static void __init init_iommu_pm_ops(void)
4199{
4200	register_syscore_ops(&iommu_syscore_ops);
4201}
4202
4203#else
4204static inline void init_iommu_pm_ops(void) {}
4205#endif	/* CONFIG_SUSPEND */
4206
4207static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4208{
4209	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4210	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4211	    rmrr->end_address <= rmrr->base_address ||
4212	    arch_rmrr_sanity_check(rmrr))
4213		return -EINVAL;
4214
4215	return 0;
4216}
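    /*
     * Example with hypothetical firmware values, assuming
     * arch_rmrr_sanity_check() also passes: an RMRR of
     * [0xe0000, 0xeffff] is accepted (both ends page aligned, end above
     * base), while [0xe0000, 0xe07ff] is rejected because
     * end_address + 1 is not page aligned.
     */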
4217
4218int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4219{
4220	struct acpi_dmar_reserved_memory *rmrr;
4221	struct dmar_rmrr_unit *rmrru;
4222
4223	rmrr = (struct acpi_dmar_reserved_memory *)header;
4224	if (rmrr_sanity_check(rmrr)) {
4225		pr_warn(FW_BUG
4226			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4227			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4228			   rmrr->base_address, rmrr->end_address,
4229			   dmi_get_system_info(DMI_BIOS_VENDOR),
4230			   dmi_get_system_info(DMI_BIOS_VERSION),
4231			   dmi_get_system_info(DMI_PRODUCT_VERSION));
4232		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4233	}
4234
4235	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4236	if (!rmrru)
4237		goto out;
4238
4239	rmrru->hdr = header;
4240
4241	rmrru->base_address = rmrr->base_address;
4242	rmrru->end_address = rmrr->end_address;
4243
4244	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4245				((void *)rmrr) + rmrr->header.length,
4246				&rmrru->devices_cnt);
4247	if (rmrru->devices_cnt && rmrru->devices == NULL)
4248		goto free_rmrru;
4249
4250	list_add(&rmrru->list, &dmar_rmrr_units);
4251
4252	return 0;
4253free_rmrru:
4254	kfree(rmrru);
4255out:
4256	return -ENOMEM;
4257}
4258
4259static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4260{
4261	struct dmar_atsr_unit *atsru;
4262	struct acpi_dmar_atsr *tmp;
4263
4264	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4265				dmar_rcu_check()) {
4266		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4267		if (atsr->segment != tmp->segment)
4268			continue;
4269		if (atsr->header.length != tmp->header.length)
4270			continue;
4271		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4272			return atsru;
4273	}
4274
4275	return NULL;
4276}
4277
4278int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4279{
4280	struct acpi_dmar_atsr *atsr;
4281	struct dmar_atsr_unit *atsru;
4282
4283	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4284		return 0;
4285
4286	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4287	atsru = dmar_find_atsr(atsr);
4288	if (atsru)
4289		return 0;
4290
4291	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4292	if (!atsru)
4293		return -ENOMEM;
4294
4295	/*
4296	 * If memory is allocated from slab by ACPI _DSM method, we need to
4297	 * copy the memory content because the memory buffer will be freed
4298	 * on return.
4299	 */
4300	atsru->hdr = (void *)(atsru + 1);
4301	memcpy(atsru->hdr, hdr, hdr->length);
4302	atsru->include_all = atsr->flags & 0x1;
4303	if (!atsru->include_all) {
4304		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4305				(void *)atsr + atsr->header.length,
4306				&atsru->devices_cnt);
4307		if (atsru->devices_cnt && atsru->devices == NULL) {
4308			kfree(atsru);
4309			return -ENOMEM;
4310		}
4311	}
4312
4313	list_add_rcu(&atsru->list, &dmar_atsr_units);
4314
4315	return 0;
4316}
4317
4318static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4319{
4320	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4321	kfree(atsru);
4322}
4323
4324int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4325{
4326	struct acpi_dmar_atsr *atsr;
4327	struct dmar_atsr_unit *atsru;
4328
4329	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4330	atsru = dmar_find_atsr(atsr);
4331	if (atsru) {
4332		list_del_rcu(&atsru->list);
4333		synchronize_rcu();
4334		intel_iommu_free_atsr(atsru);
4335	}
4336
4337	return 0;
4338}
4339
4340int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4341{
4342	int i;
4343	struct device *dev;
4344	struct acpi_dmar_atsr *atsr;
4345	struct dmar_atsr_unit *atsru;
4346
4347	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4348	atsru = dmar_find_atsr(atsr);
4349	if (!atsru)
4350		return 0;
4351
4352	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4353		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4354					  i, dev)
4355			return -EBUSY;
4356	}
4357
4358	return 0;
4359}
4360
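    /*
     * Bring up a hot-added DMAR unit: check that it offers the features
     * the running configuration relies on (pass-through, snoop control,
     * supported superpage sizes), allocate its domain ID space and root
     * entry, then program the root pointer, flush the context and IOTLB
     * caches and enable translation, mirroring what init_dmars() does at
     * boot time.
     */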
4361static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4362{
4363	int sp, ret;
4364	struct intel_iommu *iommu = dmaru->iommu;
4365
4366	if (g_iommus[iommu->seq_id])
4367		return 0;
4368
4369	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4370		pr_warn("%s: Doesn't support hardware pass through.\n",
4371			iommu->name);
4372		return -ENXIO;
4373	}
4374	if (!ecap_sc_support(iommu->ecap) &&
4375	    domain_update_iommu_snooping(iommu)) {
4376		pr_warn("%s: Doesn't support snooping.\n",
4377			iommu->name);
4378		return -ENXIO;
4379	}
4380	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4381	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4382		pr_warn("%s: Doesn't support large page.\n",
4383			iommu->name);
4384		return -ENXIO;
4385	}
4386
4387	/*
4388	 * Disable translation if already enabled prior to OS handover.
4389	 */
4390	if (iommu->gcmd & DMA_GCMD_TE)
4391		iommu_disable_translation(iommu);
4392
4393	g_iommus[iommu->seq_id] = iommu;
4394	ret = iommu_init_domains(iommu);
4395	if (ret == 0)
4396		ret = iommu_alloc_root_entry(iommu);
4397	if (ret)
4398		goto out;
4399
4400	intel_svm_check(iommu);
4401
4402	if (dmaru->ignored) {
4403		/*
4404		 * we always have to disable PMRs or DMA may fail on this device
4405		 */
4406		if (force_on)
4407			iommu_disable_protect_mem_regions(iommu);
4408		return 0;
4409	}
4410
4411	intel_iommu_init_qi(iommu);
4412	iommu_flush_write_buffer(iommu);
4413
4414#ifdef CONFIG_INTEL_IOMMU_SVM
4415	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4416		ret = intel_svm_enable_prq(iommu);
4417		if (ret)
4418			goto disable_iommu;
4419	}
4420#endif
4421	ret = dmar_set_interrupt(iommu);
4422	if (ret)
4423		goto disable_iommu;
4424
4425	iommu_set_root_entry(iommu);
4426	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4427	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4428	iommu_enable_translation(iommu);
4429
4430	iommu_disable_protect_mem_regions(iommu);
4431	return 0;
4432
4433disable_iommu:
4434	disable_dmar_iommu(iommu);
4435out:
4436	free_dmar_iommu(iommu);
4437	return ret;
4438}
4439
4440int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4441{
4442	int ret = 0;
4443	struct intel_iommu *iommu = dmaru->iommu;
4444
4445	if (!intel_iommu_enabled)
4446		return 0;
4447	if (iommu == NULL)
4448		return -EINVAL;
4449
4450	if (insert) {
4451		ret = intel_iommu_add(dmaru);
4452	} else {
4453		disable_dmar_iommu(iommu);
4454		free_dmar_iommu(iommu);
4455	}
4456
4457	return ret;
4458}
4459
4460static void intel_iommu_free_dmars(void)
4461{
4462	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4463	struct dmar_atsr_unit *atsru, *atsr_n;
4464
4465	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4466		list_del(&rmrru->list);
4467		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4468		kfree(rmrru);
4469	}
4470
4471	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4472		list_del(&atsru->list);
4473		intel_iommu_free_atsr(atsru);
4474	}
4475}
4476
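    /*
     * Decide whether ATS may be used for @dev: walk up to the PCIe root
     * port above the device and check whether that root port is listed in
     * an ATSR structure for the device's PCI segment (or the ATSR is
     * marked include_all). Returns 1 if ATS is allowed, 0 otherwise.
     */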
4477int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4478{
4479	int i, ret = 1;
4480	struct pci_bus *bus;
4481	struct pci_dev *bridge = NULL;
4482	struct device *tmp;
4483	struct acpi_dmar_atsr *atsr;
4484	struct dmar_atsr_unit *atsru;
4485
4486	dev = pci_physfn(dev);
4487	for (bus = dev->bus; bus; bus = bus->parent) {
4488		bridge = bus->self;
4489		/* If it's an integrated device, allow ATS */
4490		if (!bridge)
4491			return 1;
4492		/* Connected via non-PCIe: no ATS */
4493		if (!pci_is_pcie(bridge) ||
4494		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4495			return 0;
4496		/* If we found the root port, look it up in the ATSR */
4497		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4498			break;
4499	}
4500
4501	rcu_read_lock();
4502	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4503		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4504		if (atsr->segment != pci_domain_nr(dev->bus))
4505			continue;
4506
4507		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4508			if (tmp == &bridge->dev)
4509				goto out;
4510
4511		if (atsru->include_all)
4512			goto out;
4513	}
4514	ret = 0;
4515out:
4516	rcu_read_unlock();
4517
4518	return ret;
4519}
4520
4521int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4522{
4523	int ret;
4524	struct dmar_rmrr_unit *rmrru;
4525	struct dmar_atsr_unit *atsru;
4526	struct acpi_dmar_atsr *atsr;
4527	struct acpi_dmar_reserved_memory *rmrr;
4528
4529	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4530		return 0;
4531
4532	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4533		rmrr = container_of(rmrru->hdr,
4534				    struct acpi_dmar_reserved_memory, header);
4535		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4536			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4537				((void *)rmrr) + rmrr->header.length,
4538				rmrr->segment, rmrru->devices,
4539				rmrru->devices_cnt);
4540			if (ret < 0)
4541				return ret;
4542		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4543			dmar_remove_dev_scope(info, rmrr->segment,
4544				rmrru->devices, rmrru->devices_cnt);
4545		}
4546	}
4547
4548	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4549		if (atsru->include_all)
4550			continue;
4551
4552		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4553		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4554			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4555					(void *)atsr + atsr->header.length,
4556					atsr->segment, atsru->devices,
4557					atsru->devices_cnt);
4558			if (ret > 0)
4559				break;
4560			else if (ret < 0)
4561				return ret;
4562		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4563			if (dmar_remove_dev_scope(info, atsr->segment,
4564					atsru->devices, atsru->devices_cnt))
4565				break;
4566		}
4567	}
4568
4569	return 0;
4570}
4571
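    /*
     * Memory hotplug handling for the static identity domain: ranges going
     * online are added to the identity map so devices using pass-through
     * can DMA to them; ranges going offline are unmapped, the IOTLBs of
     * all active IOMMUs are flushed and the page-table pages are freed.
     */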
4572static int intel_iommu_memory_notifier(struct notifier_block *nb,
4573				       unsigned long val, void *v)
4574{
4575	struct memory_notify *mhp = v;
4576	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4577	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4578			mhp->nr_pages - 1);
4579
4580	switch (val) {
4581	case MEM_GOING_ONLINE:
4582		if (iommu_domain_identity_map(si_domain,
4583					      start_vpfn, last_vpfn)) {
4584			pr_warn("Failed to build identity map for [%lx-%lx]\n",
4585				start_vpfn, last_vpfn);
4586			return NOTIFY_BAD;
4587		}
4588		break;
4589
4590	case MEM_OFFLINE:
4591	case MEM_CANCEL_ONLINE:
4592		{
4593			struct dmar_drhd_unit *drhd;
4594			struct intel_iommu *iommu;
4595			struct page *freelist;
4596
4597			freelist = domain_unmap(si_domain,
4598						start_vpfn, last_vpfn);
4599
4600			rcu_read_lock();
4601			for_each_active_iommu(iommu, drhd)
4602				iommu_flush_iotlb_psi(iommu, si_domain,
4603					start_vpfn, mhp->nr_pages,
4604					!freelist, 0);
4605			rcu_read_unlock();
4606			dma_free_pagelist(freelist);
4607		}
4608		break;
4609	}
4610
4611	return NOTIFY_OK;
4612}
4613
4614static struct notifier_block intel_iommu_memory_nb = {
4615	.notifier_call = intel_iommu_memory_notifier,
4616	.priority = 0
4617};
4618
4619static void free_all_cpu_cached_iovas(unsigned int cpu)
4620{
4621	int i;
4622
4623	for (i = 0; i < g_num_of_iommus; i++) {
4624		struct intel_iommu *iommu = g_iommus[i];
4625		struct dmar_domain *domain;
4626		int did;
4627
4628		if (!iommu)
4629			continue;
4630
4631		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4632			domain = get_iommu_domain(iommu, (u16)did);
4633
4634			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4635				continue;
4636
4637			free_cpu_cached_iovas(cpu, &domain->iovad);
4638		}
4639	}
4640}
4641
4642static int intel_iommu_cpu_dead(unsigned int cpu)
4643{
4644	free_all_cpu_cached_iovas(cpu);
4645	return 0;
4646}
4647
4648static void intel_disable_iommus(void)
4649{
4650	struct intel_iommu *iommu = NULL;
4651	struct dmar_drhd_unit *drhd;
4652
4653	for_each_iommu(iommu, drhd)
4654		iommu_disable_translation(iommu);
4655}
4656
4657void intel_iommu_shutdown(void)
4658{
4659	struct dmar_drhd_unit *drhd;
4660	struct intel_iommu *iommu = NULL;
4661
4662	if (no_iommu || dmar_disabled)
4663		return;
4664
4665	down_write(&dmar_global_lock);
4666
4667	/* Disable PMRs explicitly here. */
4668	for_each_iommu(iommu, drhd)
4669		iommu_disable_protect_mem_regions(iommu);
4670
4671	/* Make sure the IOMMUs are switched off */
4672	intel_disable_iommus();
4673
4674	up_write(&dmar_global_lock);
4675}
4676
4677static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4678{
4679	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4680
4681	return container_of(iommu_dev, struct intel_iommu, iommu);
4682}
4683
4684static ssize_t intel_iommu_show_version(struct device *dev,
4685					struct device_attribute *attr,
4686					char *buf)
4687{
4688	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4689	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4690	return sprintf(buf, "%d:%d\n",
4691		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4692}
4693static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4694
4695static ssize_t intel_iommu_show_address(struct device *dev,
4696					struct device_attribute *attr,
4697					char *buf)
4698{
4699	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4700	return sprintf(buf, "%llx\n", iommu->reg_phys);
4701}
4702static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4703
4704static ssize_t intel_iommu_show_cap(struct device *dev,
4705				    struct device_attribute *attr,
4706				    char *buf)
4707{
4708	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4709	return sprintf(buf, "%llx\n", iommu->cap);
4710}
4711static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4712
4713static ssize_t intel_iommu_show_ecap(struct device *dev,
4714				    struct device_attribute *attr,
4715				    char *buf)
4716{
4717	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718	return sprintf(buf, "%llx\n", iommu->ecap);
4719}
4720static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4721
4722static ssize_t intel_iommu_show_ndoms(struct device *dev,
4723				      struct device_attribute *attr,
4724				      char *buf)
4725{
4726	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4728}
4729static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4730
4731static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4732					   struct device_attribute *attr,
4733					   char *buf)
4734{
4735	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4736	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4737						  cap_ndoms(iommu->cap)));
4738}
4739static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4740
4741static struct attribute *intel_iommu_attrs[] = {
4742	&dev_attr_version.attr,
4743	&dev_attr_address.attr,
4744	&dev_attr_cap.attr,
4745	&dev_attr_ecap.attr,
4746	&dev_attr_domains_supported.attr,
4747	&dev_attr_domains_used.attr,
4748	NULL,
4749};
4750
4751static struct attribute_group intel_iommu_group = {
4752	.name = "intel-iommu",
4753	.attrs = intel_iommu_attrs,
4754};
4755
4756const struct attribute_group *intel_iommu_groups[] = {
4757	&intel_iommu_group,
4758	NULL,
4759};
4760
4761static inline bool has_external_pci(void)
4762{
4763	struct pci_dev *pdev = NULL;
4764
4765	for_each_pci_dev(pdev)
4766		if (pdev->external_facing)
4767			return true;
4768
4769	return false;
4770}
4771
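    /*
     * If the platform opts in to DMA protection through the DMAR table and
     * at least one external-facing PCI port is present, force the IOMMU on
     * even when it was disabled on the command line; if it had been
     * disabled, default to identity maps for all devices except those
     * marked as untrusted.
     */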
4772static int __init platform_optin_force_iommu(void)
4773{
4774	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4775		return 0;
4776
4777	if (no_iommu || dmar_disabled)
4778		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4779
4780	/*
4781	 * If Intel-IOMMU is disabled by default, we will apply identity
4782	 * map for all devices except those marked as being untrusted.
4783	 */
4784	if (dmar_disabled)
4785		iommu_set_default_passthrough(false);
4786
4787	dmar_disabled = 0;
4788	no_iommu = 0;
4789
4790	return 1;
4791}
4792
4793static int __init probe_acpi_namespace_devices(void)
4794{
4795	struct dmar_drhd_unit *drhd;
4796	/* To avoid a -Wunused-but-set-variable warning. */
4797	struct intel_iommu *iommu __maybe_unused;
4798	struct device *dev;
4799	int i, ret = 0;
4800
4801	for_each_active_iommu(iommu, drhd) {
4802		for_each_active_dev_scope(drhd->devices,
4803					  drhd->devices_cnt, i, dev) {
4804			struct acpi_device_physical_node *pn;
4805			struct iommu_group *group;
4806			struct acpi_device *adev;
4807
4808			if (dev->bus != &acpi_bus_type)
4809				continue;
4810
4811			adev = to_acpi_device(dev);
4812			mutex_lock(&adev->physical_node_lock);
4813			list_for_each_entry(pn,
4814					    &adev->physical_node_list, node) {
4815				group = iommu_group_get(pn->dev);
4816				if (group) {
4817					iommu_group_put(group);
4818					continue;
4819				}
4820
4821				pn->dev->bus->iommu_ops = &intel_iommu_ops;
4822				ret = iommu_probe_device(pn->dev);
4823				if (ret)
4824					break;
4825			}
4826			mutex_unlock(&adev->physical_node_lock);
4827
4828			if (ret)
4829				return ret;
4830		}
4831	}
4832
4833	return 0;
4834}
4835
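    /*
     * Top level initialization: parse the DMAR table and device scopes,
     * bail out early (still disabling PMRs and translation) when the IOMMU
     * is disabled, otherwise reserve special IOVA ranges, initialize each
     * unit through init_dmars(), register sysfs entries, notifiers and the
     * PCI bus iommu_ops, and finally enable translation on units that were
     * not already enabled before handover.
     */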
4836int __init intel_iommu_init(void)
4837{
4838	int ret = -ENODEV;
4839	struct dmar_drhd_unit *drhd;
4840	struct intel_iommu *iommu;
4841
4842	/*
4843	 * Intel IOMMU is required for a TXT/tboot launch or platform
4844	 * opt in, so enforce that.
4845	 */
4846	force_on = tboot_force_iommu() || platform_optin_force_iommu();
4847
4848	if (iommu_init_mempool()) {
4849		if (force_on)
4850			panic("tboot: Failed to initialize iommu memory\n");
4851		return -ENOMEM;
4852	}
4853
4854	down_write(&dmar_global_lock);
4855	if (dmar_table_init()) {
4856		if (force_on)
4857			panic("tboot: Failed to initialize DMAR table\n");
4858		goto out_free_dmar;
4859	}
4860
4861	if (dmar_dev_scope_init() < 0) {
4862		if (force_on)
4863			panic("tboot: Failed to initialize DMAR device scope\n");
4864		goto out_free_dmar;
4865	}
4866
4867	up_write(&dmar_global_lock);
4868
4869	/*
4870	 * The bus notifier takes the dmar_global_lock, so lockdep will
4871	 * complain later when we register it under the lock.
4872	 */
4873	dmar_register_bus_notifier();
4874
4875	down_write(&dmar_global_lock);
4876
4877	if (!no_iommu)
4878		intel_iommu_debugfs_init();
4879
4880	if (no_iommu || dmar_disabled) {
4881		/*
4882		 * We exit the function here to ensure IOMMU's remapping and
4883		 * mempool aren't setup, which means that the IOMMU's PMRs
4884		 * won't be disabled via the call to init_dmars(). So disable
4885		 * it explicitly here. The PMRs were setup by tboot prior to
4886		 * calling SENTER, but the kernel is expected to reset/tear
4887		 * down the PMRs.
4888		 */
4889		if (intel_iommu_tboot_noforce) {
4890			for_each_iommu(iommu, drhd)
4891				iommu_disable_protect_mem_regions(iommu);
4892		}
4893
4894		/*
4895		 * Make sure the IOMMUs are switched off, even when we
4896		 * boot into a kexec kernel and the previous kernel left
4897		 * them enabled
4898		 */
4899		intel_disable_iommus();
4900		goto out_free_dmar;
4901	}
4902
4903	if (list_empty(&dmar_rmrr_units))
4904		pr_info("No RMRR found\n");
4905
4906	if (list_empty(&dmar_atsr_units))
4907		pr_info("No ATSR found\n");
4908
4909	if (dmar_init_reserved_ranges()) {
4910		if (force_on)
4911			panic("tboot: Failed to reserve iommu ranges\n");
4912		goto out_free_reserved_range;
4913	}
4914
4915	if (dmar_map_gfx)
4916		intel_iommu_gfx_mapped = 1;
4917
4918	init_no_remapping_devices();
4919
4920	ret = init_dmars();
4921	if (ret) {
4922		if (force_on)
4923			panic("tboot: Failed to initialize DMARs\n");
4924		pr_err("Initialization failed\n");
4925		goto out_free_reserved_range;
4926	}
4927	up_write(&dmar_global_lock);
4928
4929	init_iommu_pm_ops();
4930
4931	down_read(&dmar_global_lock);
4932	for_each_active_iommu(iommu, drhd) {
4933		iommu_device_sysfs_add(&iommu->iommu, NULL,
4934				       intel_iommu_groups,
4935				       "%s", iommu->name);
4936		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4937		iommu_device_register(&iommu->iommu);
4938	}
4939	up_read(&dmar_global_lock);
4940
4941	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4942	if (si_domain && !hw_pass_through)
4943		register_memory_notifier(&intel_iommu_memory_nb);
4944	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4945			  intel_iommu_cpu_dead);
4946
4947	down_read(&dmar_global_lock);
4948	if (probe_acpi_namespace_devices())
4949		pr_warn("ACPI name space devices didn't probe correctly\n");
4950
4951	/* Finally, we enable the DMA remapping hardware. */
4952	for_each_iommu(iommu, drhd) {
4953		if (!drhd->ignored && !translation_pre_enabled(iommu))
4954			iommu_enable_translation(iommu);
4955
4956		iommu_disable_protect_mem_regions(iommu);
4957	}
4958	up_read(&dmar_global_lock);
4959
4960	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4961
4962	intel_iommu_enabled = 1;
4963
4964	return 0;
4965
4966out_free_reserved_range:
4967	put_iova_domain(&reserved_iova_list);
4968out_free_dmar:
4969	intel_iommu_free_dmars();
4970	up_write(&dmar_global_lock);
4971	iommu_exit_mempool();
4972	return ret;
4973}
4974
4975static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4976{
4977	struct intel_iommu *iommu = opaque;
4978
4979	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4980	return 0;
4981}
4982
4983/*
4984 * NB - intel-iommu lacks any sort of reference counting for the users of
4985 * dependent devices.  If multiple endpoints have intersecting dependent
4986 * devices, unbinding the driver from any one of them will possibly leave
4987 * the others unable to operate.
4988 */
4989static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4990{
4991	if (!iommu || !dev || !dev_is_pci(dev))
4992		return;
4993
4994	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4995}
4996
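    /*
     * Detach a device from its domain: tear down the PASID entry in
     * scalable mode, disable the device IOTLB, clear the context entries
     * for all DMA aliases, free the PASID table, then detach the domain
     * from the IOMMU and release the device_domain_info. Called with
     * device_domain_lock held.
     */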
4997static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4998{
4999	struct dmar_domain *domain;
5000	struct intel_iommu *iommu;
5001	unsigned long flags;
5002
5003	assert_spin_locked(&device_domain_lock);
5004
5005	if (WARN_ON(!info))
5006		return;
5007
5008	iommu = info->iommu;
5009	domain = info->domain;
5010
5011	if (info->dev) {
5012		if (dev_is_pci(info->dev) && sm_supported(iommu))
5013			intel_pasid_tear_down_entry(iommu, info->dev,
5014					PASID_RID2PASID, false);
5015
5016		iommu_disable_dev_iotlb(info);
5017		if (!dev_is_real_dma_subdevice(info->dev))
5018			domain_context_clear(iommu, info->dev);
5019		intel_pasid_free_table(info->dev);
5020	}
5021
5022	unlink_domain_info(info);
5023
5024	spin_lock_irqsave(&iommu->lock, flags);
5025	domain_detach_iommu(domain, iommu);
5026	spin_unlock_irqrestore(&iommu->lock, flags);
5027
5028	free_devinfo_mem(info);
5029}
5030
5031static void dmar_remove_one_dev_info(struct device *dev)
5032{
5033	struct device_domain_info *info;
5034	unsigned long flags;
5035
5036	spin_lock_irqsave(&device_domain_lock, flags);
5037	info = get_domain_info(dev);
5038	if (info)
5039		__dmar_remove_one_dev_info(info);
5040	spin_unlock_irqrestore(&device_domain_lock, flags);
5041}
5042
5043static int md_domain_init(struct dmar_domain *domain, int guest_width)
5044{
5045	int adjust_width;
5046
5047	/* calculate AGAW */
5048	domain->gaw = guest_width;
5049	adjust_width = guestwidth_to_adjustwidth(guest_width);
5050	domain->agaw = width_to_agaw(adjust_width);
5051
5052	domain->iommu_coherency = 0;
5053	domain->iommu_snooping = 0;
5054	domain->iommu_superpage = 0;
5055	domain->max_addr = 0;
5056
5057	/* always allocate the top pgd */
5058	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5059	if (!domain->pgd)
5060		return -ENOMEM;
5061	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5062	return 0;
5063}
5064
5065static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5066{
5067	init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5068	copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5069
5070	if (!intel_iommu_strict &&
5071	    init_iova_flush_queue(&dmar_domain->iovad,
5072				  iommu_flush_iova, iova_entry_free))
5073		pr_info("iova flush queue initialization failed\n");
5074}
5075
5076static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5077{
5078	struct dmar_domain *dmar_domain;
5079	struct iommu_domain *domain;
5080
5081	switch (type) {
5082	case IOMMU_DOMAIN_DMA:
5083	case IOMMU_DOMAIN_UNMANAGED:
5084		dmar_domain = alloc_domain(0);
5085		if (!dmar_domain) {
5086			pr_err("Can't allocate dmar_domain\n");
5087			return NULL;
5088		}
5089		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5090			pr_err("Domain initialization failed\n");
5091			domain_exit(dmar_domain);
5092			return NULL;
5093		}
5094
5095		if (type == IOMMU_DOMAIN_DMA)
5096			intel_init_iova_domain(dmar_domain);
5097
5098		domain_update_iommu_cap(dmar_domain);
5099
5100		domain = &dmar_domain->domain;
5101		domain->geometry.aperture_start = 0;
5102		domain->geometry.aperture_end   =
5103				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
5104		domain->geometry.force_aperture = true;
5105
5106		return domain;
5107	case IOMMU_DOMAIN_IDENTITY:
5108		return &si_domain->domain;
5109	default:
5110		return NULL;
5111	}
5112
5113	return NULL;
5114}
5115
5116static void intel_iommu_domain_free(struct iommu_domain *domain)
5117{
5118	if (domain != &si_domain->domain)
5119		domain_exit(to_dmar_domain(domain));
5120}
5121
5122/*
5123 * Check whether a @domain could be attached to the @dev through the
5124 * aux-domain attach/detach APIs.
5125 */
5126static inline bool
5127is_aux_domain(struct device *dev, struct iommu_domain *domain)
5128{
5129	struct device_domain_info *info = get_domain_info(dev);
5130
5131	return info && info->auxd_enabled &&
5132			domain->type == IOMMU_DOMAIN_UNMANAGED;
5133}
5134
5135static void auxiliary_link_device(struct dmar_domain *domain,
5136				  struct device *dev)
5137{
5138	struct device_domain_info *info = get_domain_info(dev);
5139
5140	assert_spin_locked(&device_domain_lock);
5141	if (WARN_ON(!info))
5142		return;
5143
5144	domain->auxd_refcnt++;
5145	list_add(&domain->auxd, &info->auxiliary_domains);
5146}
5147
5148static void auxiliary_unlink_device(struct dmar_domain *domain,
5149				    struct device *dev)
5150{
5151	struct device_domain_info *info = get_domain_info(dev);
5152
5153	assert_spin_locked(&device_domain_lock);
5154	if (WARN_ON(!info))
5155		return;
5156
5157	list_del(&domain->auxd);
5158	domain->auxd_refcnt--;
5159
5160	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5161		ioasid_free(domain->default_pasid);
5162}
5163
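    /*
     * Attach @domain to @dev as an auxiliary domain: allocate a default
     * PASID for the domain if it does not have one yet, set up a first or
     * second level PASID entry for it depending on how the domain
     * translates, and link the domain into the device's auxiliary domain
     * list. The default PASID is freed again once the last auxiliary user
     * is gone.
     */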
5164static int aux_domain_add_dev(struct dmar_domain *domain,
5165			      struct device *dev)
5166{
5167	int ret;
5168	unsigned long flags;
5169	struct intel_iommu *iommu;
5170
5171	iommu = device_to_iommu(dev, NULL, NULL);
5172	if (!iommu)
5173		return -ENODEV;
5174
5175	if (domain->default_pasid <= 0) {
5176		int pasid;
5177
5178		/* No private data needed for the default pasid */
5179		pasid = ioasid_alloc(NULL, PASID_MIN,
5180				     pci_max_pasids(to_pci_dev(dev)) - 1,
5181				     NULL);
5182		if (pasid == INVALID_IOASID) {
5183			pr_err("Can't allocate default pasid\n");
5184			return -ENODEV;
5185		}
5186		domain->default_pasid = pasid;
5187	}
5188
5189	spin_lock_irqsave(&device_domain_lock, flags);
5190	/*
5191	 * iommu->lock must be held to attach domain to iommu and setup the
5192	 * pasid entry for second level translation.
5193	 */
5194	spin_lock(&iommu->lock);
5195	ret = domain_attach_iommu(domain, iommu);
5196	if (ret)
5197		goto attach_failed;
5198
5199	/* Set up the PASID entry for mediated devices: */
5200	if (domain_use_first_level(domain))
5201		ret = domain_setup_first_level(iommu, domain, dev,
5202					       domain->default_pasid);
5203	else
5204		ret = intel_pasid_setup_second_level(iommu, domain, dev,
5205						     domain->default_pasid);
5206	if (ret)
5207		goto table_failed;
5208	spin_unlock(&iommu->lock);
5209
5210	auxiliary_link_device(domain, dev);
5211
5212	spin_unlock_irqrestore(&device_domain_lock, flags);
5213
5214	return 0;
5215
5216table_failed:
5217	domain_detach_iommu(domain, iommu);
5218attach_failed:
5219	spin_unlock(&iommu->lock);
5220	spin_unlock_irqrestore(&device_domain_lock, flags);
5221	if (!domain->auxd_refcnt && domain->default_pasid > 0)
5222		ioasid_free(domain->default_pasid);
5223
5224	return ret;
5225}
5226
5227static void aux_domain_remove_dev(struct dmar_domain *domain,
5228				  struct device *dev)
5229{
5230	struct device_domain_info *info;
5231	struct intel_iommu *iommu;
5232	unsigned long flags;
5233
5234	if (!is_aux_domain(dev, &domain->domain))
5235		return;
5236
5237	spin_lock_irqsave(&device_domain_lock, flags);
5238	info = get_domain_info(dev);
5239	iommu = info->iommu;
5240
5241	auxiliary_unlink_device(domain, dev);
5242
5243	spin_lock(&iommu->lock);
5244	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5245	domain_detach_iommu(domain, iommu);
5246	spin_unlock(&iommu->lock);
5247
5248	spin_unlock_irqrestore(&device_domain_lock, flags);
5249}
5250
5251static int prepare_domain_attach_device(struct iommu_domain *domain,
5252					struct device *dev)
5253{
5254	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5255	struct intel_iommu *iommu;
5256	int addr_width;
5257
5258	iommu = device_to_iommu(dev, NULL, NULL);
5259	if (!iommu)
5260		return -ENODEV;
5261
5262	/* check if this iommu agaw is sufficient for max mapped address */
5263	addr_width = agaw_to_width(iommu->agaw);
5264	if (addr_width > cap_mgaw(iommu->cap))
5265		addr_width = cap_mgaw(iommu->cap);
5266
5267	if (dmar_domain->max_addr > (1LL << addr_width)) {
5268		dev_err(dev, "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5270		        __func__, addr_width, dmar_domain->max_addr);
5271		return -EFAULT;
5272	}
5273	dmar_domain->gaw = addr_width;
5274
5275	/*
5276	 * Knock out extra levels of page tables if necessary
5277	 */
5278	while (iommu->agaw < dmar_domain->agaw) {
5279		struct dma_pte *pte;
5280
5281		pte = dmar_domain->pgd;
5282		if (dma_pte_present(pte)) {
5283			dmar_domain->pgd = (struct dma_pte *)
5284				phys_to_virt(dma_pte_addr(pte));
5285			free_pgtable_page(pte);
5286		}
5287		dmar_domain->agaw--;
5288	}
5289
5290	return 0;
5291}
5292
5293static int intel_iommu_attach_device(struct iommu_domain *domain,
5294				     struct device *dev)
5295{
5296	int ret;
5297
5298	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5299	    device_is_rmrr_locked(dev)) {
5300		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5301		return -EPERM;
5302	}
5303
5304	if (is_aux_domain(dev, domain))
5305		return -EPERM;
5306
5307	/* normally dev is not mapped */
5308	if (unlikely(domain_context_mapped(dev))) {
5309		struct dmar_domain *old_domain;
5310
5311		old_domain = find_domain(dev);
5312		if (old_domain)
5313			dmar_remove_one_dev_info(dev);
5314	}
5315
5316	ret = prepare_domain_attach_device(domain, dev);
5317	if (ret)
5318		return ret;
5319
5320	return domain_add_dev_info(to_dmar_domain(domain), dev);
5321}
5322
5323static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5324					 struct device *dev)
5325{
5326	int ret;
5327
5328	if (!is_aux_domain(dev, domain))
5329		return -EPERM;
5330
5331	ret = prepare_domain_attach_device(domain, dev);
5332	if (ret)
5333		return ret;
5334
5335	return aux_domain_add_dev(to_dmar_domain(domain), dev);
5336}
5337
5338static void intel_iommu_detach_device(struct iommu_domain *domain,
5339				      struct device *dev)
5340{
5341	dmar_remove_one_dev_info(dev);
5342}
5343
5344static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5345					  struct device *dev)
5346{
5347	aux_domain_remove_dev(to_dmar_domain(domain), dev);
5348}
5349
5350/*
5351 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5352 * VT-d granularity. Invalidation is typically included in the unmap operation
5353 * as a result of DMA or VFIO unmap. However, for assigned devices guest
5354 * owns the first level page tables. Invalidations of translation caches in the
5355 * guest are trapped and passed down to the host.
5356 *
5357 * vIOMMU in the guest will only expose first level page tables, therefore
5358 * we do not support IOTLB granularity for requests without PASID (second level).
5359 *
5360 * For example, to find the VT-d granularity encoding for IOTLB
5361 * type and page selective granularity within PASID:
5362 * X: indexed by iommu cache type
5363 * Y: indexed by enum iommu_inv_granularity
5364 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5365 */
5366
5367static const int
5368inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5369	/*
5370	 * PASID based IOTLB invalidation: PASID selective (per PASID),
5371	 * page selective (address granularity)
5372	 */
5373	{-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5374	/* PASID based dev TLBs */
5375	{-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5376	/* PASID cache */
5377	{-EINVAL, -EINVAL, -EINVAL}
5378};
5379
5380static inline int to_vtd_granularity(int type, int granu)
5381{
5382	return inv_type_granu_table[type][granu];
5383}
5384
5385static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5386{
5387	u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5388
5389	/* VT-d encodes the size as 2^size 4K pages: 0 for 4K, 9 for 2MB, etc.
5390	 * The IOMMU cache invalidate API passes granu_size in bytes and the
5391	 * number of granules of contiguous memory.
5392	 */
5393	return order_base_2(nr_pages);
5394}
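    /*
     * Example with hypothetical inputs: granu_size = 4096 and
     * nr_granules = 512 describe 2MB of contiguous memory, i.e. 512 4K
     * pages, so to_vtd_size() returns order_base_2(512) = 9, matching the
     * "9 for 2MB" encoding noted above.
     */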
5395
5396#ifdef CONFIG_INTEL_IOMMU_SVM
5397static int
5398intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5399			   struct iommu_cache_invalidate_info *inv_info)
5400{
5401	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5402	struct device_domain_info *info;
5403	struct intel_iommu *iommu;
5404	unsigned long flags;
5405	int cache_type;
5406	u8 bus, devfn;
5407	u16 did, sid;
5408	int ret = 0;
5409	u64 size = 0;
5410
5411	if (!inv_info || !dmar_domain ||
5412	    inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5413		return -EINVAL;
5414
5415	if (!dev || !dev_is_pci(dev))
5416		return -ENODEV;
5417
5418	iommu = device_to_iommu(dev, &bus, &devfn);
5419	if (!iommu)
5420		return -ENODEV;
5421
5422	if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5423		return -EINVAL;
5424
5425	spin_lock_irqsave(&device_domain_lock, flags);
5426	spin_lock(&iommu->lock);
5427	info = get_domain_info(dev);
5428	if (!info) {
5429		ret = -EINVAL;
5430		goto out_unlock;
5431	}
5432	did = dmar_domain->iommu_did[iommu->seq_id];
5433	sid = PCI_DEVID(bus, devfn);
5434
5435	/* Size is only valid in address selective invalidation */
5436	if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5437		size = to_vtd_size(inv_info->addr_info.granule_size,
5438				   inv_info->addr_info.nb_granules);
5439
5440	for_each_set_bit(cache_type,
5441			 (unsigned long *)&inv_info->cache,
5442			 IOMMU_CACHE_INV_TYPE_NR) {
5443		int granu = 0;
5444		u64 pasid = 0;
5445		u64 addr = 0;
5446
5447		granu = to_vtd_granularity(cache_type, inv_info->granularity);
5448		if (granu == -EINVAL) {
5449			pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5450					   cache_type, inv_info->granularity);
5451			break;
5452		}
5453
5454		/*
5455		 * PASID is stored in different locations based on the
5456		 * granularity.
5457		 */
5458		if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5459		    (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5460			pasid = inv_info->pasid_info.pasid;
5461		else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5462			 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5463			pasid = inv_info->addr_info.pasid;
5464
5465		switch (BIT(cache_type)) {
5466		case IOMMU_CACHE_INV_TYPE_IOTLB:
5467			/* HW will ignore LSB bits based on address mask */
5468			if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5469			    size &&
5470			    (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5471				pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5472						   inv_info->addr_info.addr, size);
5473			}
5474
5475			/*
5476			 * If granu is PASID-selective, address is ignored.
5477			 * We use npages = -1 to indicate that.
5478			 */
5479			qi_flush_piotlb(iommu, did, pasid,
5480					mm_to_dma_pfn(inv_info->addr_info.addr),
5481					(granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5482					inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5483
5484			if (!info->ats_enabled)
5485				break;
5486			/*
5487			 * Always flush device IOTLB if ATS is enabled. vIOMMU
5488			 * in the guest may assume IOTLB flush is inclusive,
5489			 * which is more efficient.
5490			 */
5491			fallthrough;
5492		case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5493			/*
5494			 * PASID based device TLB invalidation does not support
5495			 * IOMMU_INV_GRANU_PASID granularity but only supports
5496			 * IOMMU_INV_GRANU_ADDR.
5497			 * The equivalent here is to set the size to cover the
5498			 * entire 64-bit address range. The user provides only
5499			 * PASID info, no address, so addr is set to 0.
5500			 */
5501			if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5502				size = 64 - VTD_PAGE_SHIFT;
5503				addr = 0;
5504			} else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5505				addr = inv_info->addr_info.addr;
5506			}
5507
5508			if (info->ats_enabled)
5509				qi_flush_dev_iotlb_pasid(iommu, sid,
5510						info->pfsid, pasid,
5511						info->ats_qdep, addr,
5512						size);
5513			else
5514				pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5515			break;
5516		default:
5517			dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5518					    cache_type);
5519			ret = -EINVAL;
5520		}
5521	}
5522out_unlock:
5523	spin_unlock(&iommu->lock);
5524	spin_unlock_irqrestore(&device_domain_lock, flags);
5525
5526	return ret;
5527}
5528#endif
5529
5530static int intel_iommu_map(struct iommu_domain *domain,
5531			   unsigned long iova, phys_addr_t hpa,
5532			   size_t size, int iommu_prot, gfp_t gfp)
5533{
5534	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5535	u64 max_addr;
5536	int prot = 0;
5537	int ret;
5538
5539	if (iommu_prot & IOMMU_READ)
5540		prot |= DMA_PTE_READ;
5541	if (iommu_prot & IOMMU_WRITE)
5542		prot |= DMA_PTE_WRITE;
5543	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5544		prot |= DMA_PTE_SNP;
5545
5546	max_addr = iova + size;
5547	if (dmar_domain->max_addr < max_addr) {
5548		u64 end;
5549
5550		/* check if minimum agaw is sufficient for mapped address */
5551		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5552		if (end < max_addr) {
5553			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5555			       __func__, dmar_domain->gaw, max_addr);
5556			return -EFAULT;
5557		}
5558		dmar_domain->max_addr = max_addr;
5559	}
5560	/* Round up size to next multiple of PAGE_SIZE, if it and
5561	   the low bits of hpa would take us onto the next page */
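    	/* e.g. (hypothetical values) hpa = 0x1080 with size = 0x1000 spans
    	   two 4K pages, so two page-table entries are needed, not one. */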
5562	size = aligned_nrpages(hpa, size);
5563	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5564				 hpa >> VTD_PAGE_SHIFT, size, prot);
5565	return ret;
5566}
5567
5568static size_t intel_iommu_unmap(struct iommu_domain *domain,
5569				unsigned long iova, size_t size,
5570				struct iommu_iotlb_gather *gather)
5571{
5572	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573	struct page *freelist = NULL;
5574	unsigned long start_pfn, last_pfn;
5575	unsigned int npages;
5576	int iommu_id, level = 0;
5577
5578	/* Cope with horrid API which requires us to unmap more than the
5579	   size argument if it happens to be a large-page mapping. */
5580	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5581
5582	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5583		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5584
5585	start_pfn = iova >> VTD_PAGE_SHIFT;
5586	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5587
5588	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5589
5590	npages = last_pfn - start_pfn + 1;
5591
5592	for_each_domain_iommu(iommu_id, dmar_domain)
5593		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5594				      start_pfn, npages, !freelist, 0);
5595
5596	dma_free_pagelist(freelist);
5597
5598	if (dmar_domain->max_addr == iova + size)
5599		dmar_domain->max_addr = iova;
5600
5601	return size;
5602}
5603
5604static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5605					    dma_addr_t iova)
5606{
5607	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5608	struct dma_pte *pte;
5609	int level = 0;
5610	u64 phys = 0;
5611
5612	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5613	if (pte && dma_pte_present(pte))
5614		phys = dma_pte_addr(pte) +
5615			(iova & (BIT_MASK(level_to_offset_bits(level) +
5616						VTD_PAGE_SHIFT) - 1));
5617
5618	return phys;
5619}
5620
5621static inline bool scalable_mode_support(void)
5622{
5623	struct dmar_drhd_unit *drhd;
5624	struct intel_iommu *iommu;
5625	bool ret = true;
5626
5627	rcu_read_lock();
5628	for_each_active_iommu(iommu, drhd) {
5629		if (!sm_supported(iommu)) {
5630			ret = false;
5631			break;
5632		}
5633	}
5634	rcu_read_unlock();
5635
5636	return ret;
5637}
5638
5639static inline bool iommu_pasid_support(void)
5640{
5641	struct dmar_drhd_unit *drhd;
5642	struct intel_iommu *iommu;
5643	bool ret = true;
5644
5645	rcu_read_lock();
5646	for_each_active_iommu(iommu, drhd) {
5647		if (!pasid_supported(iommu)) {
5648			ret = false;
5649			break;
5650		}
5651	}
5652	rcu_read_unlock();
5653
5654	return ret;
5655}
5656
5657static inline bool nested_mode_support(void)
5658{
5659	struct dmar_drhd_unit *drhd;
5660	struct intel_iommu *iommu;
5661	bool ret = true;
5662
5663	rcu_read_lock();
5664	for_each_active_iommu(iommu, drhd) {
5665		if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5666			ret = false;
5667			break;
5668		}
5669	}
5670	rcu_read_unlock();
5671
5672	return ret;
5673}
5674
5675static bool intel_iommu_capable(enum iommu_cap cap)
5676{
5677	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5678		return domain_update_iommu_snooping(NULL) == 1;
5679	if (cap == IOMMU_CAP_INTR_REMAP)
5680		return irq_remapping_enabled == 1;
5681
5682	return false;
5683}
5684
5685static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5686{
5687	struct intel_iommu *iommu;
5688
5689	iommu = device_to_iommu(dev, NULL, NULL);
5690	if (!iommu)
5691		return ERR_PTR(-ENODEV);
5692
5693	if (translation_pre_enabled(iommu))
5694		dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5695
5696	return &iommu->iommu;
5697}
5698
5699static void intel_iommu_release_device(struct device *dev)
5700{
5701	struct intel_iommu *iommu;
5702
5703	iommu = device_to_iommu(dev, NULL, NULL);
5704	if (!iommu)
5705		return;
5706
5707	dmar_remove_one_dev_info(dev);
5708
5709	set_dma_ops(dev, NULL);
5710}
5711
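    /*
     * Choose the DMA ops for a newly probed device: bounce_dma_ops for
     * devices that need bounce buffering, intel_dma_ops when the device
     * ended up in a translated DMA domain, and the default (direct) ops
     * otherwise, e.g. for identity mapped devices.
     */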
5712static void intel_iommu_probe_finalize(struct device *dev)
5713{
5714	struct iommu_domain *domain;
5715
5716	domain = iommu_get_domain_for_dev(dev);
5717	if (device_needs_bounce(dev))
5718		set_dma_ops(dev, &bounce_dma_ops);
5719	else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5720		set_dma_ops(dev, &intel_dma_ops);
5721	else
5722		set_dma_ops(dev, NULL);
5723}
5724
5725static void intel_iommu_get_resv_regions(struct device *device,
5726					 struct list_head *head)
5727{
5728	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5729	struct iommu_resv_region *reg;
5730	struct dmar_rmrr_unit *rmrr;
5731	struct device *i_dev;
5732	int i;
5733
5734	down_read(&dmar_global_lock);
5735	for_each_rmrr_units(rmrr) {
5736		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5737					  i, i_dev) {
5738			struct iommu_resv_region *resv;
5739			enum iommu_resv_type type;
5740			size_t length;
5741
5742			if (i_dev != device &&
5743			    !is_downstream_to_pci_bridge(device, i_dev))
5744				continue;
5745
5746			length = rmrr->end_address - rmrr->base_address + 1;
5747
5748			type = device_rmrr_is_relaxable(device) ?
5749				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5750
5751			resv = iommu_alloc_resv_region(rmrr->base_address,
5752						       length, prot, type);
5753			if (!resv)
5754				break;
5755
5756			list_add_tail(&resv->list, head);
5757		}
5758	}
5759	up_read(&dmar_global_lock);
5760
5761#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5762	if (dev_is_pci(device)) {
5763		struct pci_dev *pdev = to_pci_dev(device);
5764
5765		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5766			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5767						   IOMMU_RESV_DIRECT_RELAXABLE);
5768			if (reg)
5769				list_add_tail(&reg->list, head);
5770		}
5771	}
5772#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5773
5774	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5775				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5776				      0, IOMMU_RESV_MSI);
5777	if (!reg)
5778		return;
5779	list_add_tail(&reg->list, head);
5780}
5781
5782int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5783{
5784	struct device_domain_info *info;
5785	struct context_entry *context;
5786	struct dmar_domain *domain;
5787	unsigned long flags;
5788	u64 ctx_lo;
5789	int ret;
5790
5791	domain = find_domain(dev);
5792	if (!domain)
5793		return -EINVAL;
5794
5795	spin_lock_irqsave(&device_domain_lock, flags);
5796	spin_lock(&iommu->lock);
5797
5798	ret = -EINVAL;
5799	info = get_domain_info(dev);
5800	if (!info || !info->pasid_supported)
5801		goto out;
5802
5803	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5804	if (WARN_ON(!context))
5805		goto out;
5806
5807	ctx_lo = context[0].lo;
5808
5809	if (!(ctx_lo & CONTEXT_PASIDE)) {
5810		ctx_lo |= CONTEXT_PASIDE;
5811		context[0].lo = ctx_lo;
5812		wmb();
5813		iommu->flush.flush_context(iommu,
5814					   domain->iommu_did[iommu->seq_id],
5815					   PCI_DEVID(info->bus, info->devfn),
5816					   DMA_CCMD_MASK_NOBIT,
5817					   DMA_CCMD_DEVICE_INVL);
5818	}
5819
5820	/* Enable PASID support in the device, if it wasn't already */
5821	if (!info->pasid_enabled)
5822		iommu_enable_dev_iotlb(info);
5823
5824	ret = 0;
5825
5826 out:
5827	spin_unlock(&iommu->lock);
5828	spin_unlock_irqrestore(&device_domain_lock, flags);
5829
5830	return ret;
5831}
5832
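/*
 * Reserve the IOVA range that backs a reserved region so the IOVA
 * allocator never hands it out for ordinary DMA mappings.
 */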
5833static void intel_iommu_apply_resv_region(struct device *dev,
5834					  struct iommu_domain *domain,
5835					  struct iommu_resv_region *region)
5836{
5837	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5838	unsigned long start, end;
5839
5840	start = IOVA_PFN(region->start);
5841	end   = IOVA_PFN(region->start + region->length - 1);
5842
5843	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5844}
5845
5846static struct iommu_group *intel_iommu_device_group(struct device *dev)
5847{
5848	if (dev_is_pci(dev))
5849		return pci_device_group(dev);
5850	return generic_device_group(dev);
5851}
5852
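/*
 * Auxiliary domain support requires scalable mode and PASID; enable PASID
 * for the device and mark it as capable of being attached to aux domains.
 */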
5853static int intel_iommu_enable_auxd(struct device *dev)
5854{
5855	struct device_domain_info *info;
5856	struct intel_iommu *iommu;
5857	unsigned long flags;
5858	int ret;
5859
5860	iommu = device_to_iommu(dev, NULL, NULL);
5861	if (!iommu || dmar_disabled)
5862		return -EINVAL;
5863
5864	if (!sm_supported(iommu) || !pasid_supported(iommu))
5865		return -EINVAL;
5866
5867	ret = intel_iommu_enable_pasid(iommu, dev);
5868	if (ret)
5869		return -ENODEV;
5870
5871	spin_lock_irqsave(&device_domain_lock, flags);
5872	info = get_domain_info(dev);
5873	info->auxd_enabled = 1;
5874	spin_unlock_irqrestore(&device_domain_lock, flags);
5875
5876	return 0;
5877}
5878
5879static int intel_iommu_disable_auxd(struct device *dev)
5880{
5881	struct device_domain_info *info;
5882	unsigned long flags;
5883
5884	spin_lock_irqsave(&device_domain_lock, flags);
5885	info = get_domain_info(dev);
5886	if (!WARN_ON(!info))
5887		info->auxd_enabled = 0;
5888	spin_unlock_irqrestore(&device_domain_lock, flags);
5889
5890	return 0;
5891}
5892
5893/*
5894 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5895 * defined in section 3.7 of the Intel Scalable I/O Virtualization
5896 * technical specification so that system software and tools can detect
5897 * endpoint devices that support Intel Scalable I/O Virtualization without
5898 * any host driver dependency.
5899 *
5900 * Returns the config space offset of the matching extended capability
5901 * structure, or 0 if the device does not support it.
5902 */
5903static int siov_find_pci_dvsec(struct pci_dev *pdev)
5904{
5905	int pos;
5906	u16 vendor, id;
5907
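	/*
	 * 0x23 is the PCI Express DVSEC extended capability ID. Within each
	 * DVSEC instance the vendor ID sits at offset 4 and the DVSEC ID at
	 * offset 8; DVSEC ID 5 under the Intel vendor ID identifies Scalable
	 * I/O Virtualization.
	 */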
5908	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5909	while (pos) {
5910		pci_read_config_word(pdev, pos + 4, &vendor);
5911		pci_read_config_word(pdev, pos + 8, &id);
5912		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5913			return pos;
5914
5915		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5916	}
5917
5918	return 0;
5919}
5920
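/*
 * IOMMU_DEV_FEAT_AUX requires scalable mode, PASID support in both the
 * IOMMU and the PCI endpoint, and the Intel Scalable IOV DVSEC on the
 * endpoint. IOMMU_DEV_FEAT_SVA requires an SVM-capable IOMMU plus PASID,
 * PRI and ATS support on the device.
 */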
5921static bool
5922intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5923{
5924	if (feat == IOMMU_DEV_FEAT_AUX) {
5925		int ret;
5926
5927		if (!dev_is_pci(dev) || dmar_disabled ||
5928		    !scalable_mode_support() || !iommu_pasid_support())
5929			return false;
5930
5931		ret = pci_pasid_features(to_pci_dev(dev));
5932		if (ret < 0)
5933			return false;
5934
5935		return !!siov_find_pci_dvsec(to_pci_dev(dev));
5936	}
5937
5938	if (feat == IOMMU_DEV_FEAT_SVA) {
5939		struct device_domain_info *info = get_domain_info(dev);
5940
5941		return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5942			info->pasid_supported && info->pri_supported &&
5943			info->ats_supported;
5944	}
5945
5946	return false;
5947}
5948
5949static int
5950intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5951{
5952	if (feat == IOMMU_DEV_FEAT_AUX)
5953		return intel_iommu_enable_auxd(dev);
5954
5955	if (feat == IOMMU_DEV_FEAT_SVA) {
5956		struct device_domain_info *info = get_domain_info(dev);
5957
5958		if (!info)
5959			return -EINVAL;
5960
5961		if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5962			return 0;
5963	}
5964
5965	return -ENODEV;
5966}
5967
5968static int
5969intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5970{
5971	if (feat == IOMMU_DEV_FEAT_AUX)
5972		return intel_iommu_disable_auxd(dev);
5973
5974	return -ENODEV;
5975}
5976
5977static bool
5978intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5979{
5980	struct device_domain_info *info = get_domain_info(dev);
5981
5982	if (feat == IOMMU_DEV_FEAT_AUX)
5983		return scalable_mode_support() && info && info->auxd_enabled;
5984
5985	return false;
5986}
5987
5988static int
5989intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5990{
5991	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5992
5993	return dmar_domain->default_pasid > 0 ?
5994			dmar_domain->default_pasid : -EINVAL;
5995}
5996
5997static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5998					   struct device *dev)
5999{
6000	return attach_deferred(dev);
6001}
6002
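/*
 * DOMAIN_ATTR_NESTING may only be set on an unmanaged domain with no
 * devices attached; it switches the domain to nested translation and
 * clears the first-level-paging flag.
 */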
6003static int
6004intel_iommu_domain_set_attr(struct iommu_domain *domain,
6005			    enum iommu_attr attr, void *data)
6006{
6007	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6008	unsigned long flags;
6009	int ret = 0;
6010
6011	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6012		return -EINVAL;
6013
6014	switch (attr) {
6015	case DOMAIN_ATTR_NESTING:
6016		spin_lock_irqsave(&device_domain_lock, flags);
6017		if (nested_mode_support() &&
6018		    list_empty(&dmar_domain->devices)) {
6019			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6020			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6021		} else {
6022			ret = -ENODEV;
6023		}
6024		spin_unlock_irqrestore(&device_domain_lock, flags);
6025		break;
6026	default:
6027		ret = -EINVAL;
6028		break;
6029	}
6030
6031	return ret;
6032}
6033
6034/*
6035 * Check that the device does not live on an external-facing PCI port that
6036 * is marked as untrusted. Quirks are not applied to such devices, so they
6037 * cannot be used to bypass the IOMMU restrictions.
6038 */
6039static bool risky_device(struct pci_dev *pdev)
6040{
6041	if (pdev->untrusted) {
6042		pci_info(pdev,
6043			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6044			 pdev->vendor, pdev->device);
6045		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6046		return true;
6047	}
6048	return false;
6049}
6050
6051const struct iommu_ops intel_iommu_ops = {
6052	.capable		= intel_iommu_capable,
6053	.domain_alloc		= intel_iommu_domain_alloc,
6054	.domain_free		= intel_iommu_domain_free,
6055	.domain_set_attr	= intel_iommu_domain_set_attr,
6056	.attach_dev		= intel_iommu_attach_device,
6057	.detach_dev		= intel_iommu_detach_device,
6058	.aux_attach_dev		= intel_iommu_aux_attach_device,
6059	.aux_detach_dev		= intel_iommu_aux_detach_device,
6060	.aux_get_pasid		= intel_iommu_aux_get_pasid,
6061	.map			= intel_iommu_map,
6062	.unmap			= intel_iommu_unmap,
6063	.iova_to_phys		= intel_iommu_iova_to_phys,
6064	.probe_device		= intel_iommu_probe_device,
6065	.probe_finalize		= intel_iommu_probe_finalize,
6066	.release_device		= intel_iommu_release_device,
6067	.get_resv_regions	= intel_iommu_get_resv_regions,
6068	.put_resv_regions	= generic_iommu_put_resv_regions,
6069	.apply_resv_region	= intel_iommu_apply_resv_region,
6070	.device_group		= intel_iommu_device_group,
6071	.dev_has_feat		= intel_iommu_dev_has_feat,
6072	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
6073	.dev_enable_feat	= intel_iommu_dev_enable_feat,
6074	.dev_disable_feat	= intel_iommu_dev_disable_feat,
6075	.is_attach_deferred	= intel_iommu_is_attach_deferred,
6076	.def_domain_type	= device_def_domain_type,
6077	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
6078#ifdef CONFIG_INTEL_IOMMU_SVM
6079	.cache_invalidate	= intel_iommu_sva_invalidate,
6080	.sva_bind_gpasid	= intel_svm_bind_gpasid,
6081	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
6082	.sva_bind		= intel_svm_bind,
6083	.sva_unbind		= intel_svm_unbind,
6084	.sva_get_pasid		= intel_svm_get_pasid,
6085	.page_response		= intel_svm_page_response,
6086#endif
6087};
6088
6089static void quirk_iommu_igfx(struct pci_dev *dev)
6090{
6091	if (risky_device(dev))
6092		return;
6093
6094	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6095	dmar_map_gfx = 0;
6096}
6097
6098/* G4x/GM45 integrated gfx dmar support is totally busted. */
6099DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6100DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6101DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6102DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6103DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6104DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6105DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6106
6107/* Broadwell igfx malfunctions with dmar */
6108DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6109DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6110DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6111DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6112DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6113DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6114DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6115DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6116DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6117DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6118DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6119DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6120DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6121DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6122DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6123DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6124DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6125DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6126DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6127DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6128DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6129DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6130DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6131DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6132
6133static void quirk_iommu_rwbf(struct pci_dev *dev)
6134{
6135	if (risky_device(dev))
6136		return;
6137
6138	/*
6139	 * Mobile 4 Series Chipset neglects to set RWBF capability,
6140	 * but needs it. Same seems to hold for the desktop versions.
6141	 */
6142	pci_info(dev, "Forcing write-buffer flush capability\n");
6143	rwbf_quirk = 1;
6144}
6145
6146DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6147DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6148DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6149DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6150DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6151DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6153
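/*
 * GGC is the GMCH graphics control register in the host bridge's config
 * space; the field below reports how much memory the BIOS reserved for
 * the (shadow) GTT and whether the VT-d capable variant was enabled.
 */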
6154#define GGC 0x52
6155#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
6156#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
6157#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
6158#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
6159#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
6160#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
6161#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
6162#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
6163
6164static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6165{
6166	unsigned short ggc;
6167
6168	if (risky_device(dev))
6169		return;
6170
6171	if (pci_read_config_word(dev, GGC, &ggc))
6172		return;
6173
6174	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6175		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6176		dmar_map_gfx = 0;
6177	} else if (dmar_map_gfx) {
6178		/* we have to ensure the gfx device is idle before we flush */
6179		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6180		intel_iommu_strict = 1;
6181	}
6182}
6183DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6184DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6185DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6186DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6187
6188static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6189{
6190	unsigned short ver;
6191
6192	if (!IS_GFX_DEVICE(dev))
6193		return;
6194
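	/*
	 * The upper byte of the PCI device ID identifies the graphics
	 * generation; only the generations listed below are affected.
	 */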
6195	ver = (dev->device >> 8) & 0xff;
6196	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6197	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6198	    ver != 0x9a)
6199		return;
6200
6201	if (risky_device(dev))
6202		return;
6203
6204	pci_info(dev, "Skip IOMMU disabling for graphics\n");
6205	iommu_skip_te_disable = 1;
6206}
6207DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6208
6209/* On Tylersburg chipsets, some BIOSes have been known to enable the
6210   ISOCH DMAR unit for the Azalia sound device, but not give it any
6211   TLB entries, which causes it to deadlock. Check for that.  We do
6212   this in a function called from init_dmars(), instead of in a PCI
6213   quirk, because we don't want to print the obnoxious "BIOS broken"
6214   message if VT-d is actually disabled.
6215*/
6216static void __init check_tylersburg_isoch(void)
6217{
6218	struct pci_dev *pdev;
6219	uint32_t vtisochctrl;
6220
6221	/* If there's no Azalia in the system anyway, forget it. */
6222	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6223	if (!pdev)
6224		return;
6225
6226	if (risky_device(pdev)) {
6227		pci_dev_put(pdev);
6228		return;
6229	}
6230
6231	pci_dev_put(pdev);
6232
6233	/* System Management Registers. Might be hidden, in which case
6234	   we can't do the sanity check. But that's OK, because the
6235	   known-broken BIOSes _don't_ actually hide it, so far. */
6236	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6237	if (!pdev)
6238		return;
6239
6240	if (risky_device(pdev)) {
6241		pci_dev_put(pdev);
6242		return;
6243	}
6244
6245	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6246		pci_dev_put(pdev);
6247		return;
6248	}
6249
6250	pci_dev_put(pdev);
6251
6252	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6253	if (vtisochctrl & 1)
6254		return;
6255
6256	/* Drop all bits other than the number of TLB entries */
6257	vtisochctrl &= 0x1c;
6258
6259	/* If we have the recommended number of TLB entries (16), fine. */
6260	if (vtisochctrl == 0x10)
6261		return;
6262
6263	/* Zero TLB entries? You get to ride the short bus to school. */
6264	if (!vtisochctrl) {
6265		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6266		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6267		     dmi_get_system_info(DMI_BIOS_VENDOR),
6268		     dmi_get_system_info(DMI_BIOS_VERSION),
6269		     dmi_get_system_info(DMI_PRODUCT_VERSION));
6270		iommu_identity_mapping |= IDENTMAP_AZALIA;
6271		return;
6272	}
6273
6274	pr_warn("Recommended TLB entries for ISOCH unit are 16; your BIOS set %d\n",
6275	       vtisochctrl);
6276}